In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load dataset
df = pd.read_csv("stacking_deck_fallacy_dataset.csv")  # Replace with actual path if needed
texts = df["text"].values
labels_raw = df["label"].values

# Step 2: Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_raw)  # Converts to 0, 1, 2

# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(texts)

# Step 4: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Step 5: Train Decision Tree
clf = DecisionTreeClassifier(max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = clf.predict(X_test)
print("Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Label mapping: {'healthy_argument': 0, 'non_argument': 1, 'stacking_deck': 2}
                  precision    recall  f1-score   support

healthy_argument       1.00      1.00      1.00        33
    non_argument       1.00      1.00      1.00        21
   stacking_deck       1.00      1.00      1.00        30

        accuracy                           1.00        84
       macro avg       1.00      1.00      1.00        84
    weighted avg       1.00      1.00      1.00        84



In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load dataset
df = pd.read_csv("youtube_fallacy_dataset.csv")
texts = df["text"].tolist()
labels_raw = df["label"].tolist()

# Step 2: Encode string labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_raw)
class_names = label_encoder.classes_
all_encoded_labels = list(label_encoder.transform(class_names))

# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X = vectorizer.fit_transform(texts)

# Step 4: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.3, random_state=42, stratify=labels
)

# Step 5: Train Decision Tree
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Step 6: Predict & Evaluate
y_pred = clf.predict(X_test)

print("Label mapping:", dict(zip(class_names, all_encoded_labels)))
print(classification_report(y_test, y_pred, labels=all_encoded_labels, target_names=class_names))


ValueError: The test_size = 3 should be greater or equal to the number of classes = 5