In [5]:
from pathlib import Path

base_data_path = Path('Dataset-textonly')
base_data_path1 = Path('Dataset-afd-textonly')



In [6]:
from mamkit.data.datasets import MMUSEDFallacy, InputMode

In [7]:
#to Download MMUSEDFallacy data

mm_used_fallacy_loader = MMUSEDFallacy(
    task_name='afc',              
    input_mode=InputMode.TEXT_ONLY, 
    base_data_path=base_data_path)

In [8]:

mm_used_fallacy_loader = MMUSEDFallacy(
    task_name='afd',  
    input_mode=InputMode.TEXT_ONLY,  
    base_data_path=base_data_path1
)



In [9]:
import pickle

# Load dataset.pkl
pkl_path = "Dataset-afd-textonly\MMUSED-fallacy\dataset.pkl"
with open(pkl_path, "rb") as f:
    dataset = pickle.load(f)

# Print keys and sample data
print(f"Keys: {dataset.keys()}")
print(f"Sample: {dataset[list(dataset.keys())[0]][:2]}")  # Print first 2 entries


Keys: Index(['filename', 'dialogue_id', 'fallacy', 'dialogue_start_time',
       'dialogue_end_time', 'dialogue_indexes', 'dialogue_sentences',
       'dialogue_tokens', 'dialogue_whisper_indexes', 'dialogue',
       'snippet_start_time', 'snippet_end_time', 'snippet_indexes',
       'snippet_sentences', 'snippet_tokens', 'snippet_whisper_indexes',
       'snippet'],
      dtype='object')
Sample: 0    1984_07Oct_1
1    1984_07Oct_1
Name: filename, dtype: object


In [10]:
print(dataset["fallacy"].value_counts())  # Check distribution of fallacy labels
print(dataset[["snippet_sentences", "fallacy"]].head())  # View sample text & labels


fallacy
AppealtoEmotion      800
AppealtoAuthority    191
AdHominem            149
FalseCause            56
Slipperyslope         46
Slogans               36
Name: count, dtype: int64
                                   snippet_sentences            fallacy
0  [And there are other ways of squeezing this bu...    AppealtoEmotion
1  [And you let those people go with the guidelin...    AppealtoEmotion
2  [In mine, I happen to believe in the people an...  AppealtoAuthority
3  [That's why faith in the United States is pure...    AppealtoEmotion
4  [That's why faith in the United States is pure...    AppealtoEmotion


In [11]:
import pandas as pd

df = dataset  

# Keep only relevant columns
df = df[["snippet_sentences", "fallacy"]]

# Convert list of sentences to a single string
df["snippet_sentences"] = df["snippet_sentences"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)

# Save cleaned dataset
df.to_csv("cleaned_data/fallacy_dataset.csv", index=False)

print(df.head())


                                   snippet_sentences            fallacy
0  And there are other ways of squeezing this bud...    AppealtoEmotion
1  And you let those people go with the guideline...    AppealtoEmotion
2  In mine, I happen to believe in the people and...  AppealtoAuthority
3  That's why faith in the United States is pure ...    AppealtoEmotion
4  That's why faith in the United States is pure ...    AppealtoEmotion


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["snippet_sentences"] = df["snippet_sentences"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)


In [12]:
df.loc[:, "snippet_sentences"] = df["snippet_sentences"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)


In [13]:
df.loc[:, "binary_label"] = df["fallacy"].apply(lambda x: "Fallacy" if pd.notna(x) else "No Fallacy")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "binary_label"] = df["fallacy"].apply(lambda x: "Fallacy" if pd.notna(x) else "No Fallacy")


In [14]:
print(df["binary_label"].value_counts())


binary_label
Fallacy    1278
Name: count, dtype: int64


In [15]:
print(df["fallacy"].isna().sum())  # Count of NaN values
print(df["fallacy"].unique())      # Check unique values


0
['AppealtoEmotion' 'AppealtoAuthority' 'FalseCause' 'Slogans' 'AdHominem'
 'Slipperyslope']


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["snippet_sentences"])


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode fallacy labels
label_encoder = LabelEncoder()
df.loc[:, "fallacy_label"] = label_encoder.fit_transform(df["fallacy"])

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")
X = vectorizer.fit_transform(df["snippet_sentences"])
y = df["fallacy_label"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "fallacy_label"] = label_encoder.fit_transform(df["fallacy"])


# Using LR


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the classifier
model = LogisticRegression(multi_class="ovr", solver="liblinear")
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.62890625
                   precision    recall  f1-score   support

        AdHominem       0.00      0.00      0.00        30
AppealtoAuthority       1.00      0.03      0.05        39
  AppealtoEmotion       0.63      1.00      0.77       160
       FalseCause       0.00      0.00      0.00        11
    Slipperyslope       0.00      0.00      0.00         9
          Slogans       0.00      0.00      0.00         7

         accuracy                           0.63       256
        macro avg       0.27      0.17      0.14       256
     weighted avg       0.54      0.63      0.49       256



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
import joblib

# Save the trained model
joblib.dump(model, './LR_model/logistic_regression_model.pkl')


['./LR_model/logistic_regression_model.pkl']

# Using BERT

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

#  Load Data
df = pd.read_csv("Cleaned_data/fallacy_dataset.csv")  
df["text"] = df["snippet_sentences"].apply(lambda x: " ".join(x) if isinstance(x, list) else x)  # Convert lists to strings
df["label"] = df["fallacy"].astype("category").cat.codes  # Convert fallacy names to numeric labels

#  Train-Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

#  Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

#  Tokenize Texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)

#  Convert to Hugging Face Dataset Format
train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"], "labels": train_labels.tolist()})
test_dataset = Dataset.from_dict({"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"], "labels": test_labels.tolist()})

#  Load Pretrained BERT Model for Classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=df["label"].nunique())

#  Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
)

#  Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

#  Train the Model
trainer.train()

#  Evaluate the Model
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

#  Print Results
print("Accuracy:", accuracy_score(test_labels, preds))
print("Classification Report:\n", classification_report(test_labels, preds, target_names=df["fallacy"].unique()))


In [None]:
model.save_pretrained("./bert_fallacy_model")
tokenizer.save_pretrained("./bert_fallacy_model")
