In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Read the Straw Man CSV and pull the text / label columns out as plain lists
df = pd.read_csv("straw_man_fallacy_dataset.csv")  # Update path if needed
texts, labels_raw = df["text"].tolist(), df["label"].tolist()

# Step 2: Fit an encoder on the string labels and convert them to integer ids
# (e.g. {'healthy_argument': 0, 'non_argument': 1, 'straw_man': 2})
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_raw)

# Step 3: Tokenize all texts at once with the pretrained BERT tokenizer,
# requesting padded / truncated PyTorch tensors
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Unpack the two tensors the model consumes and tensorize the integer labels
input_ids, attention_mask = encodings["input_ids"], encodings["attention_mask"]
labels_tensor = torch.tensor(labels)

# Step 4: Prepare dataset and loaders
dataset = TensorDataset(input_ids, attention_mask, labels_tensor)

# Stratify on the encoded labels so the train and test splits keep the same
# class proportions; the classes are imbalanced, and an unstratified split can
# skew the per-class support in the report.
# Note: stratification needs at least two samples of every class.
train_data, test_data = train_test_split(
    dataset, test_size=0.2, random_state=42, stratify=labels
)

# Sanity check: the split must partition the dataset without losing samples
assert len(train_data) + len(test_data) == len(dataset)

# Shuffle only the training batches; evaluation order does not matter
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = DataLoader(test_data, batch_size=8)

# Step 5: Load BERT model
# Seed torch's global RNG first (same value as the split's random_state): the
# classification head is newly initialized with random weights and the training
# DataLoader shuffles using the global RNG, so without a seed every
# Restart & Run All produces a different model.
torch.manual_seed(42)

# One output unit per class known to the label encoder
num_labels = len(label_encoder.classes_)  # should be 3
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Step 6: Fine-tune the model for three passes over the training loader
model.train()  # switch to training mode before the loop
for epoch in range(3):
    print(f"Epoch {epoch+1}/3")
    # Each batch unpacks to (input ids, attention mask, labels)
    for b_input_ids, b_input_mask, b_labels in train_loader:
        # Supplying labels makes the model return the loss alongside the logits
        loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels).loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

# Step 7: Run the held-out batches through the model and collect predictions
model.eval()  # switch to evaluation mode
predictions, true_labels = [], []

# Gradients are not needed for inference
with torch.no_grad():
    for b_input_ids, b_input_mask, b_labels in test_loader:
        logits = model(b_input_ids, attention_mask=b_input_mask).logits
        # The predicted class is the index of the largest logit in each row
        predictions += logits.argmax(dim=1).tolist()
        true_labels += b_labels.tolist()

# Step 8: Generate classification report
# Use built-in str/int values so the mapping prints the same way regardless of
# how the installed NumPy version renders its scalar types.
class_names = [str(name) for name in label_encoder.classes_]
class_ids = list(range(len(class_names)))  # LabelEncoder ids are 0..n-1 in classes_ order
print("Label mapping:", dict(zip(class_names, class_ids)))

# Pass labels= explicitly: otherwise the report infers the classes from the
# data, and a class missing from both true_labels and predictions makes the
# target_names length mismatch and raises.
# NOTE(review): the saved output shows 1.00 on every metric - worth checking
# the CSV for duplicate / near-duplicate texts shared by the train and test splits.
print(classification_report(true_labels, predictions, labels=class_ids, target_names=class_names))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Label mapping: {'healthy_argument': 0, 'non_argument': 1, 'straw_man': 2}
                  precision    recall  f1-score   support

healthy_argument       1.00      1.00      1.00        31
    non_argument       1.00      1.00      1.00        31
       straw_man       1.00      1.00      1.00        47

        accuracy                           1.00       109
       macro avg       1.00      1.00      1.00       109
    weighted avg       1.00      1.00      1.00       109



In [2]:
def predict_strawman(text, classifier=None, text_tokenizer=None, encoder=None):
    """Classify one piece of text and return its decoded label.

    Args:
        text: The raw text to classify.
        classifier: Optional model to use instead of the notebook-level
            ``model``. It is called as ``classifier(input_ids,
            attention_mask=...)`` and must return an object with ``.logits``.
        text_tokenizer: Optional tokenizer to use instead of the
            notebook-level ``tokenizer``. It must return a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        encoder: Optional fitted label encoder to use instead of the
            notebook-level ``label_encoder``.

    Returns:
        The label produced by ``encoder.inverse_transform`` for the
        highest-scoring class.

    Side effects:
        Switches the classifier to evaluation mode.
    """
    # Fall back to the notebook globals only when no override is supplied, so
    # existing one-argument calls keep working unchanged.
    classifier = model if classifier is None else classifier
    text_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
    encoder = label_encoder if encoder is None else encoder

    # Run on whichever device the classifier's weights live on, so the function
    # keeps working if the model is later moved to a GPU.
    first_param = next(classifier.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    encoding = text_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    classifier.eval()
    with torch.no_grad():
        outputs = classifier(input_ids, attention_mask=attention_mask)
        # Index of the largest logit; .item() requires a single input text
        prediction = torch.argmax(outputs.logits, dim=-1).item()

    return encoder.inverse_transform([prediction])[0]

# Example usage: classify a single sentence and print the decoded label
example_text = "You think we should relax the school dress code? So you’re fine with students showing up in pajamas!"
print(predict_strawman(example_text))


straw_man
