
# OpSpam Replication: RoBERTa(+LSTM) **and** TF‑IDF + Logistic Baseline

This notebook is pre-configured for your CSV with columns:
- **`text`**: review text
- **`deceptive`**: binary label (1 = deceptive, 0 = truthful)

It trains **two models** and reports comparable metrics/plots:
1. **Baseline:** TF‑IDF + Logistic Regression (mirrors your demo)
2. **RoBERTa (+ optional LSTM)** classifier

Exports: confusion‑matrix, ROC, and PR‑curve plots (PNG) for each model, plus a side‑by‑side metrics table saved as `baseline_vs_roberta.csv`.


# Setup

In [None]:

#@title ⬇️ Install dependencies
!pip -q install transformers==4.44.2 datasets==2.21.0 accelerate==0.34.2 scikit-learn==1.5.1 matplotlib==3.9.0 torch==2.3.1 -U

import os, random, math, json, sys, time, re
from dataclasses import dataclass
from typing import Optional, Dict, Any

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, average_precision_score,
    classification_report, roc_curve, precision_recall_curve
)

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from transformers import (
    AutoTokenizer, AutoModel, AutoConfig,
    TrainingArguments, Trainer
)

# Report whether PyTorch can see a CUDA device; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device:", device)


# Configuration

In [None]:

#@title ⚙️ Configuration
# Checkpoint name handed to AutoTokenizer and to the RoBERTa classifier.
MODEL_NAME = "roberta-base"  #@param ["roberta-base"]
# Token length every review is padded/truncated to when building the datasets.
MAX_LEN = 256  #@param {type:"integer"}
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32  #@param {type:"integer"}
# Number of training epochs passed to TrainingArguments.
EPOCHS = 5  #@param {type:"integer"}
# Optimizer learning rate and weight decay passed to TrainingArguments.
LEARNING_RATE = 2e-5  #@param {type:"number"}
WEIGHT_DECAY = 0.01  #@param {type:"number"}
# Seed shared by set_seed(), both train_test_split calls and the Trainer.
SEED = 42  #@param {type:"integer"}
# When True an LSTM (hidden size LSTM_HIDDEN) is run over RoBERTa's token
# states; when False the first token's hidden state is used directly.
USE_LSTM = True  #@param {type:"boolean"}
LSTM_HIDDEN = 128  #@param {type:"integer"}
# Dropout probability applied right before the final linear classifier.
DROPOUT = 0.6  #@param {type:"number"}

# Baseline config: vocabulary size cap for the TF-IDF vectorizer.
TFIDF_MAX_FEATURES = 5000  #@param {type:"integer"}

def set_seed(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus all CUDA devices),
    then puts cuDNN into deterministic mode with autotuning switched off.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(SEED)


# Data

In [None]:

#@title 📥 Load data
csv_path = ""  #@param {type:"string"}

# File name used when no path is configured and nothing could be uploaded.
DEFAULT_CSV_NAME = "deceptive-opinion.csv"

# Resolve the CSV location. If `csv_path` is empty or does not exist, try the
# Colab upload widget; outside Colab, or when the upload fails or is cancelled,
# fall back to the default file name in the working directory.
if not csv_path or not os.path.exists(csv_path):
    try:
        from google.colab import files  # type: ignore
    except ImportError:
        print("No Colab file picker. Falling back to default name.")
        csv_path = DEFAULT_CSV_NAME
    else:
        print("Upload 'deceptive-opinion.csv'")
        try:
            uploaded = files.upload()
        except Exception as err:
            # Best effort: keep the notebook going, but report the real cause
            # instead of claiming the file picker is missing.
            print(f"Upload failed ({err!r}).")
            uploaded = {}
        if uploaded:
            # The mapping is keyed by file name; use the first uploaded file.
            csv_path = next(iter(uploaded))
        else:
            print("No file uploaded. Falling back to default name.")
            csv_path = DEFAULT_CSV_NAME

# Explicit raise instead of `assert`, so the check survives `python -O`.
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"CSV not found at {csv_path}")

df = pd.read_csv(csv_path)
print("Columns:", df.columns.tolist())
display(df.head(3))


# Columns & Quick Clean

In [None]:

#@title 🔎 Normalize expected columns
TEXT_COL = "text"      # fixed per your demo
LABEL_COL = "deceptive"  # fixed per your demo

# String spellings accepted in the label column, mapped onto the 0/1 encoding
# the rest of the notebook uses (1 = deceptive, 0 = truthful).
# NOTE(review): some distributions of the OpSpam CSV reportedly store the label
# as "truthful"/"deceptive" text -- confirm against your file.
LABEL_NAME_TO_ID = {"deceptive": 1, "truthful": 0}

missing_cols = [col for col in (TEXT_COL, LABEL_COL) if col not in df.columns]
if missing_cols:
    raise KeyError(f"Expected column(s) not found in CSV: {missing_cols}")


def _normalize_labels(labels: pd.Series) -> pd.Series:
    """Return *labels* encoded as plain ints in {0, 1}.

    Accepts numeric 0/1 values (ints, floats, bools or numeric strings) as well
    as the names in ``LABEL_NAME_TO_ID`` (matched case- and
    whitespace-insensitively).

    Raises:
        ValueError: if any value is missing, or is neither 0/1 nor a known
            label name. Previously such values were silently coerced to 0,
            i.e. relabelled as "truthful".
    """
    by_name = labels.astype(str).str.strip().str.lower().map(LABEL_NAME_TO_ID)
    by_value = pd.to_numeric(labels, errors="coerce")
    # Prefer the name mapping; fall back to the numeric interpretation.
    encoded = by_name.where(by_name.notna(), by_value)

    invalid = ~encoded.isin([0, 1])
    if invalid.any():
        examples = labels[invalid].unique()[:5].tolist()
        raise ValueError(
            f"{int(invalid.sum())} row(s) in '{labels.name}' are not valid binary "
            f"labels (expected 0/1 or one of {sorted(LABEL_NAME_TO_ID)}); "
            f"examples: {examples}"
        )
    return encoded.astype(int)


# Ensure labels are 0/1 integers
df[LABEL_COL] = _normalize_labels(df[LABEL_COL])

# Quick text clean like your demo: lowercase and strip everything that is not a
# word character or whitespace. The cleaned column is what both the TF-IDF
# baseline and the RoBERTa tokenizer receive downstream.
df[TEXT_COL] = df[TEXT_COL].astype(str).str.lower().str.replace(r"[^\w\s]", "", regex=True)

print("Label distribution:", df[LABEL_COL].value_counts().to_dict())


# Split

In [None]:

#@title 🔀 Stratified split (80/20) and make a val split from train
# Features are the cleaned review strings; targets are the 0/1 labels.
X, y = df[TEXT_COL].values, df[LABEL_COL].values

# Step 1: hold out 20% of all rows as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=SEED,
)

# Step 2: carve a validation set out of the remaining training rows
# (10% of them), again stratified by label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.10,
    stratify=y_train,
    random_state=SEED,
)

n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
print(f"Train: {n_train}, Val: {n_val}, Test: {n_test}")


# Baseline (TF‑IDF + Logistic)

In [None]:

#@title 🧪 Baseline: TF‑IDF + Logistic Regression
# Learn the vocabulary / IDF weights on the training split only, then reuse
# the fitted vectorizer for the validation and test text.
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words='english')
X_train_t = tfidf.fit_transform(X_train)
X_val_t = tfidf.transform(X_val)
X_test_t = tfidf.transform(X_test)

logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train_t, y_train)

# Second predict_proba column = probability of the second class (label 1 when
# the labels are 0/1); hard predictions use a 0.5 threshold.
y_prob_lr = logreg.predict_proba(X_test_t)[:, 1]
y_pred_lr = (y_prob_lr >= 0.5).astype(int)

# Threshold-based metrics use the hard predictions; AUROC / AP use the scores.
acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr, rec_lr, f1_lr = (
    scorer(y_test, y_pred_lr, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
auroc_lr = roc_auc_score(y_test, y_prob_lr)
ap_lr = average_precision_score(y_test, y_prob_lr)

print("=== TF‑IDF + Logistic (Test) ===")
for caption, score in (
    ("Accuracy", acc_lr),
    ("Precision", prec_lr),
    ("Recall", rec_lr),
    ("F1", f1_lr),
    ("AUROC", auroc_lr),
    ("AP", ap_lr),
):
    # Left-pad captions to 9 characters so the colons line up.
    print(f"{caption:<9}: {score:.4f}")
print("\nClassification report:\n", classification_report(y_test, y_pred_lr, digits=4))

# Plots
# --- ROC curve, with the chance diagonal for reference ---
fpr, tpr, _ = roc_curve(y_test, y_prob_lr)
plt.figure()
plt.plot(fpr, tpr, label=f"AUROC = {auroc_lr:.3f}")
plt.plot([0,1],[0,1], linestyle="--")
plt.title("ROC Curve — TF‑IDF + Logistic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True, linestyle=":")
plt.tight_layout()
plt.savefig("roc_curve_logreg.png", dpi=160)
plt.show()

# --- Precision-recall curve ---
precisions, recalls, _ = precision_recall_curve(y_test, y_prob_lr)
plt.figure()
plt.plot(recalls, precisions, label=f"AP = {ap_lr:.3f}")
plt.title("PR Curve — TF‑IDF + Logistic")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="lower left")
plt.grid(True, linestyle=":")
plt.tight_layout()
plt.savefig("pr_curve_logreg.png", dpi=160)
plt.show()

# --- Confusion matrix (rows = true class, columns = predicted class) ---
cm_lr = confusion_matrix(y_test, y_pred_lr)
class_names = ["Truthful (0)","Deceptive (1)"]
ticks = np.arange(2)
plt.figure()
plt.imshow(cm_lr, interpolation='nearest')
plt.title("Confusion Matrix — TF‑IDF + Logistic")
plt.colorbar()
plt.xticks(ticks, class_names)
plt.yticks(ticks, class_names)
# Write each cell's count at its centre; text x is the column, y is the row.
for (i, j), count in np.ndenumerate(cm_lr):
    plt.text(j, i, format(count, 'd'), ha="center", va="center")
plt.ylabel('True')
plt.xlabel('Predicted')
plt.tight_layout()
plt.savefig("cm_logreg.png", dpi=160)
plt.show()


# Tokenizer & Datasets

In [None]:

#@title 🔡 Tokenize for RoBERTa
from transformers import AutoTokenizer, AutoModel, AutoConfig
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Each item is a dict holding the tokenizer's output tensors (leading
    dimension squeezed away) plus a ``labels`` entry containing the example's
    class id as a ``torch.long`` scalar.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of integer-convertible labels.
        tokenizer: Callable invoked with the text and the padding/truncation
            keyword arguments used below; must return a mapping of tensors.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Fixed-length padding means every item has the same tensor shapes.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer is called on a single string; squeeze(0) drops the
        # leading dimension when it has size 1.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return sample

# Wrap each split in a TextDataset sharing the same tokenizer and MAX_LEN.
train_ds, val_ds, test_ds = (
    TextDataset(split_texts, split_labels, tokenizer, MAX_LEN)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)


# Model

In [None]:

#@title 🧠 RoBERTa(+LSTM) model
class RobertaLSTMClassifier(nn.Module):
    """RoBERTa encoder with an optional LSTM summary layer and a 2-way classifier.

    Pipeline: encoder token states -> (LSTM final hidden state | first-token
    state) -> BatchNorm1d -> Dropout -> Linear(2).

    Args:
        model_name: Checkpoint name passed to ``AutoConfig`` / ``AutoModel``.
        lstm_hidden: Hidden size of the single-layer, unidirectional LSTM.
        dropout: Dropout probability applied before the final linear layer.
        use_lstm: If True, summarize the sequence with the LSTM; otherwise use
            the hidden state at position 0.

    NOTE(review): ``BatchNorm1d`` in training mode needs more than one sample
    per batch, so a trailing training batch of size 1 would fail.
    """

    def __init__(self, model_name, lstm_hidden=128, dropout=0.6, use_lstm=True):
        super().__init__()
        self.use_lstm = use_lstm
        self.config = AutoConfig.from_pretrained(model_name)
        self.roberta = AutoModel.from_pretrained(model_name, add_pooling_layer=False)
        hidden = self.config.hidden_size
        if self.use_lstm:
            self.lstm = nn.LSTM(input_size=hidden, hidden_size=lstm_hidden,
                                num_layers=1, batch_first=True, bidirectional=False)
            feat = lstm_hidden
        else:
            feat = hidden
        self.bn = nn.BatchNorm1d(feat)
        self.drop = nn.Dropout(dropout)
        self.cls = nn.Linear(feat, 2)

    def _lstm_summary(self, token_states, attention_mask):
        """Return the LSTM hidden state after the last *real* token of each row.

        Inputs are padded to a fixed length, so running the LSTM over the full
        sequence would yield the state after the padding positions. Packing
        the sequence with per-row lengths makes the LSTM stop at the last
        attended token instead.

        Assumes right-padding (attended tokens form a prefix of each row), so
        the count of ones in ``attention_mask`` equals the real length.
        """
        if attention_mask is None:
            # No mask supplied: treat every position as a real token.
            _, (h, _) = self.lstm(token_states)
            return h[-1]

        # pack_padded_sequence wants int64 lengths on the CPU, each >= 1.
        lengths = attention_mask.sum(dim=1).clamp(min=1).long().cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            token_states, lengths, batch_first=True, enforce_sorted=False
        )
        # h comes back in the original batch order even though packing sorts rows.
        _, (h, _) = self.lstm(packed)
        return h[-1]

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Compute logits and, when ``labels`` is given, the cross-entropy loss.

        Returns:
            dict with ``"loss"`` (scalar tensor or ``None``) and ``"logits"``
            (one row of two class scores per input example).
        """
        out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last = out.last_hidden_state
        if self.use_lstm:
            feat = self._lstm_summary(last, attention_mask)
        else:
            # Hidden state of the first token of each sequence.
            feat = last[:, 0, :]
        feat = self.bn(feat)
        feat = self.drop(feat)
        logits = self.cls(feat)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}


# Train (RoBERTa)

In [None]:

#@title 🚀 Train RoBERTa(+LSTM)
from transformers import TrainingArguments, Trainer

model = RobertaLSTMClassifier(MODEL_NAME, LSTM_HIDDEN, DROPOUT, USE_LSTM)

def compute_metrics_fn(p):
    """Turn an evaluation result into the metric dict reported for each eval.

    ``p.predictions`` is treated as the logits, or as a tuple whose first
    element is the logits. Logits are soft-maxed, the second column is used
    as the positive-class score, and hard predictions come from a 0.5
    threshold on that score.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``auroc`` and ``ap``.
    """
    raw = p.predictions
    logits = raw[0] if isinstance(raw, tuple) else raw
    pos_scores = torch.softmax(torch.tensor(logits), dim=-1).numpy()[:, 1]
    hard_preds = (pos_scores >= 0.5).astype(int)
    gold = p.label_ids

    metrics = {"accuracy": accuracy_score(gold, hard_preds)}
    for key, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        metrics[key] = scorer(gold, hard_preds, zero_division=0)
    # Ranking metrics are computed from the continuous scores, not the 0/1 predictions.
    metrics["auroc"] = roc_auc_score(gold, pos_scores)
    metrics["ap"] = average_precision_score(gold, pos_scores)
    return metrics

# Evaluate and checkpoint once per epoch; after training, reload the checkpoint
# with the lowest validation loss.
args = TrainingArguments(
    output_dir="outputs_roberta",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    # `evaluation_strategy` is deprecated in the transformers release pinned by
    # the install cell; `eval_strategy` is its replacement.
    eval_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Lower loss is better; stated explicitly rather than relying on the default.
    greater_is_better=False,
    logging_steps=50,
    # Empty list: no external experiment-tracking integrations.
    report_to=[], seed=SEED
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics_fn,
)
trainer.train()


# Evaluate (RoBERTa)

In [None]:

#@title 📊 Evaluate RoBERTa(+LSTM) & plot
pred = trainer.predict(test_ds)
# Same handling as compute_metrics_fn: predictions may arrive as a tuple whose
# first element holds the logits, so unwrap before building a tensor.
logits_rb = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
# Softmax over the class logits; column 1 is the score for label 1.
probs_rb = torch.softmax(torch.tensor(logits_rb), dim=-1).numpy()[:,1]
y_true = pred.label_ids
y_pred_rb = (probs_rb >= 0.5).astype(int)

# Threshold-based metrics use the hard predictions; AUROC / AP use the scores.
acc_rb  = accuracy_score(y_true, y_pred_rb)
prec_rb = precision_score(y_true, y_pred_rb, zero_division=0)
rec_rb  = recall_score(y_true, y_pred_rb, zero_division=0)
f1_rb   = f1_score(y_true, y_pred_rb, zero_division=0)
auroc_rb = roc_auc_score(y_true, probs_rb)
ap_rb    = average_precision_score(y_true, probs_rb)

print("=== RoBERTa(+LSTM) (Test) ===")
print(f"Accuracy : {acc_rb:.4f}")
print(f"Precision: {prec_rb:.4f}")
print(f"Recall   : {rec_rb:.4f}")
print(f"F1       : {f1_rb:.4f}")
print(f"AUROC    : {auroc_rb:.4f}")
print(f"AP       : {ap_rb:.4f}")
print("\nClassification report:\n", classification_report(y_true, y_pred_rb, digits=4))

# ROC
# ROC curve with the chance diagonal for reference.
fpr_rb, tpr_rb, _ = roc_curve(y_true, probs_rb)
plt.figure()
plt.plot(fpr_rb, tpr_rb, label=f"RoBERTa AUROC={auroc_rb:.3f}")
plt.plot([0,1],[0,1], linestyle="--")
plt.title("ROC Curve — RoBERTa(+LSTM)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True, linestyle=":")
plt.tight_layout()
plt.savefig("roc_curve_roberta.png", dpi=160)
plt.show()

# PR
prec_rb_c, rec_rb_c, _ = precision_recall_curve(y_true, probs_rb)
plt.figure()
plt.plot(rec_rb_c, prec_rb_c, label=f"RoBERTa AP={ap_rb:.3f}")
plt.title("PR Curve — RoBERTa(+LSTM)")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="lower left")
plt.grid(True, linestyle=":")
plt.tight_layout()
plt.savefig("pr_curve_roberta.png", dpi=160)
plt.show()

# Confusion matrix (rows = true class, columns = predicted class)
cm_rb = confusion_matrix(y_true, y_pred_rb)
class_names = ["Truthful (0)","Deceptive (1)"]
ticks = np.arange(2)
plt.figure()
plt.imshow(cm_rb, interpolation='nearest')
plt.title("Confusion Matrix — RoBERTa(+LSTM)")
plt.colorbar()
plt.xticks(ticks, class_names)
plt.yticks(ticks, class_names)
# Write each cell's count at its centre; text x is the column, y is the row.
for (i, j), count in np.ndenumerate(cm_rb):
    plt.text(j, i, format(count, 'd'), ha="center", va="center")
plt.ylabel('True')
plt.xlabel('Predicted')
plt.tight_layout()
plt.savefig("cm_roberta.png", dpi=160)
plt.show()


# Compare Models

In [None]:

#@title 🧾 Side-by-side table
# One row per metric, one column per model; both score lists follow the order
# of `metric_names`.
metric_names = ["Accuracy","Precision","Recall","F1","AUROC","AP"]
baseline_scores = [acc_lr, prec_lr, rec_lr, f1_lr, auroc_lr, ap_lr]
roberta_scores = [acc_rb, prec_rb, rec_rb, f1_rb, auroc_rb, ap_rb]

side = pd.DataFrame({
    "Metric": metric_names,
    "TF-IDF+LogReg": baseline_scores,
    "RoBERTa(+LSTM)": roberta_scores,
})
display(side)

# Persist the comparison next to the notebook (no index column).
side.to_csv("baseline_vs_roberta.csv", index=False)
print("Saved: baseline_vs_roberta.csv")
