In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from torch.optim import AdamW

In [None]:
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from tqdm import tqdm
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, get_scheduler

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# The split files are read with header=None, so column names are attached here.
cols = [
    "id", "label", "statement", "subject", "speaker", "job_title", "state", "party",
    "barely_true_ct", "false_ct", "half_true_ct", "mostly_true_ct", "pants_fire_ct",
    "context",
]


def _read_liar_split(path):
    """Read one tab-separated split file and name its columns with ``cols``."""
    # NOTE(review): read_csv uses its default quote handling here; rows that
    # contain unbalanced double quotes could be merged together. Compare the
    # resulting frame lengths with the dataset's published split sizes.
    frame = pd.read_csv(path, sep="\t", header=None)
    frame.columns = cols
    return frame


train_df = _read_liar_split("/kaggle/input/liar-dataset/train.tsv")
val_df = _read_liar_split("/kaggle/input/liar-dataset/valid.tsv")
test_df = _read_liar_split("/kaggle/input/liar-dataset/test.tsv")

In [None]:
# Collapse the six rating strings into a binary target:
# "mostly-true" and "true" become 1, every other rating becomes 0.
label_map = {
    "pants-fire": 0,
    "false": 0,
    "barely-true": 0,
    "half-true": 0,
    "mostly-true": 1,
    "true": 1
}


def _to_binary_labels(labels):
    """Convert a Series of rating strings to integer 0/1 labels.

    Values that are already 0 or 1 are left untouched, so running this cell a
    second time does not turn every label into NaN (which is what a plain
    ``.map(label_map)`` does on its second application).

    Args:
        labels: pandas Series holding rating strings and/or 0/1 values.

    Returns:
        An integer Series containing only 0 and 1.

    Raises:
        ValueError: if any value is neither a key of ``label_map`` nor 0/1
            (this includes missing values).
    """
    binary = labels.map(lambda value: label_map.get(value, value))
    unexpected = binary[~binary.isin([0, 1])]
    if not unexpected.empty:
        # Fail here rather than letting NaN labels crash later in the pipeline.
        raise ValueError(
            "Unexpected label values: "
            + ", ".join(sorted({repr(value) for value in unexpected}))
        )
    return binary.astype(int)


for df in [train_df, val_df, test_df]:
    df["label"] = _to_binary_labels(df["label"])

In [None]:
def clean_text(text):
    """Normalise a piece of text to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase the input, replace URL-like tokens (anything
    starting with ``http`` or ``www`` up to the next whitespace) with a space,
    replace every character that is not a letter or whitespace with a space,
    then collapse whitespace runs and strip both ends.

    Args:
        text: value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Missing values would otherwise be stringified to the literal words
    # "none" / "nan" and survive the cleaning as real tokens.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Columns whose missing values are replaced by "" before cleaning.
_FILLED_TEXT_COLUMNS = ("context", "speaker", "party")

for frame in (train_df, val_df, test_df):
    # The statement column is cleaned as-is, without a fillna step.
    frame["statement"] = frame["statement"].apply(clean_text)
    for column in _FILLED_TEXT_COLUMNS:
        frame[column] = frame[column].fillna("").apply(clean_text)

In [None]:
def combine_text(row):
    """Build one input string from a row's statement, speaker, party and context.

    ``row`` is any mapping-like object indexable by the keys "statement",
    "speaker", "party" and "context"; all four values must be strings.
    """
    # NOTE(review): "[SEP]" is inserted as plain text. RoBERTa tokenizers use
    # "</s>" as their separator, so confirm this literal is what is intended.
    pieces = (
        row["statement"],
        " [SEP] speaker: ",
        row["speaker"],
        " party: ",
        row["party"],
        " context: ",
        row["context"],
    )
    return "".join(pieces)

# One combined input string per row, for each of the three splits.
train_texts, val_texts, test_texts = [
    split.apply(combine_text, axis=1).tolist()
    for split in (train_df, val_df, test_df)
]

# Label lists in the same row order as the text lists above.
train_labels, val_labels, test_labels = [
    split["label"].tolist()
    for split in (train_df, val_df, test_df)
]

In [None]:
class LIARDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: integer-indexable sequence of input texts.
        labels: integer-indexable sequence of labels, aligned with ``texts``;
            each value must be convertible with ``int``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` that
            returns a mapping with "input_ids" and "attention_mask" tensors
            carrying a leading batch axis of size 1.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label at ``idx`` as a dict of tensors.

        Keys: "input_ids", "attention_mask" (both without the batch axis) and
        "labels" (a scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            # squeeze(0) drops only the batch axis; an axis-less squeeze()
            # would also collapse the sequence axis when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

train_dataset = LIARDataset(train_texts, train_labels, tokenizer)
val_dataset   = LIARDataset(val_texts, val_labels, tokenizer)
test_dataset  = LIARDataset(test_texts, test_labels, tokenizer)

# Despite its name, `class_weights` holds one weight per training example:
# the inverse of the frequency of that example's class. Sampling with these
# weights (with replacement) draws the classes at roughly equal rates.
class_counts = train_df["label"].value_counts().to_dict()
class_weights = (1.0 / train_df["label"].map(class_counts)).tolist()
sampler = WeightedRandomSampler(class_weights, num_samples=len(class_weights), replacement=True)

# Single batch size shared by all three loaders.
BATCH_SIZE = 16

# The sampler decides the training order; validation/test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Load roberta-base configured for two output labels and move it to `device`.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.to(device)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)

# Total number of optimisation steps: batches per epoch times 6 epochs.
num_training_steps = 6 * len(train_loader)

# Linear schedule over those steps, with no warm-up phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, loss_fn, epochs=6):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    After every epoch the model is scored on ``val_loader`` through the
    module-level ``evaluate`` function. Whenever the validation F1 improves,
    the weights are written to "best_roberta.pt". Training stops early after
    two consecutive epochs without improvement. Tensors are moved to the
    module-level ``device``.

    Args:
        model: model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels`` is passed).
        train_loader: iterable of batches with "input_ids", "attention_mask"
            and "labels" tensors.
        val_loader: loader handed to ``evaluate`` after each epoch.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        loss_fn: callable ``loss_fn(logits, labels)`` returning the loss to
            back-propagate. If ``None``, the model's own ``outputs.loss`` is used.
        epochs: maximum number of epochs.

    Returns:
        None.
    """
    # Start below any achievable score so the first epoch always writes a
    # checkpoint, even when its validation F1 is exactly 0.
    best_f1, patience, patience_counter = float("-inf"), 2, 0

    for epoch in range(epochs):
        model.train()
        loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")
        total_loss, total_correct, total_seen = 0.0, 0, 0

        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            if loss_fn is None:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            else:
                # Honour the caller-supplied loss instead of silently ignoring it.
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
            logits = outputs.logits

            loss.backward()
            # Clip the global gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            preds = torch.argmax(logits.detach(), dim=1)
            batch_correct = (preds == labels).sum().item()
            batch_seen = labels.size(0)

            total_loss += loss.item()
            total_correct += batch_correct
            total_seen += batch_seen

            loop.set_postfix(loss=loss.item(), acc=batch_correct / max(batch_seen, 1))

        avg_loss = total_loss / max(len(train_loader), 1)
        # Accuracy over samples, so a short final batch is not over-weighted.
        avg_acc = total_correct / max(total_seen, 1)
        print(f"\nEpoch {epoch+1} | Train Loss: {avg_loss:.4f} | Train Acc: {avg_acc:.4f}")

        _, val_f1 = evaluate(model, val_loader, split="Validation")

        # Early stopping
        if val_f1 > best_f1:
            torch.save(model.state_dict(), "best_roberta.pt")
            best_f1 = val_f1
            patience_counter = 0
            print(" Saved new best model")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(" Early stopping triggered")
                break

In [None]:
def evaluate(model, loader, split="Test"):
    """Score ``model`` on ``loader`` and print the metrics.

    The model is switched to eval mode and run without gradient tracking on
    the module-level ``device``. Accuracy and weighted F1 are always printed;
    for any ``split`` other than "Validation" a classification report and a
    confusion matrix are printed as well.

    Args:
        model: model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: iterable of batches with "input_ids", "attention_mask" and
            "labels" tensors.
        split: name used in the printed messages.

    Returns:
        Tuple ``(accuracy, weighted_f1)``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(batch["labels"].cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    print(f"{split} Accuracy: {accuracy:.4f}, {split} F1: {weighted_f1:.4f}")

    # The detailed report is skipped for the per-epoch validation passes.
    if split == "Validation":
        return accuracy, weighted_f1

    print(classification_report(expected, predicted))
    print("Confusion Matrix:\n", confusion_matrix(expected, predicted))
    return accuracy, weighted_f1


In [None]:
train_model(model, train_loader, val_loader, optimizer, lr_scheduler, loss_fn, epochs=6)

# Reload the checkpoint with the best validation F1 before scoring the test
# split; map_location keeps the load working when the checkpoint was written
# on a different device than the current one.
model.load_state_dict(torch.load("best_roberta.pt", map_location=device))

# Keep the final metrics: the export cell reads test_acc and test_f1.
test_acc, test_f1 = evaluate(model, test_loader, split="Final Test")

In [None]:
import json
import shutil

import torch

# 1. Write the model and tokenizer files into a single folder.
save_dir = "/kaggle/working/roberta_liar_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# 2. Store run metadata next to them. test_acc and test_f1 are expected to
#    hold the final test-set metrics.
extra_info = {
    "task": "Fake News Detection - LIAR dataset (binary)",
    "model": "roberta-base",
    "final_test_accuracy": float(test_acc),
    "final_test_f1": float(test_f1)
}
meta_path = f"{save_dir}/meta.json"
with open(meta_path, "w") as meta_file:
    json.dump(extra_info, meta_file, indent=4)

# 3. Zip the folder; the archive is created as "<save_dir>.zip".
shutil.make_archive(save_dir, "zip", save_dir)

print("Model + tokenizer saved and zipped in /kaggle/working/")
