<a href="https://colab.research.google.com/github/markosnakos/Assignement-NLP-FakeNews/blob/main/hierarchical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# Hierarchical Fake News Classification with W&B
# ============================================================

!pip -q install -U datasets transformers accelerate evaluate wandb

import random
import numpy as np
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate
import wandb

# =========================
# 0) Setup
# =========================
# One seed shared by every random number generator this notebook touches.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Report which accelerator is visible to PyTorch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Authenticate with Weights & Biases before any run is created.
wandb.login()

# Checkpoint used for both the tokenizer and the classifiers.
MODEL_NAME = "roberta-base"
# Upper bound on tokens per example; longer inputs are truncated in `tokenize`.
MAX_LEN = 128

# =========================
# 1) Load datasets
# =========================
# Hub identifiers of the two corpora used below.
GC_NAME = "Jinyan1/GossipCop"
PF_NAME = "Jinyan1/PolitiFact"


def load_four_splits(name):
    """Load the "HR", "HF", "MR" and "MF" splits of dataset `name`.

    Args:
        name: Dataset identifier passed straight to `load_dataset`.

    Returns:
        A 4-tuple of datasets ordered as (HR, HF, MR, MF).
    """
    return tuple(
        load_dataset(name, split=split_tag)
        for split_tag in ("HR", "HF", "MR", "MF")
    )


# Unpack in the same (HR, HF, MR, MF) order that load_four_splits returns.
gc_hr, gc_hf, gc_mr, gc_mf = load_four_splits(GC_NAME)
pf_hr, pf_hf, pf_mr, pf_mf = load_four_splits(PF_NAME)

# =========================
# 2) Tokenization utilities
# =========================
# Fast tokenizer matching MODEL_NAME; shared by `tokenize` and the trainers.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)
# Collator that pads every batch with this tokenizer when batches are built.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

def make_input(ex):
    """Choose the text that will be fed to the tokenizer for one example.

    The title and description are joined with a single space and stripped.
    If the result is longer than 5 characters it is used; otherwise the
    example's "text" field is used, or a single space when that is empty
    too. Missing or falsy fields (e.g. ``None``) are treated as "".

    Args:
        ex: Mapping-like example supporting ``.get``.

    Returns:
        A dict with the single key "input_text".
    """
    headline_parts = [ex.get(field, "") or "" for field in ("title", "description")]
    headline = " ".join(headline_parts).strip()
    if len(headline) > 5:
        return {"input_text": headline}

    # Headline too short to be informative: fall back to the article body.
    body = ex.get("text", "") or ""
    return {"input_text": body if body else " "}

def tokenize(batch):
    """Tokenize the "input_text" entries of a batch.

    Inputs longer than MAX_LEN tokens are truncated; no padding is applied
    here.

    Args:
        batch: Mapping with an "input_text" entry holding the texts.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    texts = batch["input_text"]
    encoded = tokenizer(texts, max_length=MAX_LEN, truncation=True)
    return encoded

def prepare(ds):
    """Turn a labelled raw dataset into model-ready tokenized features.

    Steps: derive "input_text" per example, tokenize in batches (dropping
    the temporary "input_text" column), then keep only the columns the
    model consumes.

    Args:
        ds: Dataset that already carries a "label" column.

    Returns:
        Dataset with exactly "input_ids", "attention_mask" and "label".
    """
    with_text = ds.map(make_input)
    encoded = with_text.map(
        tokenize,
        batched=True,
        remove_columns=["input_text"],
    )
    model_columns = ["input_ids", "attention_mask", "label"]
    return encoded.select_columns(model_columns)

# =========================
# 3) Dataset builders
# =========================
def add_label(ds, label):
    """Return `ds` with a constant "label" column appended.

    Args:
        ds: Sized dataset exposing ``add_column(name, column)``.
        label: Value stored in every row of the new column.

    Returns:
        The result of ``ds.add_column`` with one `label` entry per row.
    """
    row_count = len(ds)
    constant_column = [label for _ in range(row_count)]
    return ds.add_column("label", constant_column)

def _labelled_mix(*labelled_parts):
    """Label, concatenate, shuffle (seeded with SEED) and tokenize datasets.

    Each positional argument is a ``(dataset, label)`` pair.
    """
    tagged = [add_label(part, label) for part, label in labelled_parts]
    mixed = concatenate_datasets(tagged).shuffle(seed=SEED)
    return prepare(mixed)


# Training data always comes from the GossipCop splits (gc_*), evaluation
# data from the PolitiFact splits (pf_*).

# ---- Authorship: human splits (HR, HF) -> 0, machine splits (MR, MF) -> 1
auth_train = _labelled_mix((gc_hr, 0), (gc_hf, 0), (gc_mr, 1), (gc_mf, 1))
auth_test = _labelled_mix((pf_hr, 0), (pf_hf, 0), (pf_mr, 1), (pf_mf, 1))

# ---- Human veracity: HR -> 0, HF -> 1
human_train = _labelled_mix((gc_hr, 0), (gc_hf, 1))
human_test = _labelled_mix((pf_hr, 0), (pf_hf, 1))

# ---- Machine veracity: MR -> 0, MF -> 1
machine_train = _labelled_mix((gc_mr, 0), (gc_mf, 1))
machine_test = _labelled_mix((pf_mr, 0), (pf_mf, 1))

# =========================
# 4) Training utilities
# =========================
# Accuracy metric, loaded once and reused by every evaluation call.
acc = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item iterable of model scores and reference labels.

    Returns:
        The result of ``acc.compute`` for the arg-max class of each row.
    """
    scores, gold = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return acc.compute(references=gold, predictions=predicted_classes)

def train_model(
    train_ds,
    eval_ds,
    run_name,
    *,
    epochs=5,
    batch_size=32,
    lr=3e-5,
    grad_accum_steps=2,
):
    """Fine-tune a fresh two-class MODEL_NAME classifier inside one W&B run.

    Args:
        train_ds: Tokenized training dataset (as produced by `prepare`).
        eval_ds: Tokenized dataset evaluated once after training.
        run_name: W&B run name; also used as the output directory name.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size. The effective batch per
            device is ``batch_size * grad_accum_steps``.
        lr: Learning rate.
        grad_accum_steps: Gradient accumulation steps.

    Returns:
        The `Trainer` holding the fine-tuned model.

    Raises:
        Any exception raised while building or training the model is
        propagated; the W&B run is closed first.
    """
    import inspect

    # Mixed precision needs a CUDA device; asking for fp16 on a CPU-only
    # runtime makes TrainingArguments fail instead of training more slowly.
    use_fp16 = torch.cuda.is_available()

    wandb.init(
        project="hierarchical-fake-news",
        name=run_name,
        # Logged from the same variables that configure TrainingArguments so
        # the recorded hyperparameters cannot drift from the real ones.
        config={
            "model": MODEL_NAME,
            "max_len": MAX_LEN,
            "epochs": epochs,
            "batch_size": batch_size,
            "lr": lr,
            "gradient_accumulation_steps": grad_accum_steps,
            "fp16": use_fp16,
        },
        reinit=True,
    )

    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=2
        )

        args = TrainingArguments(
            output_dir=f"./{run_name}",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=64,
            gradient_accumulation_steps=grad_accum_steps,
            num_train_epochs=epochs,
            fp16=use_fp16,
            logging_steps=50,
            seed=SEED,
            report_to=["wandb"],
            remove_unused_columns=False,
        )

        # Newer transformers releases renamed Trainer's `tokenizer` argument
        # to `processing_class`; use whichever name this version accepts.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        if "processing_class" in trainer_params:
            tokenizer_kwarg = "processing_class"
        else:
            tokenizer_kwarg = "tokenizer"

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=eval_ds,
            data_collator=collator,
            compute_metrics=compute_metrics,
            **{tokenizer_kwarg: tokenizer},
        )

        trainer.train()
        # No evaluation strategy is configured, so without this call
        # eval_ds / compute_metrics would never be used and no accuracy
        # would be reported for the run.
        trainer.evaluate()
    finally:
        # Close the run even when training fails, so the next stage starts
        # from a clean W&B state.
        wandb.finish()
    return trainer

# =========================
# 5) Train all models
# =========================
# Fine-tune the three classifiers one after another. Each call trains on the
# GossipCop-derived dataset, receives the PolitiFact-derived one as its eval
# set, and runs inside its own W&B run named by the last argument.
# Stage 1: human-vs-machine authorship classifier.
trainer_auth = train_model(auth_train, auth_test, "Stage1_Authorship")
# Stage 2a: veracity classifier for the human splits (HR vs HF).
trainer_human = train_model(human_train, human_test, "Stage2_Human_Veracity")
# Stage 2b: veracity classifier for the machine splits (MR vs MF).
trainer_machine = train_model(machine_train, machine_test, "Stage2_Machine_Veracity")

# =========================
# 6) Hierarchical inference
# =========================
def hierarchical_predict(dataset):
    """Classify every row of `dataset` as "HR", "HF", "MR" or "MF".

    Stage 1 (`trainer_auth`) predicts authorship for all rows. Rows predicted
    as class 0 are then scored by `trainer_human` (0 -> "HR", 1 -> "HF");
    all other rows are scored by `trainer_machine` (0 -> "MR", 1 -> "MF").

    Each second-stage model is run once over its whole subset rather than
    once per row, which avoids setting up a separate prediction loop for
    every single example.

    Args:
        dataset: Dataset supporting ``select`` and accepted by
            ``Trainer.predict``; presumably tokenized the same way as the
            training data (e.g. via `prepare`).

    Returns:
        A list of label strings, one per row, in the row order of `dataset`.
    """
    auth_out = trainer_auth.predict(dataset)
    auth_preds = np.argmax(auth_out.predictions, axis=-1)

    final_preds = [None] * len(auth_preds)
    routed_to_human = auth_preds == 0
    routes = (
        (routed_to_human, trainer_human, "HR", "HF"),
        (~routed_to_human, trainer_machine, "MR", "MF"),
    )
    for mask, stage2_trainer, tag_for_0, tag_for_1 in routes:
        rows = np.flatnonzero(mask).tolist()
        if not rows:
            # Nothing was routed to this branch; skip predicting on an
            # empty dataset.
            continue
        out = stage2_trainer.predict(dataset.select(rows))
        verdicts = np.argmax(out.predictions, axis=-1)
        # Scatter the subset's predictions back to their original positions.
        for row, verdict in zip(rows, verdicts):
            final_preds[row] = tag_for_0 if verdict == 0 else tag_for_1
    return final_preds


print("Hierarchical pipeline ready.")

