<a href="https://colab.research.google.com/github/markosnakos/Assignement-NLP-FakeNews/blob/main/FakeNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

🔹 Cell 1 — Install Dependencies

In [None]:
# ============================================================
# 0) Setup
# ============================================================
!pip -q install -U datasets transformers accelerate evaluate

!pip -q install -U wandb
import numpy as np
import random
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate


import wandb
wandb.login()  # will ask for an API key the first time

# NOTE(review): this opens a W&B run before the Trainer exists; presumably the
# Trainer's wandb reporting then reuses this run, in which case the `run_name`
# given to TrainingArguments later would not be applied — confirm.
wandb.init()




# Seed Python's `random`, NumPy and PyTorch with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# ============================================================
# 1) Load datasets (GossipCop++ + PolitiFact++)
#    Expect splits: HR, HF, MR, MF
# ============================================================
# Dataset identifiers handed to `load_dataset` through `load_four_splits`.
GC_NAME = "Jinyan1/GossipCop"
PF_NAME = "Jinyan1/PolitiFact"

def load_four_splits(ds_name):
    """Load the HR, HF, MR and MF splits of the dataset `ds_name`.

    Returns:
        A 4-tuple `(hr, hf, mr, mf)`, each element being the result of
        `load_dataset(ds_name, split=<name>)` for the matching split name.
    """
    split_names = ("HR", "HF", "MR", "MF")
    return tuple(load_dataset(ds_name, split=name) for name in split_names)

# Fetch the four splits of each corpus, then report the row count of every split.
gc_hr, gc_hf, gc_mr, gc_mf = load_four_splits(GC_NAME)
pf_hr, pf_hf, pf_mr, pf_mf = load_four_splits(PF_NAME)

print("GossipCop sizes:", *(len(part) for part in (gc_hr, gc_hf, gc_mr, gc_mf)))
print("PolitiFact sizes:", *(len(part) for part in (pf_hr, pf_hf, pf_mr, pf_mf)))

# ============================================================
# 2) Add labels + subclass
#    label: 0=real, 1=fake
#    subclass: HR/HF/MR/MF
# ============================================================
def add_labels(ds, subclass):
    """Return `ds` extended with a binary `label` column and a `subclass` column.

    Args:
        ds: dataset supporting `len()` and `add_column(name, values)`.
        subclass: subclass tag written to every row ("HR", "HF", "MR" or "MF").

    Returns:
        The dataset with `label` set to 1 when `subclass` is "HF" or "MF"
        (fake) and 0 otherwise, plus `subclass` repeated on every row.
    """
    label_value = int(subclass in ("HF", "MF"))
    labelled = ds.add_column("label", [label_value] * len(ds))
    return labelled.add_column("subclass", [subclass] * len(labelled))

# Tag every split of both corpora with its binary label and subclass name.
gc_hr, gc_hf, gc_mr, gc_mf = [
    add_labels(split, tag)
    for split, tag in zip((gc_hr, gc_hf, gc_mr, gc_mf), ("HR", "HF", "MR", "MF"))
]

pf_hr, pf_hf, pf_mr, pf_mf = [
    add_labels(split, tag)
    for split, tag in zip((pf_hr, pf_hf, pf_mr, pf_mf), ("HR", "HF", "MR", "MF"))
]

# ============================================================
# 3) Utilities: sampling + mixtures (paper-style settings)
# ============================================================
def sample_n(ds, n, seed=SEED):
    """Return up to `n` rows of `ds`, chosen by a seeded shuffle.

    The request is capped at `len(ds)`, so asking for more rows than exist
    yields the whole (shuffled) dataset.
    """
    take = min(n, len(ds))
    shuffled = ds.shuffle(seed=seed)
    return shuffled.select(range(take))

def build_train_set(setting, hr, hf, mr, mf, mf_ratio=0.0,
                    real_size=2000, fake_size=2000, seed=SEED):
    """Assemble a shuffled training set of real and fake examples.

    Args:
        setting: which pool supplies the real examples:
          - "human_legacy": real=HR
          - "transitional": real=HR+MR (simple approximation)
          - "machine_dominance": real=MR
        hr, hf, mr, mf: the four subclass datasets.
        mf_ratio: fraction of fake examples that are MF (rest HF); must lie
            in [0, 1].
        real_size: number of real examples requested (capped by `sample_n`
            at the size of the real pool).
        fake_size: number of fake examples requested, split between HF and
            MF according to `mf_ratio`.
        seed: base seed; the real, HF and MF samples use `seed`, `seed + 1`
            and `seed + 2`, and the final shuffle uses `seed`.

    Returns:
        The concatenation of the sampled real and fake examples, shuffled.

    Raises:
        ValueError: if `setting` is unknown, `mf_ratio` is outside [0, 1],
            or a requested size is negative.
    """
    if setting == "human_legacy":
        real_pool = hr
    elif setting == "transitional":
        real_pool = concatenate_datasets([hr, mr])
    elif setting == "machine_dominance":
        real_pool = mr
    else:
        raise ValueError("setting must be: human_legacy, transitional, machine_dominance")

    # Out-of-range values would otherwise produce a negative sample count,
    # which silently selects nothing and skews the fake-class composition.
    if not 0.0 <= mf_ratio <= 1.0:
        raise ValueError(f"mf_ratio must be within [0, 1], got {mf_ratio!r}")
    if real_size < 0 or fake_size < 0:
        raise ValueError("real_size and fake_size must be non-negative")

    real_train = sample_n(real_pool, real_size, seed=seed)

    mf_n = int(fake_size * mf_ratio)
    hf_n = fake_size - mf_n
    fake_train = concatenate_datasets([
        sample_n(hf, hf_n, seed=seed + 1),
        sample_n(mf, mf_n, seed=seed + 2),
    ])

    return concatenate_datasets([real_train, fake_train]).shuffle(seed=seed)

def build_test_set(hr, hf, mr, mf, test_size_each=500, seed=SEED):
    """Build a shuffled test set with up to `test_size_each` rows per subclass.

    The HR, HF, MR and MF pools are sampled with seeds `seed + 10` through
    `seed + 13` respectively; the concatenation is then shuffled with `seed`.
    """
    pools = (hr, hf, mr, mf)
    samples = [
        sample_n(pool, test_size_each, seed=seed + 10 + offset)
        for offset, pool in enumerate(pools)
    ]
    return concatenate_datasets(samples).shuffle(seed=seed)

# Evaluation sets: GossipCop is later reported as in-domain and PolitiFact as
# out-of-domain. Each requests `test_size_each` rows per subclass (sample_n
# caps the request at the pool size).
gc_test = build_test_set(gc_hr, gc_hf, gc_mr, gc_mf, test_size_each=500)
pf_test = build_test_set(pf_hr, pf_hf, pf_mr, pf_mf, test_size_each=200)

print("GC test:", len(gc_test), "PF test:", len(pf_test))

# ============================================================
# 4) Tokenization (T4 fast + robust)
# ============================================================
# Checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "roberta-base"      # faster alt: "distilroberta-base"
# Passed as `max_length` when tokenizing (inputs are truncated to it).
MAX_LEN = 128                    # speedup vs 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Collator given to the Trainer; tokenization itself requests no padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def make_input(example):
    """Pick the text that represents one example for the model.

    The stripped "title description" string is used when it has at least
    five characters; otherwise the raw `text` field is used. Missing or
    falsy fields count as empty strings, and if the chosen value is blank
    a single space is returned instead.

    Returns:
        A dict with the single key `input_text`.
    """
    title, desc, body = (
        example.get(field, "") or "" for field in ("title", "description", "text")
    )

    headline = " ".join((title, desc)).strip()
    chosen = headline if len(headline) >= 5 else body
    if not str(chosen).strip():
        # Never hand the tokenizer an empty string.
        chosen = " "
    return {"input_text": chosen}

def tokenize(batch):
    """Tokenize the `input_text` field of `batch`, truncating to MAX_LEN.

    No padding argument is passed here; `prepare` calls this through
    `ds.map(..., batched=True)`.
    """
    return tokenizer(batch["input_text"], truncation=True, max_length=MAX_LEN)

def prepare(ds, keep_subclass=False):
    """Turn a raw dataset into model-ready columns.

    Builds `input_text` with `make_input`, tokenizes it (dropping the
    temporary `input_text` column), and keeps only `input_ids`,
    `attention_mask` and `label` — plus `subclass` when `keep_subclass`
    is true. Columns that are not present are skipped.
    """
    tokenized = ds.map(make_input).map(
        tokenize, batched=True, remove_columns=["input_text"]
    )

    wanted = ["input_ids", "attention_mask", "label"]
    if keep_subclass:
        wanted = wanted + ["subclass"]

    available = tokenized.column_names
    return tokenized.select_columns([name for name in wanted if name in available])

# ============================================================
# 5) Build experiment datasets
# ============================================================
SETTING = "human_legacy"   # "transitional" / "machine_dominance"
MF_RATIO = 0.50            # 0.0 / 0.33 / 0.5 / 0.67 / 1.0

# The GossipCop train and test samples are drawn from the same pools with
# different seeds, so without this step test rows can also end up in the
# training set and inflate the in-domain metrics. Rows are matched on the
# exact model input text (the only key visible here), so near-duplicates
# are not caught.
_gc_test_texts = {make_input(row)["input_text"] for row in gc_test}


def _without_test_rows(pool):
    """Drop rows whose model input text also occurs in the GossipCop test set."""
    return pool.filter(
        lambda row: make_input(row)["input_text"] not in _gc_test_texts
    )


train_raw = build_train_set(
    setting=SETTING,
    hr=_without_test_rows(gc_hr),
    hf=_without_test_rows(gc_hf),
    mr=_without_test_rows(gc_mr),
    mf=_without_test_rows(gc_mf),
    mf_ratio=MF_RATIO,
    real_size=2000, fake_size=2000,  # increase later if you want
)

# IMPORTANT:
# - train_ds: NO subclass (so collator won't crash)
# - eval_ds: keep subclass for metrics, but we will remove it right before predict
train_ds = prepare(train_raw, keep_subclass=False)
gc_eval  = prepare(gc_test, keep_subclass=True)
pf_eval  = prepare(pf_test, keep_subclass=True)

# Sanity checks on the tokenized training set before spending GPU time.
print("Train columns:", train_ds.column_names)
print("Train sample:", train_ds[0])
assert "input_ids" in train_ds.column_names and "attention_mask" in train_ds.column_names, "Tokenization failed!"
assert "label" in train_ds.column_names, "Missing label!"

# ============================================================
# 6) Metrics: overall + subclass-wise accuracy
# ============================================================
acc = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for a `(logits, labels)` evaluation pair.

    Predictions are the argmax of the logits over the last axis.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return acc.compute(predictions=predicted, references=references)

def subclass_accuracy(trainer, eval_ds_with_subclass):
    """Compute overall and per-subclass accuracy of `trainer` on an eval set.

    Args:
        trainer: object whose `predict(dataset)` returns an output exposing
            `predictions` (logits) and `label_ids`.
        eval_ds_with_subclass: evaluation dataset that still carries the
            `subclass` column; it is removed before calling `predict`.

    Returns:
        A dict with `overall_acc` and, for each of HR/HF/MR/MF that occurs
        in the data, an `acc_<subclass>` entry.
    """
    # Read the tags first: the column is dropped from what the trainer sees.
    tags = np.asarray(list(eval_ds_with_subclass["subclass"]), dtype=object)
    model_inputs = eval_ds_with_subclass.remove_columns(["subclass"])

    prediction_output = trainer.predict(model_inputs)
    predicted = np.argmax(prediction_output.predictions, axis=-1)
    hits = predicted == prediction_output.label_ids

    report = {"overall_acc": float(hits.mean())}
    for tag in ("HR", "HF", "MR", "MF"):
        selected = tags == tag
        if selected.any():
            report[f"acc_{tag}"] = float(hits[selected].mean())
    return report

# ============================================================
# 7) Train (T4 optimized; Solution-2 compatible)
# ============================================================
# Binary classifier head on top of the pretrained checkpoint (0=real, 1=fake).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

args = TrainingArguments(
    output_dir=f"./runs_{SETTING}_mf{MF_RATIO}",
    learning_rate=3e-5,
    per_device_train_batch_size=32,      # if OOM -> 16
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,       # effective batch ~64
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=25,
    seed=SEED,
    fp16=torch.cuda.is_available(),      # mixed precision only when a GPU is present
    dataloader_num_workers=0,            # stable
    remove_unused_columns=False,         # keep tensor fields
    report_to=["wandb"],                 # enable Weights & Biases
    run_name=f"tsoumi_{SETTING}_mf{MF_RATIO}",  # run name reported to W&B
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=gc_eval.remove_columns(["subclass"]),  # safe
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement
    # (the setup cell installs the latest transformers).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print("\n=== In-domain (GossipCop++) subclass metrics ===")
print(subclass_accuracy(trainer, gc_eval))

print("\n=== Out-of-domain (PolitiFact++) subclass metrics ===")
print(subclass_accuracy(trainer, pf_eval))


In [None]:
import torch
import numpy as np

id2label = {0: "real", 1: "fake"}


def predict_text(text: str, model, tokenizer, max_len=128):
    """Classify a single text as real or fake.

    Args:
        text: the raw text to classify.
        model: callable classifier exposing `eval()`, `device`, and an
            output with a `logits` attribute (first row is used).
        tokenizer: callable returning a mapping of tensors for `text`.
        max_len: truncation length passed to the tokenizer.

    Returns:
        A dict with `prediction` ("real" or "fake") and the softmax
        probabilities `p_real` and `p_fake`.
    """
    model.eval()
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[0].detach().cpu().numpy()

    # Subtract the max before exponentiating: the plain softmax overflows to
    # inf/inf = NaN for large logits, while the shifted form is equivalent
    # and stays finite.
    exp_shifted = np.exp(logits - logits.max())
    probs = exp_shifted / exp_shifted.sum()
    pred = int(np.argmax(probs))

    return {
        "prediction": id2label[pred],
        "p_real": float(probs[0]),
        "p_fake": float(probs[1]),
    }

# Example: classify one hand-written passage with the fine-tuned model.
my_text = "A popular film actor appeared at the official premiere of their new movie on Friday, accompanied by the director and co-stars. During a brief interview, the actor spoke about the challenges of the role and expressed gratitude for the support received from the production team. The event proceeded as scheduled and was widely covered by established entertainment news outlets."
print(predict_text(my_text, trainer.model, tokenizer, max_len=MAX_LEN))
