<a href="https://colab.research.google.com/github/markosnakos/Assignement-NLP-FakeNews/blob/main/FakeNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

🔹 Cell 1 — Install Dependencies

In [None]:
# ============================================================
# 0) Setup
# ============================================================
# NOTE: lines starting with "!" are notebook shell escapes (IPython/Colab),
# not Python statements; they only work when this cell runs in a notebook.
# "-U" upgrades each package to its newest release, "-q" keeps pip quiet.
!pip -q install -U datasets transformers accelerate evaluate

!pip -q install -U wandb
import numpy as np
import random
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate


import wandb
wandb.login()  # prompts for the API key the first time it runs

# NOTE: no eager wandb.init() here on purpose. With report_to=["wandb"] the
# Trainer's W&B integration starts the run itself and names it after
# TrainingArguments.run_name; a run that is already active would be reused
# as-is and the configured run name would never be applied.

# Seed every RNG used directly in this notebook for reproducibility.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Report which accelerator is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# ============================================================
# 1) Load datasets (GossipCop++ + PolitiFact++)
#    Expect splits: HR, HF, MR, MF
# ============================================================
# Dataset identifiers handed to load_dataset() by load_four_splits().
GC_NAME = "Jinyan1/GossipCop"   # used for training and the in-domain test set
PF_NAME = "Jinyan1/PolitiFact"  # used only for the out-of-domain test set

def load_four_splits(ds_name):
    """Load the HR, HF, MR and MF splits of ``ds_name``.

    Returns a 4-tuple of datasets in the fixed order (HR, HF, MR, MF).
    """
    split_order = ("HR", "HF", "MR", "MF")
    return tuple(load_dataset(ds_name, split=split) for split in split_order)

# Load the four splits of both corpora.
gc_hr, gc_hf, gc_mr, gc_mf = load_four_splits(GC_NAME)
pf_hr, pf_hf, pf_mr, pf_mf = load_four_splits(PF_NAME)

# Row counts, printed in HR, HF, MR, MF order.
print("GossipCop sizes:", *(len(split) for split in (gc_hr, gc_hf, gc_mr, gc_mf)))
print("PolitiFact sizes:", *(len(split) for split in (pf_hr, pf_hf, pf_mr, pf_mf)))

# ============================================================
# 2) Add labels + subclass
#    label: 0=real, 1=fake
#    subclass: HR/HF/MR/MF
# ============================================================
def add_labels(ds, subclass):
    """Return ``ds`` with two extra constant columns.

    - ``label``: 1 when ``subclass`` is "HF" or "MF" (fake), otherwise 0 (real)
    - ``subclass``: the ``subclass`` tag itself, repeated for every row
    """
    row_count = len(ds)
    label_value = int(subclass in ("HF", "MF"))
    labelled = ds.add_column("label", [label_value for _ in range(row_count)])
    return labelled.add_column("subclass", [subclass for _ in range(row_count)])

# Tag every split with its binary label and its subclass name.
_SUBCLASS_TAGS = ("HR", "HF", "MR", "MF")

gc_hr, gc_hf, gc_mr, gc_mf = (
    add_labels(split, tag)
    for split, tag in zip((gc_hr, gc_hf, gc_mr, gc_mf), _SUBCLASS_TAGS)
)

pf_hr, pf_hf, pf_mr, pf_mf = (
    add_labels(split, tag)
    for split, tag in zip((pf_hr, pf_hf, pf_mr, pf_mf), _SUBCLASS_TAGS)
)

# ============================================================
# 3) Utilities: sampling + mixtures (paper-style settings)
# ============================================================
def sample_n(ds, n, seed=SEED):
    """Return up to ``n`` rows of ``ds``, drawn by a seeded shuffle.

    When ``ds`` holds fewer than ``n`` rows, every row is returned.
    """
    take = min(n, len(ds))
    shuffled = ds.shuffle(seed=seed)
    return shuffled.select(range(take))

def build_train_set(setting, hr, hf, mr, mf, mf_ratio=0.0,
                    real_size=2000, fake_size=2000, seed=SEED):
    """
    Build a shuffled training set of real and fake examples.

    setting:
      - "human_legacy": real=HR
      - "transitional": real=HR+MR (simple approximation)
      - "machine_dominance": real=MR
    mf_ratio: fraction of fake examples that are MF (rest HF); must be in [0, 1]
    real_size / fake_size: requested number of real / fake rows. sample_n caps
      each draw at the size of its pool, so fewer rows may be returned.
    seed: base seed; real, HF and MF draws use seed, seed + 1 and seed + 2.

    Raises ValueError for an unknown setting or an mf_ratio outside [0, 1].
    """
    if setting == "human_legacy":
        real_pool = hr
    elif setting == "transitional":
        real_pool = concatenate_datasets([hr, mr])
    elif setting == "machine_dominance":
        real_pool = mr
    else:
        raise ValueError("setting must be: human_legacy, transitional, machine_dominance")

    # Reject ratios that would yield negative or oversized sub-samples
    # (NaN fails this comparison too).
    if not 0.0 <= mf_ratio <= 1.0:
        raise ValueError(f"mf_ratio must be within [0, 1], got {mf_ratio!r}")

    real_train = sample_n(real_pool, real_size, seed=seed)

    # round() rather than int(): float products such as 100 * 0.29 evaluate to
    # 28.999... and truncation would silently drop one MF example.
    mf_n = round(fake_size * mf_ratio)
    hf_n = fake_size - mf_n
    fake_train = concatenate_datasets([
        sample_n(hf, hf_n, seed=seed + 1),
        sample_n(mf, mf_n, seed=seed + 2),
    ])

    return concatenate_datasets([real_train, fake_train]).shuffle(seed=seed)

def build_test_set(hr, hf, mr, mf, test_size_each=500, seed=SEED):
    """Build a shuffled test set with up to ``test_size_each`` rows per subclass.

    HR, HF, MR and MF are sampled with seeds seed + 10 .. seed + 13
    respectively; the concatenation is then shuffled with ``seed``.
    """
    sampled_parts = [
        sample_n(pool, test_size_each, seed=seed + offset)
        for offset, pool in enumerate((hr, hf, mr, mf), start=10)
    ]
    combined = concatenate_datasets(sampled_parts)
    return combined.shuffle(seed=seed)

GC_TEST_SIZE_EACH = 500
PF_TEST_SIZE_EACH = 200

gc_test = build_test_set(gc_hr, gc_hf, gc_mr, gc_mf, test_size_each=GC_TEST_SIZE_EACH)
pf_test = build_test_set(pf_hr, pf_hf, pf_mr, pf_mf, test_size_each=PF_TEST_SIZE_EACH)


def _without_test_rows(ds, n, seed):
    """Return the rows of ``ds`` that sample_n(ds, n, seed) does NOT pick.

    sample_n keeps the first ``n`` rows of ``ds.shuffle(seed=seed)``; repeating
    the same seeded shuffle and keeping the tail gives the complement.
    """
    n = min(n, len(ds))
    return ds.shuffle(seed=seed).select(range(n, len(ds)))


# Prevent train/test leakage: the training set is sampled from these same
# GossipCop pools, so the rows already placed in gc_test are removed from them
# before any training data is drawn. The seed offsets (+10 .. +13) must mirror
# the ones build_test_set uses for HR, HF, MR and MF.
# PolitiFact pools are left untouched because they are only used for testing.
gc_hr = _without_test_rows(gc_hr, GC_TEST_SIZE_EACH, SEED + 10)
gc_hf = _without_test_rows(gc_hf, GC_TEST_SIZE_EACH, SEED + 11)
gc_mr = _without_test_rows(gc_mr, GC_TEST_SIZE_EACH, SEED + 12)
gc_mf = _without_test_rows(gc_mf, GC_TEST_SIZE_EACH, SEED + 13)

print("GC test:", len(gc_test), "PF test:", len(pf_test))

# ============================================================
# 4) Tokenization (T4 fast + robust)
# ============================================================
# Checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "roberta-base"      # faster alt: "distilroberta-base"
# Maximum number of tokens kept per example (inputs are truncated to this).
MAX_LEN = 128                    # speedup vs 256

# The collator is built from the same tokenizer that encodes the text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
data_collator = DataCollatorWithPadding(tokenizer)

def make_input(example):
    """Pick the text that will be tokenized for one example.

    Uses "title" + " " + "description" (stripped). If that is shorter than
    5 characters, falls back to "text". If the result is still blank, a single
    space is used. Missing or None fields count as empty strings.

    Returns a dict with the single key "input_text".
    """
    headline = example.get("title") or ""
    summary = example.get("description") or ""
    body = example.get("text") or ""

    candidate = " ".join((headline, summary)).strip()
    if len(candidate) < 5:
        candidate = body
    if not str(candidate).strip():
        candidate = " "
    return {"input_text": candidate}

def tokenize(batch):
    """Tokenize the "input_text" column of ``batch``, truncating to MAX_LEN tokens."""
    texts = batch["input_text"]
    return tokenizer(texts, max_length=MAX_LEN, truncation=True)

def prepare(ds, keep_subclass=False):
    """Turn a raw dataset into model-ready columns.

    Steps: derive "input_text" per row, tokenize it in batches (dropping
    "input_text" afterwards), then keep only "input_ids", "attention_mask",
    "label" and, when ``keep_subclass`` is true, "subclass". Columns that are
    not present are skipped.
    """
    with_text = ds.map(make_input)
    tokenized = with_text.map(tokenize, batched=True, remove_columns=["input_text"])

    wanted = ["input_ids", "attention_mask", "label"]
    if keep_subclass:
        wanted = wanted + ["subclass"]

    available = tokenized.column_names
    return tokenized.select_columns([name for name in wanted if name in available])

# ============================================================
# 5) Build experiment datasets
# ============================================================
SETTING = "human_legacy"   # "transitional" / "machine_dominance"
MF_RATIO = 0.50            # 0.0 / 0.33 / 0.5 / 0.67 / 1.0

# Training data is drawn from the GossipCop pools only.
train_raw = build_train_set(
    SETTING,
    gc_hr, gc_hf, gc_mr, gc_mf,
    mf_ratio=MF_RATIO,
    real_size=2000,
    fake_size=2000,  # increase later if you want
)

# The training set drops "subclass" so the collator only sees model inputs.
# The evaluation sets keep it for per-subclass metrics; it is removed again
# right before prediction.
train_ds = prepare(train_raw)
gc_eval, pf_eval = (prepare(split, keep_subclass=True) for split in (gc_test, pf_test))

print("Train columns:", train_ds.column_names)
print("Train sample:", train_ds[0])

# Sanity checks on the tokenized training set.
assert {"input_ids", "attention_mask"} <= set(train_ds.column_names), "Tokenization failed!"
assert "label" in train_ds.column_names, "Missing label!"

# ============================================================
# 6) Metrics: overall + subclass-wise accuracy
# ============================================================
acc = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the ``acc`` accuracy metric.

    ``eval_pred`` unpacks into (logits, labels); the predicted class is the
    arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return acc.compute(predictions=predicted, references=references)

def subclass_accuracy(trainer, eval_ds_with_subclass):
    """Compute overall and per-subclass accuracy for ``trainer`` on a dataset.

    The "subclass" column is read first and then removed before calling
    ``trainer.predict``. Returns a dict with "overall_acc" plus one
    "acc_<SUBCLASS>" entry for each of HR, HF, MR, MF that has at least one
    row in the dataset.
    """
    # dtype=object keeps the comparison below element-wise even when empty.
    tags = np.array(list(eval_ds_with_subclass["subclass"]), dtype=object)
    features_only = eval_ds_with_subclass.remove_columns(["subclass"])

    prediction_output = trainer.predict(features_only)
    predicted = np.argmax(prediction_output.predictions, axis=-1)
    is_correct = predicted == prediction_output.label_ids

    results = {"overall_acc": float(is_correct.mean())}
    for sc in ("HR", "HF", "MR", "MF"):
        in_group = tags == sc
        if in_group.any():
            results[f"acc_{sc}"] = float(is_correct[in_group].mean())
    return results

# ============================================================
# 7) Train (T4 optimized; Solution-2 compatible)
# ============================================================
import inspect

# Binary classifier (0=real, 1=fake) on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

args = TrainingArguments(
    output_dir=f"./runs_{SETTING}_mf{MF_RATIO}",
    learning_rate=3e-5,
    per_device_train_batch_size=32,      # if OOM -> 16
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,       # effective batch = 32 * 2 = 64 per device
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=25,
    seed=SEED,
    fp16=torch.cuda.is_available(),      # mixed precision needs a GPU; stay in fp32 on CPU
    dataloader_num_workers=0,            # load batches in the main process
    remove_unused_columns=False,         # datasets already hold only the columns we want
    report_to=["wandb"],                 # enable Weights & Biases
    # Name of the W&B run. NOTE: only applied when the Trainer starts the run
    # itself, i.e. when no W&B run is already active at training time.
    run_name=f"tsoumi_{SETTING}_mf{MF_RATIO}",
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    # "subclass" is a string column, so it is dropped before reaching the collator.
    eval_dataset=gc_eval.remove_columns(["subclass"]),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

print("\n=== In-domain (GossipCop++) subclass metrics ===")
print(subclass_accuracy(trainer, gc_eval))

print("\n=== Out-of-domain (PolitiFact++) subclass metrics ===")
print(subclass_accuracy(trainer, pf_eval))


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/512.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m512.0/512.3 kB[0m [31m31.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25h

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33melefniko[0m ([33melefniko-aristotle-universoty-of-thessaloniki[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Device: cuda
GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/817 [00:00<?, ?B/s]

data/MF-00000-of-00001-2d256f82f8c8e2dd.(…):   0%|          | 0.00/3.83M [00:00<?, ?B/s]

data/HF-00000-of-00001-b7ad0013efd98ff4.(…):   0%|          | 0.00/7.39M [00:00<?, ?B/s]

data/MR-00000-of-00001-c9324d9fd00efb16.(…):   0%|          | 0.00/6.89M [00:00<?, ?B/s]

data/HR-00000-of-00001-043a35ac2a425b62.(…):   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Generating MF split:   0%|          | 0/4084 [00:00<?, ? examples/s]

Generating HF split:   0%|          | 0/4084 [00:00<?, ? examples/s]

Generating MR split:   0%|          | 0/4169 [00:00<?, ? examples/s]

Generating HR split:   0%|          | 0/8168 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/804 [00:00<?, ?B/s]

data/MF-00000-of-00001-76b8ff6de79a2e48.(…):   0%|          | 0.00/113k [00:00<?, ?B/s]

data/HF-00000-of-00001-3310dd7e7f985d8b.(…):   0%|          | 0.00/176k [00:00<?, ?B/s]

data/MR-00000-of-00001-13e60e455a122412.(…):   0%|          | 0.00/388k [00:00<?, ?B/s]

data/HR-00000-of-00001-62355cc59d0eaf41.(…):   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Generating MF split:   0%|          | 0/97 [00:00<?, ? examples/s]

Generating HF split:   0%|          | 0/97 [00:00<?, ? examples/s]

Generating MR split:   0%|          | 0/132 [00:00<?, ? examples/s]

Generating HR split:   0%|          | 0/194 [00:00<?, ? examples/s]

GossipCop sizes: 8168 4084 4169 4084
PolitiFact sizes: 194 97 132 97
GC test: 2000 PF test: 520


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Train columns: ['input_ids', 'attention_mask', 'label']
Train sample: {'input_ids': [0, 104, 1113, 102, 226, 15322, 840, 12061, 1405, 3928, 286, 128, 487, 3340, 5846, 7989, 572, 108, 359, 85, 2780, 1405, 3676, 37123, 1405, 7278, 20333, 2011, 9154, 3224, 2306, 5, 768, 9, 69, 291, 12, 180, 756, 6, 24497, 102, 226, 15322, 34, 11229, 2864, 7, 310, 171, 430, 4502, 4, 3507, 150, 79, 1059, 41, 8340, 13, 3437, 359, 12610, 8, 10, 9212, 13, 35037, 41969, 6, 79, 18, 393, 56, 7, 109, 932, 1341, 101, 99, 69, 8946, 822, 1174, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}


Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
25,0.6565
50,0.5163
75,0.4464
100,0.3893
125,0.382
150,0.2756
175,0.2828
200,0.2378
225,0.1937
250,0.2224



=== In-domain (GossipCop++) subclass metrics ===


{'overall_acc': 0.8405, 'acc_HR': 0.832, 'acc_HF': 0.838, 'acc_MR': 0.758, 'acc_MF': 0.934}

=== Out-of-domain (PolitiFact++) subclass metrics ===


{'overall_acc': 0.5634615384615385, 'acc_HR': 0.5670103092783505, 'acc_HF': 0.5979381443298969, 'acc_MR': 0.38636363636363635, 'acc_MF': 0.7628865979381443}


In [None]:
import torch
import numpy as np

# Maps the classifier's output index to a human-readable class name.
id2label = {0: "real", 1: "fake"}

def predict_text(text: str, model, tokenizer, max_len=128):
    """Classify a single text as real or fake.

    Args:
        text: raw text to classify.
        model: sequence-classification model; called with the tokenizer's
            output and expected to expose ``.logits`` with two classes.
        tokenizer: callable that encodes ``text`` into PyTorch tensors.
        max_len: maximum number of tokens; longer inputs are truncated.

    Returns:
        dict with "prediction" ("real" or "fake") and the class
        probabilities "p_real" and "p_fake".

    Note: switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[0].detach().cpu().numpy()

    # Numerically stable softmax: subtracting the max keeps np.exp from
    # overflowing to inf (which would turn the probabilities into NaN).
    exp_shifted = np.exp(logits - logits.max())
    probs = exp_shifted / exp_shifted.sum()
    pred = int(np.argmax(probs))

    return {
        "prediction": id2label[pred],
        "p_real": float(probs[0]),
        "p_fake": float(probs[1]),
    }

# Example: classify one hand-written passage with the fine-tuned model.
my_text = "A popular film actor appeared at the official premiere of their new movie on Friday, accompanied by the director and co-stars. During a brief interview, the actor spoke about the challenges of the role and expressed gratitude for the support received from the production team. The event proceeded as scheduled and was widely covered by established entertainment news outlets."
print(predict_text(my_text, trainer.model, tokenizer, max_len=MAX_LEN))


{'prediction': 'real', 'p_real': 0.9483462572097778, 'p_fake': 0.05165378376841545}
