In [1]:
%pip install datasets transformers torch scikit-learn pandas accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import torch

from pipeline import load_kaggle, load_gossipcop, load_politifact
from datasets import Dataset 
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from pipeline import load_kaggle, clean_dataset
from sklearn.model_selection import train_test_split

df = load_kaggle()
df = clean_dataset(df)
df["text"] = (df["title"] + " " + df["text"]).str.strip()
df = df[["text", "label"]]
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

df_train.head()

print("\n" + "="*80)
print("LOADING ADDITIONAL DATASETS FOR GENERALIZATION TESTING")
print("="*80 + "\n")

# Load PolitiFact
df_politifact = load_politifact()
if df_politifact is not None:
    df_politifact = clean_dataset(df_politifact)
    df_politifact["text"] = (df_politifact["title"] + " " + df_politifact["text"]).str.strip()
    df_politifact = df_politifact[["text", "label"]]
    print(f"PolitiFact loaded: {len(df_politifact)} articles")
else:
    print("‚ö†Ô∏è  Skipping PolitiFact (not available)")

# Load GossipCop
df_gossipcop = load_gossipcop()
if df_gossipcop is not None:
    df_gossipcop = clean_dataset(df_gossipcop)
    df_gossipcop["text"] = (df_gossipcop["title"] + " " + df_gossipcop["text"]).str.strip()
    df_gossipcop = df_gossipcop[["text", "label"]]
    print(f"GossipCop loaded: {len(df_gossipcop)} articles")
else:
    print("‚ö†Ô∏è  Skipping GossipCop (not available)")

# Show dataset sizes
print(f"\nDataset Summary:")
print(f"   Kaggle train: {len(df_train)} articles")
print(f"   Kaggle test: {len(df_test)} articles")
if df_politifact is not None:
    print(f"   PolitiFact: {len(df_politifact)} articles")
if df_gossipcop is not None:
    print(f"   GossipCop: {len(df_gossipcop)} articles")



LOADING ADDITIONAL DATASETS FOR GENERALIZATION TESTING

Loading PolitiFact from: data_files/processed/politifact_combined.csv
PolitiFact loaded: 624 articles
Loading GossipCop from: data_files/processed/gossipcop_combined.csv
GossipCop loaded: 14549 articles

Dataset Summary:
   Kaggle train: 30915 articles
   Kaggle test: 7729 articles
   PolitiFact: 624 articles
   GossipCop: 14549 articles


In [4]:
print("=== LOADING ADDITIONAL DATASETS FOR GENERALIZATION ===\n")

# Load PolitiFact
df_politifact = load_politifact()
df_politifact = clean_dataset(df_politifact)
df_politifact["text"] = (df_politifact["title"] + " " + df_politifact["text"]).str.strip()
df_politifact = df_politifact[["text", "label"]]

# Load GossipCop
df_gossipcop = load_gossipcop()
df_gossipcop = clean_dataset(df_gossipcop)
df_gossipcop["text"] = (df_gossipcop["title"] + " " + df_gossipcop["text"]).str.strip()
df_gossipcop = df_gossipcop[["text", "label"]]

# Show dataset sizes
print(f"Kaggle train size: {len(df_train)}")
print(f"Kaggle test size: {len(df_test)}")
print(f"PolitiFact size: {len(df_politifact)}")
print(f"GossipCop size: {len(df_gossipcop)}")

# Show class distributions
print("\n=== CLASS DISTRIBUTIONS ===")
print(f"Kaggle test - Real: {(df_test['label']==1).sum()}, Fake: {(df_test['label']==0).sum()}")
print(f"PolitiFact - Real: {(df_politifact['label']==1).sum()}, Fake: {(df_politifact['label']==0).sum()}")
print(f"GossipCop - Real: {(df_gossipcop['label']==1).sum()}, Fake: {(df_gossipcop['label']==0).sum()}")

=== LOADING ADDITIONAL DATASETS FOR GENERALIZATION ===

Loading PolitiFact from: data_files/processed/politifact_combined.csv
Loading GossipCop from: data_files/processed/gossipcop_combined.csv
Kaggle train size: 30915
Kaggle test size: 7729
PolitiFact size: 624
GossipCop size: 14549

=== CLASS DISTRIBUTIONS ===
Kaggle test - Real: 4238, Fake: 3491
PolitiFact - Real: 308, Fake: 316
GossipCop - Real: 11159, Fake: 3390


In [5]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=False,
        max_length=256,
    )

train_ds = Dataset.from_pandas(df_train.reset_index(drop=True))
test_ds  = Dataset.from_pandas(df_test.reset_index(drop=True))

tokenized_train = train_ds.map(
    tokenize,
    batched=True,
    remove_columns=['text']
)

tokenized_test = test_ds.map(
    tokenize,
    batched=True,
    remove_columns=['text']
)

print("Tokenized train features:", tokenized_train.features)

print("\n" + "="*80)
print("TOKENIZING GENERALIZATION DATASETS")
print("="*80 + "\n")

# Tokenize PolitiFact
if df_politifact is not None:
    politifact_ds = Dataset.from_pandas(df_politifact.reset_index(drop=True))
    tokenized_politifact = politifact_ds.map(
        tokenize,
        batched=True,
        remove_columns=['text']
    )
    print(" PolitiFact tokenized")
else:
    tokenized_politifact = None

# Tokenize GossipCop
if df_gossipcop is not None:
    gossipcop_ds = Dataset.from_pandas(df_gossipcop.reset_index(drop=True))
    tokenized_gossipcop = gossipcop_ds.map(
        tokenize,
        batched=True,
        remove_columns=['text']
    )
    print(" GossipCop tokenized")
else:
    tokenized_gossipcop = None

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30915/30915 [00:06<00:00, 4506.45 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7729/7729 [00:01<00:00, 4210.04 examples/s]


Tokenized train features: {'label': Value('int64'), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8'))}

TOKENIZING GENERALIZATION DATASETS



Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 624/624 [00:00<00:00, 982.35 examples/s]


 PolitiFact tokenized


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14549/14549 [00:05<00:00, 2625.18 examples/s]

 GossipCop tokenized





In [6]:
print("=== TOKENIZING GENERALIZATION DATASETS ===\n")

politifact_ds = Dataset.from_pandas(df_politifact.reset_index(drop=True))
gossipcop_ds = Dataset.from_pandas(df_gossipcop.reset_index(drop=True))

# Tokenize
tokenized_politifact = politifact_ds.map(
    tokenize,
    batched=True,
    remove_columns=['text']
)

tokenized_gossipcop = gossipcop_ds.map(
    tokenize,
    batched=True,
    remove_columns=['text']
)

=== TOKENIZING GENERALIZATION DATASETS ===



Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 624/624 [00:00<00:00, 1071.09 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14549/14549 [00:05<00:00, 2757.60 examples/s]


In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),
    }

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0046,0.009773,0.998447,0.998432
2,0.0,0.005023,0.999094,0.999086




TrainOutput(global_step=3866, training_loss=0.00885973997528299, metrics={'train_runtime': 4749.5372, 'train_samples_per_second': 13.018, 'train_steps_per_second': 0.814, 'total_flos': 4095229629450240.0, 'train_loss': 0.00885973997528299, 'epoch': 2.0})

In [9]:
metrics = trainer.evaluate()
metrics



{'eval_loss': 0.005022841971367598,
 'eval_accuracy': 0.9990943200931557,
 'eval_f1': 0.9990858953412933,
 'eval_runtime': 189.4435,
 'eval_samples_per_second': 40.798,
 'eval_steps_per_second': 1.277,
 'epoch': 2.0}

In [10]:
preds = trainer.predict(tokenized_test)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)
errors = df_test.copy()
errors["pred"] = y_pred
errors["correct"] = errors["label"] == errors["pred"]

print(f"Total errors: {(~errors['correct']).sum()} out of {len(errors)}")
errors[~errors["correct"]].head(10)



Total errors: 7 out of 7729


Unnamed: 0,text,label,pred,correct
30754,house freedom caucus pressured by very familia...,0,1,False
14842,a difficult life as refugee arrivals to greece...,1,0,False
25043,turkeys parliament approves budget istanbulreu...,1,0,False
9639,the adoration of kim jong un pyongyang north k...,1,0,False
37044,run or wait tokyos koike faces dilemma ahead o...,1,0,False
24772,before debate protesters build wall of taco tr...,1,0,False
19433,factbox the race to the yous presidential nomi...,1,0,False


In [11]:
print("\n" + "="*80)
print("CROSS-DATASET GENERALIZATION TESTING")
print("="*80 + "\n")

# Test on Kaggle (in-domain)
print("KAGGLE TEST SET (In-Domain):")
kaggle_results = trainer.evaluate(tokenized_test)
print(f"   Accuracy: {kaggle_results['eval_accuracy']:.4f}")
print(f"   F1 Score: {kaggle_results['eval_f1']:.4f}")

# Test on PolitiFact (out-of-domain)
print("\nPOLITIFACT (Out-of-Domain - Politics):")
politifact_results = trainer.evaluate(tokenized_politifact)
print(f"   Accuracy: {politifact_results['eval_accuracy']:.4f}")
print(f"   F1 Score: {politifact_results['eval_f1']:.4f}")

# Test on GossipCop (out-of-domain)
print("\nGOSSIPCOP (Out-of-Domain - Entertainment):")
gossipcop_results = trainer.evaluate(tokenized_gossipcop)
print(f"   Accuracy: {gossipcop_results['eval_accuracy']:.4f}")
print(f"   F1 Score: {gossipcop_results['eval_f1']:.4f}")

# Summary comparison
print("\n" + "="*80)
print("SUMMARY: Does DistilBERT Generalize Better Than Classical ML?")
print("="*80)

comparison = pd.DataFrame({
    'Dataset': ['Kaggle', 'PolitiFact', 'GossipCop'],
    'DistilBERT': [
        f"{kaggle_results['eval_accuracy']:.2%}",
        f"{politifact_results['eval_accuracy']:.2%}",
        f"{gossipcop_results['eval_accuracy']:.2%}"
    ],
    'Logistic Reg (Baseline)': ['99.12%', '54.81%', '25.04%'],
    'SVM (Baseline)': ['99.72%', '53.04%', '26.83%']
})

print(comparison.to_string(index=False))


CROSS-DATASET GENERALIZATION TESTING

KAGGLE TEST SET (In-Domain):




   Accuracy: 0.9991
   F1 Score: 0.9991

POLITIFACT (Out-of-Domain - Politics):




   Accuracy: 0.5288
   F1 Score: 0.4502

GOSSIPCOP (Out-of-Domain - Entertainment):




   Accuracy: 0.2467
   F1 Score: 0.2119

SUMMARY: Does DistilBERT Generalize Better Than Classical ML?
   Dataset DistilBERT Logistic Reg (Baseline) SVM (Baseline)
    Kaggle     99.91%                  99.12%         99.72%
PolitiFact     52.88%                  54.81%         53.04%
 GossipCop     24.67%                  25.04%         26.83%


In [12]:
# checking class imbalance
print("\n" + "="*80)
print("CHECKING CLASS IMBALANCE")
print("="*80 + "\n")

print("Kaggle Test:")
print(f"  Fake (0): {(df_test['label']==0).sum()}")
print(f"  Real (1): {(df_test['label']==1).sum()}")

print("\nPolitiFact:")
print(f"  Fake (0): {(df_politifact['label']==0).sum()}")
print(f"  Real (1): {(df_politifact['label']==1).sum()}")

print("\nGossipCop:")
print(f"  Fake (0): {(df_gossipcop['label']==0).sum()}")
print(f"  Real (1): {(df_gossipcop['label']==1).sum()}")


CHECKING CLASS IMBALANCE

Kaggle Test:
  Fake (0): 3491
  Real (1): 4238

PolitiFact:
  Fake (0): 316
  Real (1): 308

GossipCop:
  Fake (0): 3390
  Real (1): 11159


In [13]:
# mixed-dataset training 

print("\n" + "="*80)
print("TRAINING ON MIXED DATASET")
print("="*80 + "\n")

# take samples from each dataset
kaggle_sample = df_train.sample(n=20000, random_state=42)
politifact_sample = df_politifact.sample(frac=0.5, random_state=42)
gossipcop_sample = df_gossipcop.sample(frac=0.5, random_state=42)

# combine them
df_mixed = pd.concat([kaggle_sample, politifact_sample, gossipcop_sample])
df_mixed = df_mixed.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Mixed training set: {len(df_mixed)} articles")
print(f"  Kaggle: {len(kaggle_sample)}")
print(f"  PolitiFact: {len(politifact_sample)}")
print(f"  GossipCop: {len(gossipcop_sample)}")

# convert to dataset and tokenize
mixed_ds = Dataset.from_pandas(df_mixed)
tokenized_mixed = mixed_ds.map(tokenize, batched=True, remove_columns=['text'])

print("\nüöÄ Training new model on mixed data...")

# create a new model
model_mixed = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

# create a new trainer
trainer_mixed = Trainer(
    model=model_mixed,
    args=training_args,
    train_dataset=tokenized_mixed,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_mixed.train()


TRAINING ON MIXED DATASET

Mixed training set: 27586 articles
  Kaggle: 20000
  PolitiFact: 312
  GossipCop: 7274


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27586/27586 [00:08<00:00, 3148.17 examples/s]



üöÄ Training new model on mixed data...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_mixed = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1034,0.007373,0.998318,0.998302
2,0.0628,0.00735,0.998577,0.998563




TrainOutput(global_step=3450, training_loss=0.12486363535342009, metrics={'train_runtime': 11391.816, 'train_samples_per_second': 4.843, 'train_steps_per_second': 0.303, 'total_flos': 3654245659324416.0, 'train_loss': 0.12486363535342009, 'epoch': 2.0})

In [14]:
# testing mixed model 
print("\n" + "="*80)
print("MIXED MODEL RESULTS")
print("="*80 + "\n")

# test on all three datasets
kaggle_mixed = trainer_mixed.evaluate(tokenized_test)
politifact_mixed = trainer_mixed.evaluate(tokenized_politifact)
gossipcop_mixed = trainer_mixed.evaluate(tokenized_gossipcop)

print(f"Kaggle:     {kaggle_mixed['eval_accuracy']*100:.2f}%")
print(f"PolitiFact: {politifact_mixed['eval_accuracy']*100:.2f}%")
print(f"GossipCop:  {gossipcop_mixed['eval_accuracy']*100:.2f}%")

# show comparison
print("\n" + "="*80)
print("BEFORE vs AFTER")
print("="*80 + "\n")

comparison = pd.DataFrame({
    'Dataset': ['Kaggle', 'PolitiFact', 'GossipCop'],
    'Original Model': ['99.91%', '52.88%', '24.67%'],
    'Mixed Model': [
        f"{kaggle_mixed['eval_accuracy']*100:.2f}%",
        f"{politifact_mixed['eval_accuracy']*100:.2f}%",
        f"{gossipcop_mixed['eval_accuracy']*100:.2f}%"
    ],
    'Classical ML Best': ['99.72%', '57.69%', '26.83%']
})

print(comparison.to_string(index=False))

# calculate improvements
politifact_improvement = (politifact_mixed['eval_accuracy'] - 0.5288) * 100
gossipcop_improvement = (gossipcop_mixed['eval_accuracy'] - 0.2467) * 100

print(f"\n Improvement from mixed training:")
print(f"   PolitiFact: {politifact_improvement:+.2f} percentage points")
print(f"   GossipCop:  {gossipcop_improvement:+.2f} percentage points")


MIXED MODEL RESULTS







Kaggle:     99.86%
PolitiFact: 79.01%
GossipCop:  88.61%

BEFORE vs AFTER

   Dataset Original Model Mixed Model Classical ML Best
    Kaggle         99.91%      99.86%            99.72%
PolitiFact         52.88%      79.01%            57.69%
 GossipCop         24.67%      88.61%            26.83%

 Improvement from mixed training:
   PolitiFact: +26.13 percentage points
   GossipCop:  +63.94 percentage points
