## Mixed Dataset Training: 90% Kaggle + 10% PolitiFact

In [None]:
#import libraries

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import importlib
import pipeline
importlib.reload(pipeline)
from pipeline import load_kaggle, load_politifact, load_gossipcop, clean_dataset, train_logreg, train_svm, train_nb, eval_on_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Data


In [12]:
# Load and clean the Kaggle corpus, then derive the "combined" column:
# title, the literal separator " [TITLE] ", then the article text.
# Missing titles/texts are replaced by empty strings before concatenation.
kaggle_df = clean_dataset(load_kaggle()).assign(
    combined=lambda frame: (
        frame["title"].fillna("") + " [TITLE] " + frame["text"].fillna("")
    )
)
print(f"Kaggle: {len(kaggle_df)} samples")


Kaggle: 38644 samples


In [13]:
# Load and clean the PolitiFact corpus, then derive the "combined" column:
# title, the literal separator " [TITLE] ", then the article text.
# Missing titles/texts are replaced by empty strings before concatenation.
politifact_df = clean_dataset(load_politifact()).assign(
    combined=lambda frame: (
        frame["title"].fillna("") + " [TITLE] " + frame["text"].fillna("")
    )
)
print(f"PolitiFact: {len(politifact_df)} samples")


Loading PolitiFact from: data_files/processed/politifact_combined.csv
PolitiFact: 624 samples


In [14]:
# Load and clean the GossipCop corpus, then derive the "combined" column:
# title, the literal separator " [TITLE] ", then the article text.
# Missing titles/texts are replaced by empty strings before concatenation.
gossipcop_df = clean_dataset(load_gossipcop()).assign(
    combined=lambda frame: (
        frame["title"].fillna("") + " [TITLE] " + frame["text"].fillna("")
    )
)
print(f"GossipCop: {len(gossipcop_df)} samples")


Loading GossipCop from: data_files/processed/gossipcop_combined.csv
GossipCop: 14549 samples


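## Create 90% Kaggle + 10% PolitiFact Training Split

The 90/10 figure describes the *composition of the training set*: a stratified 10% of PolitiFact is used for training and is combined with nine times as many randomly sampled Kaggle rows. The remaining 90% of PolitiFact is held out as the PolitiFact test set; GossipCop is used for testing only.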


In [15]:
# Build the mixed training set and the two evaluation sets.
#
# Two independent ratios are involved and both happen to involve "90%":
#   * PF_TEST_FRACTION       - share of PolitiFact held out for evaluation.
#   * KAGGLE_ROWS_PER_PF_ROW - Kaggle rows added per PolitiFact training row;
#                              9:1 gives the 90% Kaggle / 10% PolitiFact mix.
PF_TEST_FRACTION = 0.9
KAGGLE_ROWS_PER_PF_ROW = 9
SPLIT_SEED = 42  # shared seed for the PolitiFact split and the Kaggle shuffle

# Stratified split so both PolitiFact parts keep the label proportions.
pf_train, pf_test = train_test_split(
    politifact_df,
    test_size=PF_TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=politifact_df["label"],
)
assert len(pf_train) + len(pf_test) == len(politifact_df), (
    "PolitiFact train/test parts do not add up to the full dataset"
)

pf_train_size = len(pf_train)
kaggle_train_size = pf_train_size * KAGGLE_ROWS_PER_PF_ROW

# head() would silently return fewer rows than requested, which would break
# the intended mix without any visible error, so fail loudly instead.
assert kaggle_train_size <= len(kaggle_df), (
    f"Need {kaggle_train_size} Kaggle rows for the requested mix, "
    f"but only {len(kaggle_df)} are available"
)

# Plain random sample of Kaggle rows (shuffle, then take the first N);
# unlike the PolitiFact split this is not stratified by label.
kaggle_shuffled = kaggle_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
kaggle_train = kaggle_shuffled.head(kaggle_train_size)

mixed_train = pd.concat([kaggle_train, pf_train], ignore_index=True)
assert len(mixed_train) == kaggle_train_size + pf_train_size, (
    "Mixed training set has an unexpected number of rows"
)

X_mixed_train = mixed_train["combined"]
y_mixed_train = mixed_train["label"]

# Held-out PolitiFact rows (never seen during training).
X_pf_test = pf_test["combined"]
y_pf_test = pf_test["label"]

# GossipCop is used in full as an evaluation set.
X_gc_test = gossipcop_df["combined"]
y_gc_test = gossipcop_df["label"]

print(f"Mixed train: {len(mixed_train)} samples")
print(f"PolitiFact test: {len(X_pf_test)} samples")
print(f"GossipCop test: {len(X_gc_test)} samples")


Mixed train: 620 samples
PolitiFact test: 562 samples
GossipCop test: 14549 samples


## Train Models


### Logistic Regression


In [16]:
# Fit logistic regression on the mixed training set; train_logreg hands back
# the fitted model together with the vectorizer it was trained with.
print("=== Training Logistic Regression on Mixed Dataset ===")
(
    lr_mixed_model,
    lr_mixed_vectorizer,
) = train_logreg(X_mixed_train, y_mixed_train)
print("LR trained")


=== Training Logistic Regression on Mixed Dataset ===
LR trained


### SVM


In [17]:
# Fit the SVM on the mixed training set; train_svm hands back the fitted
# model together with the vectorizer it was trained with.
print("=== Training SVM on Mixed Dataset ===")
(
    svm_mixed_model,
    svm_mixed_vectorizer,
) = train_svm(X_mixed_train, y_mixed_train)
print("SVM trained")


=== Training SVM on Mixed Dataset ===
SVM trained


### Naive Bayes


In [18]:
# Fit Naive Bayes on the mixed training set; train_nb hands back the fitted
# model together with the vectorizer it was trained with.
print("=== Training Naive Bayes on Mixed Dataset ===")
(
    nb_mixed_model,
    nb_mixed_vectorizer,
) = train_nb(X_mixed_train, y_mixed_train)
print("NB trained")


=== Training Naive Bayes on Mixed Dataset ===
NB trained


## Load Baseline Models


In [19]:
# Load the previously saved baseline (model, vectorizer) pairs from the
# "joblist" directory, one pair per classifier family.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files
# that were produced by a trusted run of this project.
lr_kaggle_model, lr_kaggle_vectorizer = (
    joblib.load("joblist/logreg_model.pkl"),
    joblib.load("joblist/logreg_vectorizer_kaggle.pkl"),
)

svm_kaggle_model, svm_kaggle_vectorizer = (
    joblib.load("joblist/svm_model.pkl"),
    joblib.load("joblist/svm_vectorizer_kaggle.pkl"),
)

nb_kaggle_model, nb_kaggle_vectorizer = (
    joblib.load("joblist/nb_model.pkl"),
    joblib.load("joblist/nb_vectorizer_kaggle.pkl"),
)


## Evaluate on PolitiFact


In [20]:
# Score all six models on the held-out PolitiFact rows. eval_on_dataset
# returns a pair; the first element is stored in the *_acc variables and the
# second is discarded.

# Models trained on the mixed dataset.
lr_pf_acc, _ = eval_on_dataset(
    lr_mixed_model, lr_mixed_vectorizer, X_pf_test, y_pf_test
)
svm_pf_acc, _ = eval_on_dataset(
    svm_mixed_model, svm_mixed_vectorizer, X_pf_test, y_pf_test
)
nb_pf_acc, _ = eval_on_dataset(
    nb_mixed_model, nb_mixed_vectorizer, X_pf_test, y_pf_test
)

# Baseline models loaded from disk.
lr_kaggle_pf_acc, _ = eval_on_dataset(
    lr_kaggle_model, lr_kaggle_vectorizer, X_pf_test, y_pf_test
)
svm_kaggle_pf_acc, _ = eval_on_dataset(
    svm_kaggle_model, svm_kaggle_vectorizer, X_pf_test, y_pf_test
)
nb_kaggle_pf_acc, _ = eval_on_dataset(
    nb_kaggle_model, nb_kaggle_vectorizer, X_pf_test, y_pf_test
)

print("=== PolitiFact Results ===")
print(
    f"LR - Kaggle: {lr_kaggle_pf_acc:.4f}, Mixed: {lr_pf_acc:.4f}",
    f"SVM - Kaggle: {svm_kaggle_pf_acc:.4f}, Mixed: {svm_pf_acc:.4f}",
    f"NB - Kaggle: {nb_kaggle_pf_acc:.4f}, Mixed: {nb_pf_acc:.4f}",
    sep="\n",
)


=== PolitiFact Results ===
LR - Kaggle: 0.5409, Mixed: 0.5925
SVM - Kaggle: 0.5196, Mixed: 0.6299
NB - Kaggle: 0.5623, Mixed: 0.5890


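## Evaluate on GossipCop

**NOTE (review):** the accuracies reported below are only about 0.25–0.29 for every model. If the task is binary, that is well below chance, so before interpreting the "improvement" confirm that GossipCop uses the same label encoding as the training data and check the per-class predictions (e.g. with a classification report) for a collapse onto a single class.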


In [21]:
# Score all six models on the full GossipCop set. eval_on_dataset returns a
# pair; the first element is stored in the *_acc variables and the second is
# discarded.

# Models trained on the mixed dataset.
lr_gc_acc, _ = eval_on_dataset(
    lr_mixed_model, lr_mixed_vectorizer, X_gc_test, y_gc_test
)
svm_gc_acc, _ = eval_on_dataset(
    svm_mixed_model, svm_mixed_vectorizer, X_gc_test, y_gc_test
)
nb_gc_acc, _ = eval_on_dataset(
    nb_mixed_model, nb_mixed_vectorizer, X_gc_test, y_gc_test
)

# Baseline models loaded from disk.
lr_kaggle_gc_acc, _ = eval_on_dataset(
    lr_kaggle_model, lr_kaggle_vectorizer, X_gc_test, y_gc_test
)
svm_kaggle_gc_acc, _ = eval_on_dataset(
    svm_kaggle_model, svm_kaggle_vectorizer, X_gc_test, y_gc_test
)
nb_kaggle_gc_acc, _ = eval_on_dataset(
    nb_kaggle_model, nb_kaggle_vectorizer, X_gc_test, y_gc_test
)

print("=== GossipCop Results ===")
print(
    f"LR - Kaggle: {lr_kaggle_gc_acc:.4f}, Mixed: {lr_gc_acc:.4f}",
    f"SVM - Kaggle: {svm_kaggle_gc_acc:.4f}, Mixed: {svm_gc_acc:.4f}",
    f"NB - Kaggle: {nb_kaggle_gc_acc:.4f}, Mixed: {nb_gc_acc:.4f}",
    sep="\n",
)


=== GossipCop Results ===
LR - Kaggle: 0.2498, Mixed: 0.2901
SVM - Kaggle: 0.2686, Mixed: 0.2838
NB - Kaggle: 0.2507, Mixed: 0.2689


## Comparison


In [23]:
def _relative_change(baseline_acc, mixed_acc):
    """Format the change of ``mixed_acc`` relative to ``baseline_acc``.

    The result is a signed percentage of the baseline value (a relative
    change, not a difference in percentage points), e.g. ``"+9.54%"``.
    Returns ``"n/a"`` when the baseline is 0, where a relative change is
    undefined and the division would otherwise fail or yield inf.
    """
    if baseline_acc == 0:
        return "n/a"
    return f"{((mixed_acc - baseline_acc) / baseline_acc * 100):+.2f}%"


def _comparison_columns(results):
    """Build the column dict for one comparison table.

    Args:
        results: sequence of ``(model_name, kaggle_only_acc, mixed_acc)``
            tuples, one per model, in display order.

    Returns:
        Dict mapping column title to a list of pre-formatted strings, ready
        to be passed to ``pd.DataFrame``.
    """
    return {
        'Model': [name for name, _kaggle_acc, _mixed_acc in results],
        'Kaggle-only': [
            f"{kaggle_acc:.4f}" for _name, kaggle_acc, _mixed_acc in results
        ],
        'Mixed (90% Kaggle + 10% PF)': [
            f"{mixed_acc:.4f}" for _name, _kaggle_acc, mixed_acc in results
        ],
        'Improvement': [
            _relative_change(kaggle_acc, mixed_acc)
            for _name, kaggle_acc, mixed_acc in results
        ],
    }


# Side-by-side accuracy of the baseline models vs. the models trained on the
# mixed dataset, once per evaluation set.
print("=== PolitiFact Results ===")
pf_data = _comparison_columns([
    ('LR', lr_kaggle_pf_acc, lr_pf_acc),
    ('SVM', svm_kaggle_pf_acc, svm_pf_acc),
    ('NB', nb_kaggle_pf_acc, nb_pf_acc),
])
pf_df = pd.DataFrame(pf_data)
print(pf_df.to_string(index=False))
print()

print("=== GossipCop Results ===")
gc_data = _comparison_columns([
    ('LR', lr_kaggle_gc_acc, lr_gc_acc),
    ('SVM', svm_kaggle_gc_acc, svm_gc_acc),
    ('NB', nb_kaggle_gc_acc, nb_gc_acc),
])
gc_df = pd.DataFrame(gc_data)
print(gc_df.to_string(index=False))


=== PolitiFact Results ===
Model Kaggle-only Mixed (90% Kaggle + 10% PF) Improvement
   LR      0.5409                      0.5925      +9.54%
  SVM      0.5196                      0.6299     +21.23%
   NB      0.5623                      0.5890      +4.75%

=== GossipCop Results ===
Model Kaggle-only Mixed (90% Kaggle + 10% PF) Improvement
   LR      0.2498                      0.2901     +16.15%
  SVM      0.2686                      0.2838      +5.66%
   NB      0.2507                      0.2689      +7.27%
