# **SUPPORT VECTOR MACHINE**

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1" # "0" o "1"

In [5]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, confusion_matrix

#### DATASET

In [7]:
# -------------
# Dataset paths
# -------------

DATA_PATH_CELEBRITY = "datasets/Celebrity/df_celebrity.csv"
DATA_PATH_CIDII = "datasets/CIDII/df_cidii.csv"
DATA_PATH_FAKES = "datasets/FaKES/df_fakes.csv"
DATA_PATH_FAKEVSATIRE = "datasets/FakeVsSatire/df_fakevssatire.csv"
DATA_PATH_HORNE = "datasets/Horne/df_horne_all.csv"
DATA_PATH_INFODEMIC = "datasets/Infodemic/df_infodemic.csv"
DATA_PATH_ISOT = "datasets/ISOT/df_isot.csv"
DATA_PATH_KAGGLE_CLEMENT = "datasets/Kaggle_clement/df_kaggle_clement_def.csv"
DATA_PATH_KAGGLE_MEG = "datasets/Kaggle_meg/fake.csv"
DATA_PATH_LIAR_PLUS = "datasets/LIAR_PLUS/df_liar_plus_complete.csv"
DATA_PATH_POLITIFACT = "datasets/Politifact/df_politifact.csv"
DATA_PATH_UNIPI_NDF = "datasets/Unipi_NDF/df_ndf.csv"

In [8]:
# ----------------------------
# Load and preprocess datasets
# ----------------------------

# Celebrity
dfCelebrity = pd.read_csv(DATA_PATH_CELEBRITY, sep="\t", encoding="utf-8")
dfCelebrity = dfCelebrity[["texts", "labels"]] # keep only relevant columns
dfCelebrity = dfCelebrity.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# CIDII
dfCidii = pd.read_csv(DATA_PATH_CIDII, sep="\t", encoding="utf-8")
dfCidii = dfCidii[["texts", "labels"]] # keep only relevant columns
dfCidii = dfCidii.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# FaKES
dfFakes = pd.read_csv(DATA_PATH_FAKES, sep="\t", encoding="utf-8")
dfFakes = dfFakes[["texts", "labels"]] # keep only relevant columns
dfFakes = dfFakes.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# FakeVsSatire
dfFakeVsSatire = pd.read_csv(DATA_PATH_FAKEVSATIRE, sep="\t", encoding="utf-8")
dfFakeVsSatire = dfFakeVsSatire[["texts", "labels"]] # keep only relevant columns
dfFakeVsSatire = dfFakeVsSatire.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# Horne
dfHorne = pd.read_csv(DATA_PATH_HORNE, sep="\t", encoding="utf-8")
dfHorne = dfHorne[["texts", "labels"]] # keep only relevant columns
dfHorne = dfHorne.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# Infodemic
dfInfodemic = pd.read_csv(DATA_PATH_INFODEMIC, sep="\t", encoding="utf-8")
dfInfodemic = dfInfodemic[["texts", "labels"]] # keep only relevant columns
dfInfodemic = dfInfodemic.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# ISOT
dfIsot = pd.read_csv(DATA_PATH_ISOT, sep="\t", encoding="utf-8")
dfIsot = dfIsot[["texts", "labels"]] # keep only relevant columns
dfIsot = dfIsot.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# Kaggle_clement
dfKaggleClement = pd.read_csv(DATA_PATH_KAGGLE_CLEMENT, encoding="utf-8")
dfKaggleClement["texts"] = dfKaggleClement["title"].astype(str) + " " + dfKaggleClement["text"].astype(str) # merge title and text
dfKaggleClement = dfKaggleClement[["texts", "labels"]] # keep only relevant columns
dfKaggleClement = dfKaggleClement.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# Kaggle_meg
dfKaggleMeg = pd.read_csv(DATA_PATH_KAGGLE_MEG, encoding="utf-8")
dfKaggleMeg["texts"] = dfKaggleMeg["title"].astype(str) + " " + dfKaggleMeg["text"].astype(str) # merge title and text
dfKaggleMeg["labels"] = dfKaggleMeg["spam_score"].apply(lambda x: 1 if x > 0.5 else 0) # create binary labels based on spam_score
dfKaggleMeg = dfKaggleMeg[["texts", "labels"]] # keep only relevant columns
dfKaggleMeg = dfKaggleMeg.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# LIAR_PLUS
dfLiarPlus = pd.read_csv(DATA_PATH_LIAR_PLUS, sep="\t", encoding="utf-8")
dfLiarPlus = dfLiarPlus[["texts", "labels"]] # keep only relevant columns
dfLiarPlus = dfLiarPlus.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# Politifact
dfPolitifact = pd.read_csv(DATA_PATH_POLITIFACT, sep="\t", encoding="utf-8")
dfPolitifact = dfPolitifact[["texts", "labels"]] # keep only relevant columns
dfPolitifact = dfPolitifact.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN
# Unipi_NDF
dfNDF = pd.read_csv(DATA_PATH_UNIPI_NDF, sep="\t", encoding="utf-8")
dfNDF = dfNDF[["texts", "labels"]] # keep only relevant columns
dfNDF = dfNDF.dropna(subset=["texts", "labels"]) # remove rows where texts OR labels are NaN

## VERSION 1: Fine-Tuning Only

In [9]:
# --------------------------
# Dataset splitting function
# --------------------------

def split_dataset(df, test_size=0.2, val_size=0.2, random_state=42):
    X = df["texts"].astype(str)
    y = df["labels"]

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(test_size + val_size), stratify=y, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=random_state
    )

    return {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test)
    }

In [10]:
# -----------------------
# Model building function
# -----------------------

def build_model(C=10.0, penalty="l1", loss="squared_hinge", dual=False):
    """
    Builds a scikit-learn Pipeline with TF-IDF vectorization and SVM classifier.

    Args:
        C (float): Inverse of regularization strength for SVM.
        penalty (str): Regularization type for SVM.
        loss (str): Loss function to use for SVM.
        dual (bool): Whether to solve the dual optimization problem.

    Returns:
        Pipeline: A scikit-learn Pipeline object (TF-IDF + SVM).
    """
    
    return Pipeline([
        ('tfidf', TfidfVectorizer(  # TD-IDF vectorization inside pipeline
            max_features=5000,      # limit to top 5000 features
            ngram_range=(1, 2),     # unigrams + bigrams
            stop_words="english"    # remove English stop words
        )),
        ('clf', LinearSVC(         # Linear SVM classifier
            C=C,
            penalty=penalty,
            loss=loss,
            dual=dual,
            max_iter=2000,
            random_state=42
        ))
    ])

In [11]:
# --------------------------------
# Training and evaluation function
# --------------------------------

def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """
    Trains the model and evaluates it on the validation set using weighted F1-score.

    Args:
        model (Pipeline): The scikit-learn Pipeline model to train.
        X_train (array-like): Training features.
        y_train (array-like): Training labels.
        X_val (array-like): Validation features.
        y_val (array-like): Validation labels.

    Returns:
        float: Weighted F1-score on the validation set.
    """

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average="weighted") # weighted F1-score: average for label imbalance
    return f1


In [12]:
# --------------------------------
# Fine-tuning on multiple datasets
# --------------------------------

datasets_df = {
    "Celebrity": dfCelebrity,
    "CIDII": dfCidii,
    "FaKES": dfFakes,
    "FakeVsSatire": dfFakeVsSatire,
    "Horne": dfHorne,
    "Infodemic": dfInfodemic,
    "ISOT": dfIsot,
    "Kaggle_clement": dfKaggleClement,
    "Kaggle_meg": dfKaggleMeg,
    "LIAR_PLUS": dfLiarPlus,
    "Politifact": dfPolitifact,
    "Unipi_NDF": dfNDF
}

datasets = {name: split_dataset(df) for name, df in datasets_df.items()} # split all datasets
model = build_model() # initialize model

results = {}

# sequential training
for i, (name, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on {name} ===")
    
    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val])) # fine-tune on train + val

    y_pred = model.predict(X_test)
    print(f"Classification Report after {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix after {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nWeighted F1-score after {name}:", f1_score(y_test, y_pred, average="weighted"))


    # evaluation on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[name] = {}
    for test_name, test_data in datasets.items(): # for each dataset
        X_te, y_te = test_data["test"]
        preds = model.predict(X_te)
        f1 = f1_score(y_te, preds, average="weighted")
        results[name][test_name] = f1
        print(f"Evaluation on {test_name}: Weighted F1 = {f1:.4f}")



=== Phase 1: Training/Fine-tuning on Celebrity ===




Classification Report after Celebrity:
              precision    recall  f1-score   support

           0       0.58      0.68      0.62        50
           1       0.61      0.50      0.55        50

    accuracy                           0.59       100
   macro avg       0.59      0.59      0.59       100
weighted avg       0.59      0.59      0.59       100

Confusion Matrix after Celebrity:
[[34 16]
 [25 25]]

Weighted F1-score after Celebrity: 0.5866518802298619

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.5867
Evaluation on CIDII: Weighted F1 = 0.5747
Evaluation on FaKES: Weighted F1 = 0.3924
Evaluation on FakeVsSatire: Weighted F1 = 0.4506
Evaluation on Horne: Weighted F1 = 0.6382
Evaluation on Infodemic: Weighted F1 = 0.4439
Evaluation on ISOT: Weighted F1 = 0.4919
Evaluation on Kaggle_clement: Weighted F1 = 0.4727
Evaluation on Kaggle_meg: Weighted F1 = 0.8621
Evaluation on LIAR_PLUS: Weighted F1 = 0.4593
Evaluation on Politifact: Weighted F1 



Classification Report after FaKES:
              precision    recall  f1-score   support

           0       0.48      0.52      0.50        86
           1       0.40      0.36      0.38        75

    accuracy                           0.45       161
   macro avg       0.44      0.44      0.44       161
weighted avg       0.44      0.45      0.44       161

Confusion Matrix after FaKES:
[[45 41]
 [48 27]]

Weighted F1-score after FaKES: 0.44448386202090356

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.5146
Evaluation on CIDII: Weighted F1 = 0.5388
Evaluation on FaKES: Weighted F1 = 0.4445
Evaluation on FakeVsSatire: Weighted F1 = 0.3619
Evaluation on Horne: Weighted F1 = 0.4799
Evaluation on Infodemic: Weighted F1 = 0.4648
Evaluation on ISOT: Weighted F1 = 0.4218
Evaluation on Kaggle_clement: Weighted F1 = 0.4157
Evaluation on Kaggle_meg: Weighted F1 = 0.8099
Evaluation on LIAR_PLUS: Weighted F1 = 0.5101
Evaluation on Politifact: Weighted F1 = 0.4973
Ev



Classification Report after FakeVsSatire:
              precision    recall  f1-score   support

           0       0.77      0.66      0.71        41
           1       0.78      0.86      0.82        57

    accuracy                           0.78        98
   macro avg       0.77      0.76      0.76        98
weighted avg       0.78      0.78      0.77        98

Confusion Matrix after FakeVsSatire:
[[27 14]
 [ 8 49]]

Weighted F1-score after FakeVsSatire: 0.7722610096670247

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.4858
Evaluation on CIDII: Weighted F1 = 0.3953
Evaluation on FaKES: Weighted F1 = 0.4735
Evaluation on FakeVsSatire: Weighted F1 = 0.7723
Evaluation on Horne: Weighted F1 = 0.5961
Evaluation on Infodemic: Weighted F1 = 0.3526
Evaluation on ISOT: Weighted F1 = 0.4438
Evaluation on Kaggle_clement: Weighted F1 = 0.4504
Evaluation on Kaggle_meg: Weighted F1 = 0.4735
Evaluation on LIAR_PLUS: Weighted F1 = 0.4095
Evaluation on Politifact: Wei



Classification Report after Horne:
              precision    recall  f1-score   support

           0       0.80      0.90      0.85        41
           1       0.80      0.64      0.71        25

    accuracy                           0.80        66
   macro avg       0.80      0.77      0.78        66
weighted avg       0.80      0.80      0.80        66

Confusion Matrix after Horne:
[[37  4]
 [ 9 16]]

Weighted F1-score after Horne: 0.7977475908510391

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.5806
Evaluation on CIDII: Weighted F1 = 0.6254
Evaluation on FaKES: Weighted F1 = 0.4370
Evaluation on FakeVsSatire: Weighted F1 = 0.6646
Evaluation on Horne: Weighted F1 = 0.7977
Evaluation on Infodemic: Weighted F1 = 0.4667
Evaluation on ISOT: Weighted F1 = 0.5524
Evaluation on Kaggle_clement: Weighted F1 = 0.5523
Evaluation on Kaggle_meg: Weighted F1 = 0.6917
Evaluation on LIAR_PLUS: Weighted F1 = 0.4951
Evaluation on Politifact: Weighted F1 = 0.6395
Eva



Classification Report after Kaggle_meg:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2545
           1       0.20      0.20      0.20        55

    accuracy                           0.97      2600
   macro avg       0.59      0.59      0.59      2600
weighted avg       0.97      0.97      0.97      2600

Confusion Matrix after Kaggle_meg:
[[2500   45]
 [  44   11]]

Weighted F1-score after Kaggle_meg: 0.9659200602116697

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.3552
Evaluation on CIDII: Weighted F1 = 0.4784
Evaluation on FaKES: Weighted F1 = 0.3857
Evaluation on FakeVsSatire: Weighted F1 = 0.2425
Evaluation on Horne: Weighted F1 = 0.4761
Evaluation on Infodemic: Weighted F1 = 0.4221
Evaluation on ISOT: Weighted F1 = 0.3333
Evaluation on Kaggle_clement: Weighted F1 = 0.3254
Evaluation on Kaggle_meg: Weighted F1 = 0.9659
Evaluation on LIAR_PLUS: Weighted F1 = 0.4140
Evaluation on Politifact: W



Classification Report after Politifact:
              precision    recall  f1-score   support

           0       0.79      0.95      0.87        65
           1       0.87      0.56      0.68        36

    accuracy                           0.81       101
   macro avg       0.83      0.75      0.77       101
weighted avg       0.82      0.81      0.80       101

Confusion Matrix after Politifact:
[[62  3]
 [16 20]]

Weighted F1-score after Politifact: 0.7997070893530032

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.4179
Evaluation on CIDII: Weighted F1 = 0.5136
Evaluation on FaKES: Weighted F1 = 0.5376
Evaluation on FakeVsSatire: Weighted F1 = 0.4824
Evaluation on Horne: Weighted F1 = 0.6848
Evaluation on Infodemic: Weighted F1 = 0.5102
Evaluation on ISOT: Weighted F1 = 0.5614
Evaluation on Kaggle_clement: Weighted F1 = 0.5507
Evaluation on Kaggle_meg: Weighted F1 = 0.8192
Evaluation on LIAR_PLUS: Weighted F1 = 0.4452
Evaluation on Politifact: Weighted 

In [13]:
# ---------------
# Results summary
# ---------------

print("\n=== Results Summary ===")
for name, res in results.items():
    print(f"\nResults after training on {name}:")
    for test_name, f1 in res.items():
        print(f"  Test on {test_name}: Weighted F1 = {f1:.4f}")


=== Results Summary ===

Results after training on Celebrity:
  Test on Celebrity: Weighted F1 = 0.5867
  Test on CIDII: Weighted F1 = 0.5747
  Test on FaKES: Weighted F1 = 0.3924
  Test on FakeVsSatire: Weighted F1 = 0.4506
  Test on Horne: Weighted F1 = 0.6382
  Test on Infodemic: Weighted F1 = 0.4439
  Test on ISOT: Weighted F1 = 0.4919
  Test on Kaggle_clement: Weighted F1 = 0.4727
  Test on Kaggle_meg: Weighted F1 = 0.8621
  Test on LIAR_PLUS: Weighted F1 = 0.4593
  Test on Politifact: Weighted F1 = 0.5542
  Test on Unipi_NDF: Weighted F1 = 0.4245

Results after training on CIDII:
  Test on Celebrity: Weighted F1 = 0.3967
  Test on CIDII: Weighted F1 = 0.8663
  Test on FaKES: Weighted F1 = 0.3634
  Test on FakeVsSatire: Weighted F1 = 0.4202
  Test on Horne: Weighted F1 = 0.6812
  Test on Infodemic: Weighted F1 = 0.4553
  Test on ISOT: Weighted F1 = 0.4186
  Test on Kaggle_clement: Weighted F1 = 0.4175
  Test on Kaggle_meg: Weighted F1 = 0.9074
  Test on LIAR_PLUS: Weighted F1 = 0