In [15]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pipeline import append_text_column, combine_dataset, train_logreg, train_svm, eval_on_dataset, clean_dataset 

## Read Data

In [16]:
# Load the pre-combined GossipCop articles from the processed-data folder
# (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="datasets/processed/gossipcop_combined.csv")

# Preview the first five rows to confirm the expected columns are present.
df.head(n=5)

Unnamed: 0,id,news_url,title,tweet_ids,text,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...,Congratulations might be in order for Miley Cy...,0
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...,,0
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...,Thousands are taking the streets to protest Pr...,0
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...,We'd venture to say that Cindy Crawford's daug...,0
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...,Oscar nominations for the 90th annual awards w...,0


## Preprocess/clean dataset

In [17]:
# Remove the tweet-id column (not used by the text models) and discard
# articles whose body text is missing.
#
# Deliberately written without `inplace=True`, and with `errors="ignore"`,
# so the cell is idempotent: re-running it after `tweet_ids` has already been
# dropped no longer raises a KeyError.
df = (
    df.drop(columns=["tweet_ids"], errors="ignore")
      .dropna(subset=["text"])
)
df.head()


Unnamed: 0,id,news_url,title,text,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,Congratulations might be in order for Miley Cy...,0
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,Thousands are taking the streets to protest Pr...,0
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,We'd venture to say that Cindy Crawford's daug...,0
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,Oscar nominations for the 90th annual awards w...,0
5,gossipcop-5189580095,www.townandcountrymag.com/society/tradition/a1...,Here's What Really Happened When JFK Jr. Met P...,"During the summer of 1995, John F. Kennedy Jr....",0


In [18]:
# Run the shared cleaning helper imported from pipeline.py over the frame.
# NOTE(review): the recorded preview shows lower-cased text and a different
# row order, but the exact steps live in pipeline.clean_dataset — confirm there.
# The result is bound back to `df`, so re-running this cell cleans the
# already-cleaned frame again.
df = clean_dataset(df)

# Preview the first five cleaned rows.
df.head(n=5)

Unnamed: 0,id,news_url,title,text,label
0,gossipcop-910806,https://www.usatoday.com/story/life/people/201...,lawsuit against jim carrey from exgirlfriends ...,jim carrey is no longer facing a lawsuit from ...,1
1,gossipcop-5098934079,www.dailymail.co.uk/tvshowbiz/article-5217017/...,miley cyrus and liam hemsworth to start a fami...,they met on the set of the film the last song ...,0
2,gossipcop-848557,http://www.mtv.com/movie-and-tv-awards,movie tv awards,mtv movie tv awards the mtv movie tv awards ho...,1
3,gossipcop-910731,https://www.dailymail.co.uk/tvshowbiz/article-...,anne hathaway teases josh gad about social med...,anne hathaway and josh gad had a laugh on soci...,1
4,gossipcop-3776508422,www.nme.com/news/music/justin-bieber-selena-go...,it looks like justin bieber and selena gomez a...,justin bieber and selena gomez appear to have ...,0


In [19]:
# (rows, columns) of the frame after cleaning — a quick check of how many
# articles remain for evaluation.
df.shape

(14549, 5)

In [20]:
# Build the single text field that is passed to the vectorizers:
# title, then the literal " [TITLE] " marker, then the article body.
# Missing titles/bodies are replaced by empty strings first so the
# concatenation never produces NaN.
title_part = df["title"].fillna("")
body_part = df["text"].fillna("")
df["combined"] = title_part + " [TITLE] " + body_part

## Baseline Logistic Regression Model

In [22]:
# Features and labels for the GossipCop evaluation set.
X_gc = df["combined"]
y_gc = df["label"]

# Load the persisted logistic-regression model and its fitted vectorizer.
# joblib.load unpickles the files, so only load artifacts from a trusted source.
vectorizer = joblib.load("joblist/logreg_vectorizer_kaggle.pkl")
model = joblib.load("joblist/logreg_model.pkl")

# Score with the shared pipeline helper. `gc_report` is returned by the helper
# but is not used in this cell; the report printed below is computed explicitly.
gc_acc, gc_report = eval_on_dataset(model, vectorizer, X_gc, y_gc)

# Vectorize and predict once, as separate named steps, for the printed report.
lr_features = vectorizer.transform(X_gc)
lr_predictions = model.predict(lr_features)

print("=== LR Results ===")
print("Accuracy:", gc_acc)
print("\nClassification Report:")
print(classification_report(y_gc, lr_predictions))

=== LR Results ===
Accuracy: 0.25039521616605953

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.98      0.38      3390
           1       0.83      0.03      0.06     11159

    accuracy                           0.25     14549
   macro avg       0.53      0.50      0.22     14549
weighted avg       0.69      0.25      0.13     14549



## Support Vector Machine (SVM) Model

In [23]:
# Features and labels for the GossipCop evaluation set (re-bound here so the
# cell does not depend on the logistic-regression cell having run).
X_gc = df["combined"]
y_gc = df["label"]

# Load the persisted SVM model and its fitted vectorizer.
# joblib.load unpickles the files, so only load artifacts from a trusted source.
svm_vectorizer = joblib.load("joblist/svm_vectorizer_kaggle.pkl")
svm_model = joblib.load("joblist/svm_model.pkl")

# Score with the shared pipeline helper. `svm_gc_report` is returned by the
# helper but is not used in this cell; the report below is computed explicitly.
svm_gc_acc, svm_gc_report = eval_on_dataset(svm_model, svm_vectorizer, X_gc, y_gc)

# Vectorize and predict once, as separate named steps, for the printed report.
svm_features = svm_vectorizer.transform(X_gc)
svm_predictions = svm_model.predict(svm_features)

print("=== SVM Results on GossipCop ===")
print("GossipCop accuracy:", svm_gc_acc)
print("\nClassification Report:")
print(classification_report(y_gc, svm_predictions))

=== SVM Results on GossipCop ===
GossipCop accuracy: 0.2682658601965771

Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.96      0.38      3390
           1       0.83      0.06      0.11     11159

    accuracy                           0.27     14549
   macro avg       0.53      0.51      0.24     14549
weighted avg       0.69      0.27      0.17     14549



## Model Comparison on GossipCop


In [24]:
# Compare both models on GossipCop dataset
# `gc_acc` and `svm_gc_acc` are the accuracies produced by the two evaluation
# cells above; both must have been run before this cell.
accuracy_gain = svm_gc_acc - gc_acc

# The relative gain is undefined when the baseline accuracy is zero, so fall
# back to "n/a" instead of dividing by zero.
relative_gain = f"{accuracy_gain / gc_acc * 100:.2f}%" if gc_acc else "n/a"

print("=== Model Comparison on GossipCop Dataset ===")
print(f"Logistic Regression Accuracy: {gc_acc:.4f}")
print(f"SVM Accuracy: {svm_gc_acc:.4f}")
# The percentage in parentheses is the gain relative to the logistic-regression
# accuracy, not a difference in percentage points.
print(f"\nSVM improvement: {accuracy_gain:.4f} ({relative_gain})")
# Plain string: there are no placeholders, so no f-prefix is needed.
print("\nNote: Both models were trained on Kaggle dataset and tested on GossipCop")


=== Model Comparison on GossipCop Dataset ===
Logistic Regression Accuracy: 0.2504
SVM Accuracy: 0.2683

SVM improvement: 0.0179 (7.14%)

Note: Both models were trained on Kaggle dataset and tested on GossipCop
