In [1]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pipeline import append_text_column, combine_dataset, train_logreg, eval_on_dataset, clean_dataset 

  from .autonotebook import tqdm as notebook_tqdm


## Read Data

In [2]:
# Load the GossipCop CSV into the working frame and preview the first rows.
gossipcop_csv = "datasets/processed/gossipcop_combined.csv"
df = pd.read_csv(gossipcop_csv)

df.head()

Unnamed: 0,id,news_url,title,tweet_ids,text,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...,Congratulations might be in order for Miley Cy...,0
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...,,0
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...,Thousands are taking the streets to protest Pr...,0
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...,We'd venture to say that Cindy Crawford's daug...,0
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...,Oscar nominations for the 90th annual awards w...,0


## Preprocess/clean dataset

In [3]:
# Remove the tweet-id column and discard rows whose article text is missing.
#
# The drop is done without inplace=True and with errors="ignore" so the cell
# can be re-run safely: the original in-place drop raised KeyError on a second
# execution because 'tweet_ids' had already been removed from the frame.
# The index is intentionally left as-is (not reset) after dropna.
df = (
    df
    .drop(columns=["tweet_ids"], errors="ignore")
    .dropna(subset=["text"])
)
df.head()


Unnamed: 0,id,news_url,title,text,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,Congratulations might be in order for Miley Cy...,0
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,Thousands are taking the streets to protest Pr...,0
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,We'd venture to say that Cindy Crawford's daug...,0
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,Oscar nominations for the 90th annual awards w...,0
5,gossipcop-5189580095,www.townandcountrymag.com/society/tradition/a1...,Here's What Really Happened When JFK Jr. Met P...,"During the summer of 1995, John F. Kennedy Jr....",0


In [4]:
# Pass the frame through clean_dataset (imported from the pipeline module) and
# preview the result.
# NOTE(review): `df` is rebound to the cleaned output, so re-running this cell
# feeds already-cleaned data back into clean_dataset — confirm that is harmless.
df = df.pipe(clean_dataset)
df.head()

Unnamed: 0,id,news_url,title,text,label
0,gossipcop-943314,https://www.usmagazine.com/celebrity-moms/news...,hilary duffs son luca chose a unique name for ...,hilary duffs babytobe could be named cafont cr...,1
1,gossipcop-926184,https://www.elle.com/culture/movies-tv/a191364...,of the most shocking revelations from bachelor...,every item on this page was chosen by an elle ...,1
2,gossipcop-888831,https://www.newbeauty.com/slideshow/2070-the-k...,kim khlo and kourtney kardashian beauty,the kardashian clan live to be pampered and pr...,1
3,gossipcop-844154,https://ew.com/recap/the-magicians-season-1-fi...,the magicians finale recap have you brought me...,we are all the protagonists of our own lives b...,1
4,gossipcop-866133,https://www.independent.co.uk/arts-entertainme...,stranger things season eleven is back in final...,your support helps us to tell the story read m...,1


In [5]:
# Show (rows, columns) of the current frame as a quick sanity check after cleaning.
df.shape

(14549, 5)

In [6]:
# Build the single text feature: title, a literal " [TITLE] " separator, then
# the article body. Missing values are replaced by empty strings first so the
# concatenation never produces NaN.
title_part = df["title"].fillna("")
body_part = df["text"].fillna("")
df["combined"] = title_part + " [TITLE] " + body_part

## Baseline Logistic Regression Model

In [7]:
# Score the saved logistic-regression model on the GossipCop frame.
# joblib.load unpickles the files, so only load artifacts from a trusted source.
model = joblib.load("joblist/logreg_model.pkl")
vectorizer = joblib.load("joblist/logreg_vectorizer_kaggle.pkl")

# Features are the title+body column built above; targets are the label column.
X_gc, y_gc = df["combined"], df["label"]

# gc_report is returned by the helper but not used below; the report that gets
# printed is recomputed from a fresh prediction pass.
gc_acc, gc_report = eval_on_dataset(model, vectorizer, X_gc, y_gc)

print("=== LR Results ===")
print("Accuracy:", gc_acc)
print("\nClassification Report:")
gc_features = vectorizer.transform(X_gc)
gc_pred = model.predict(gc_features)
# NOTE(review): the recorded run shows accuracy of about 0.25 with nearly every
# sample predicted as class 0 — confirm this dataset's label encoding matches
# the encoding the model was trained with.
print(classification_report(y_gc, gc_pred))

=== LR Results ===
Accuracy: 0.2484019520241941

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.98      0.38      3390
           1       0.82      0.03      0.05     11159

    accuracy                           0.25     14549
   macro avg       0.53      0.50      0.21     14549
weighted avg       0.68      0.25      0.13     14549



## Support Vector Machine (SVM) Model

In [8]:
# Score the saved SVM model on the GossipCop frame.
# joblib.load unpickles the files, so only load artifacts from a trusted source.
# Note that `model` and `vectorizer` are rebound here, replacing whatever an
# earlier cell stored under those names.
model = joblib.load("joblist/svm_model.pkl")
vectorizer = joblib.load("joblist/svm_vectorizer_kaggle.pkl")

# NOTE(review): this cell scores the raw "text" column, whereas the
# logistic-regression cell scores df["combined"] — confirm this matches the
# input the SVM vectorizer was fitted on.
X_gc, y_gc = df["text"], df["label"]

# gc_report is returned by the helper but not used below; the report that gets
# printed is recomputed from a fresh prediction pass.
gc_acc, gc_report = eval_on_dataset(model, vectorizer, X_gc, y_gc)

print("GossipCop accuracy:", gc_acc)
print("GossipCop Classification Report:")
gc_features = vectorizer.transform(X_gc)
gc_pred = model.predict(gc_features)
print(classification_report(y_gc, gc_pred))

GossipCop accuracy: 0.26936559213691663
GossipCop Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.96      0.38      3390
           1       0.83      0.06      0.11     11159

    accuracy                           0.27     14549
   macro avg       0.53      0.51      0.25     14549
weighted avg       0.69      0.27      0.17     14549

