In [1]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pipeline import train_logreg, eval_on_dataset, clean_dataset, combine_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Read Data

In [2]:
# Regenerate the combined PolitiFact file; per its printed output,
# combine_dataset() saves to the same path that is read back below.
combine_dataset()

combined_csv_path = "datasets/processed/politifact_combined.csv"
df = pd.read_csv(combined_csv_path)

# Preview the first five rows as a quick sanity check.
df.head(5)

Saved combined dataset to: datasets/processed/politifact_combined.csv
Shape: (1097, 6)


Unnamed: 0,id,news_url,title,tweet_ids,text,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,"The West Texas Federal Appeals Court, operatin...",0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,Share on Twitter Share on Facebook Share on Go...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,Age of Wonders 4 Thrones of Blood RUNE Free Do...,0


## Preprocess / Clean Data

In [3]:
# Replace the raw frame with the output of the project's cleaning step.
# Judging by the displayed rows and the shape check that follows, cleaning
# lowercases the text and leaves fewer rows than were loaded; see
# pipeline.clean_dataset for the exact rules.
# NOTE(review): `df` is overwritten, so re-running this cell cleans
# already-cleaned data. Re-run the load cell first for a fresh start.
df = df.pipe(clean_dataset)

df.head(5)

Unnamed: 0,id,news_url,title,tweet_ids,text,label
0,politifact14827,http://www.thegatewaypundit.com/2017/12/called...,we called it gloria allred accuser admits she ...,939169831374524417\t939170337639600129\t939170...,in a last ditch effort to take down republican...,0
1,politifact14920,https://www.ecfr.gov/cgi-bin/text-idx?gp=&SID=...,ecfr code of federal regulations,1000393413857038337\t1000539788850548737\t1000...,request access due to aggressive automated scr...,1
2,politifact1084,http://www.senate.gov/pagelayout/reference/nom...,yous senate supreme court nominations present,1672463074\t1810127907\t7887282655\t1444596837...,use this sites search or visit the senate inde...,1
3,politifact15354,https://exclusive103.com/2018/04/30/thousands-...,thousands killed as israel drops tactical nucl...,990880029549694976\t992661084896219136,an israeli airplane in syria has dropped the f...,0
4,politifact14794,https://web.archive.org/web/20171127001649/htt...,singer tina turner this thanksgiving is the fi...,934060170908504064\t934207140243353602\t934214...,are you ready get it now increase more than of...,0


In [4]:
# (rows, columns) of the frame returned by clean_dataset.
df.shape

(624, 6)

In [5]:
# Build the single text feature: title and body joined by a literal
# " [TITLE] " marker. Missing values become empty strings first, so the
# concatenation never produces NaN.
title_part = df["title"].fillna("")
body_part = df["text"].fillna("")
df["combined"] = title_part + " [TITLE] " + body_part

## Baseline Logistic Regression Model

In [6]:
# Load the persisted logistic-regression model and its vectorizer.
# NOTE(review): joblib.load unpickles arbitrary objects; only load
# artifacts that were produced by this project.
model = joblib.load("joblist/logreg_model.pkl")
vectorizer = joblib.load("joblist/logreg_vectorizer_kaggle.pkl")

# Evaluate on the title + text feature built in the previous cell.
X_pf = df["combined"]
y_pf = df["label"]

lr_eval = eval_on_dataset(model, vectorizer, X_pf, y_pf)
pf_acc, pf_report = lr_eval

print("=== LR Results ===")
print("Accuracy:", pf_acc)
print("\nClassification Report:")

# NOTE(review): pf_report is never used. The report printed below comes from
# a second vectorize + predict pass over the same data. If pf_report is the
# same report, printing it would avoid the duplicate work. TODO: confirm.
lr_features = vectorizer.transform(X_pf)
lr_predictions = model.predict(lr_features)
print(classification_report(y_pf, lr_predictions))

=== LR Results ===
Accuracy: 0.5416666666666666

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.93      0.67       316
           1       0.67      0.14      0.23       308

    accuracy                           0.54       624
   macro avg       0.60      0.54      0.45       624
weighted avg       0.60      0.54      0.46       624



## Support Vector Machine (SVM)

In [7]:
# Load the persisted SVM artifacts. This rebinds `model` and `vectorizer`,
# so anything evaluated after this cell uses the SVM, not the LR model.
model = joblib.load("joblist/svm_model.pkl")
vectorizer = joblib.load("joblist/svm_vectorizer_kaggle.pkl")

# NOTE(review): this uses the body text only, whereas the logistic-regression
# cell evaluates on df["combined"]. Confirm this matches how the SVM
# vectorizer was fitted.
X_pf = df["text"]
y_pf = df["label"]

svm_eval = eval_on_dataset(model, vectorizer, X_pf, y_pf)
pf_acc, pf_report = svm_eval
print("PolitiFact accuracy:", pf_acc)

PolitiFact accuracy: 0.5336538461538461


## LSTM