In [13]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pipeline import train_logreg, eval_on_dataset, clean_dataset, combine_politifact

## Read Data

In [6]:
# Rebuild the combined PolitiFact CSV. The log line this call prints reports
# the file being saved to datasets/processed/politifact_combined.csv.
combine_politifact()

# Load the combined file from disk; this must run after the call above.
df = pd.read_csv("datasets/processed/politifact_combined.csv")

# Preview the first rows before any cleaning is applied.
df.head()

Saved combined Politifact dataset to: datasets/processed/politifact_combined.csv
Shape: (1097, 6)


Unnamed: 0,id,news_url,title,tweet_ids,text,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,"The West Texas Federal Appeals Court, operatin...",0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,Share on Twitter Share on Facebook Share on Go...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,Age of Wonders 4 Thrones of Blood RUNE Free Do...,0


## Preprocess/clean data

In [7]:
# Replace the raw frame with its cleaned version.
# NOTE(review): `df` is overwritten with a value derived from itself, so
# re-running this cell feeds already-cleaned data back into clean_dataset and
# the raw frame is no longer available without re-running the load cell.
# Judging by the displayed rows, cleaning appears to lowercase the text, strip
# punctuation and change the row order — confirm in pipeline.clean_dataset.
df = clean_dataset(df)

# Preview the cleaned rows.
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,text,label
0,politifact7258,https://web.archive.org/web/20121222121141/htt...,shields and gerson on cabinet noms gun laws bo...,651615424359059456,judy woodruff and now to the analysis of shiel...,1
1,politifact5237,http://www.politifact.com/virginia/statements/...,morgan griffith says epa treats milk spills sa...,46251767318589440\t46269888616075265\t46348907...,yous rep h morgan griffith is milking his oppo...,1
2,politifact14394,https://web.archive.org/web/20170731052757/htt...,breaking hillary clinton has third heart attac...,,hillary clinton had a third and mostlikely fat...,0
3,politifact14827,http://www.thegatewaypundit.com/2017/12/called...,we called it gloria allred accuser admits she ...,939169831374524417\t939170337639600129\t939170...,in a last ditch effort to take down republican...,0
4,politifact12556,https://web.archive.org/web/20160709050756/htt...,donald j trump on the stakes of the election,708116187549474816\t708482174400512000\t708495...,june donald j trump on the stakes of the elect...,1


In [8]:
# (rows, columns) of the cleaned frame, to see how many rows survived cleaning.
df.shape

(624, 6)

## Baseline Logistic Regression Model

In [12]:
# Train/test split
# Partition the cleaned PolitiFact rows: 95% go to the test side, 5% to the
# train side. The fixed seed keeps the partition reproducible between runs.
# NOTE(review): a 0.95 test fraction is unusual — confirm it is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    random_state=42,
    test_size=0.95,
)

In [16]:
# Load the persisted classifier and its vectorizer from disk (presumably the
# logistic-regression model and a Kaggle-fitted vectorizer, per the filenames).
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.
model = joblib.load("joblist/logreg_model.pkl")
vectorizer = joblib.load("joblist/vectorizer_kaggle.pkl")

# Score the loaded model on every cleaned PolitiFact row: the "text" column is
# the input and the "label" column is the target.
X_pf, y_pf = df["text"], df["label"]

pf_acc, pf_report = eval_on_dataset(
    model,
    vectorizer,
    X_pf,
    y_pf,
)
print("PolitiFact accuracy:", pf_acc)

PolitiFact accuracy: 0.5689102564102564


In [17]:
# Per-class precision / recall / F1 of the loaded model on the PolitiFact rows.
pf_features = vectorizer.transform(X_pf)      # vectorize the raw text
pf_predictions = model.predict(pf_features)   # predicted labels
print(classification_report(y_pf, pf_predictions))

              precision    recall  f1-score   support

           0       0.55      0.88      0.67       316
           1       0.67      0.25      0.36       308

    accuracy                           0.57       624
   macro avg       0.61      0.56      0.52       624
weighted avg       0.61      0.57      0.52       624

