# Fake News Prediction

In [1]:
import itertools
from collections import Counter
from pprint import pprint
from copy import copy
from string import punctuation

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
LabeledSentence = gensim.models.doc2vec.LabeledSentence

## Pre-Processing

In [3]:
# convert stopwords and punctuation to sets for faster lookup
stopwords_lookup = set(stopwords.words('english'))
punctuation_lookup = set(punctuation)
stemmer = PorterStemmer()

def process_text(string_input, punctuation, stopwords, stem=True):
    for character in string_input:
        if character in punctuation:
            string_input = string_input.replace(character, "")

    processed_string = string_input.lower().split()

    processed_string = ' '.join([stemmer.stem(word) for word in processed_string if word not in stopwords])
    
    return processed_string

stopwords_lookup.add('said')
stopwords_lookup.add('mr')

### Load Data

In [4]:
df = pd.read_csv('fake_or_real_news.csv')

### Drop all columns apart from title and label

In [5]:
for key in df:
    if not (key == 'title' or key == 'label'):
        del df[key]

### Mold data

In [6]:
label_map = {'FAKE' : 0, 'REAL' : 1}
df['label'] = df['label'].map(label_map)

In [7]:
df.shape

(6335, 2)

In [8]:
df.head(10)

Unnamed: 0,title,label
0,You Can Smell Hillary’s Fear,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,0
2,Kerry to go to Paris in gesture of sympathy,1
3,Bernie supporters on Twitter erupt in anger ag...,0
4,The Battle of New York: Why This Primary Matters,1
5,"Tehran, USA",0
6,Girl Horrified At What She Watches Boyfriend D...,0
7,‘Britain’s Schindler’ Dies at 106,1
8,Fact check: Trump and Clinton at the 'commande...,1
9,Iran reportedly makes new push for uranium con...,1


### Split Data to X, y

In [9]:
X = df['title']    # predictor feature
X = X.apply(lambda x: process_text(x, punctuation_lookup, stopwords_lookup)) # apply stopwords through NLTK
X = df['title'].values # get values
y = df['label'].values # predicted class

## Vectorization

In [10]:
# count and tfidf

count_vectorizer = CountVectorizer(max_features=1000)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

X_count = count_vectorizer.fit_transform(X)
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [19]:
# doc2vec

# adapted from tutorial at https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1

class LabeledSentenceIterator:
    def __init__(self, doc_list, label_list):
        self.doc_list = doc_list
        
        self.label_list = []
        
        self.label_list = ['Fake' if label == 0 else 'Real' for label in label_list]
        
    def __iter__(self):
        num_fake = 0
        num_real = 0
        
        for doc, label in zip(self.doc_list, self.label_list):
            words = doc.split()
            
            if label == 'Fake':
                tag = [label + '_' + str(num_fake)]
                num_fake += 1
                
            elif label == 'Real':
                tag = [label + '_' + str(num_real)]
                num_real += 1
            
            yield LabeledSentence(words = words, tags = tag)

doc_iter = LabeledSentenceIterator(list(X), y)
            
dtov_model = gensim.models.Doc2Vec(size=300, window=10, min_count=5, workers=11,alpha=0.025, min_alpha=0.025)

dtov_model.build_vocab(doc_iter)

for epoch in range(10):
    dtov_model.train(doc_iter)
    dtov_model.alpha -= 0.002
    dtov_model.min_alpha = dtov_model.alpha 
    dtov_model.train(doc_iter)
    
# save the trained doc2vec model
# dtov_model.save('doc_to_vec.model')

In [16]:
# load the model, previously trained
# dtov_model = gensim.models.Doc2Vec.load('doc_to_vec.model')

In [31]:
X_doc2vec = []
y_doc2vec = y
num_fake1 = 0
num_real1 = 0
for value in y:
    if (value == 0):
        X_doc2vec.append(dtov_model.docvecs['Fake_' + str(num_fake1)])
        num_fake1 += 1
    else:
        X_doc2vec.append(dtov_model.docvecs['Real_' + str(num_real1)])
        num_real1 += 1
X_doc2vec = np.array(X_doc2vec)
y_doc2vec = np.array(y_doc2vec)

### Split Test and Training Data

In [48]:
split_test_size = 0.30

X_tr_d2v, X_te_d2v, y_tr_d2v, y_te_d2v = train_test_split(X_doc2vec, y_doc2vec, test_size=split_test_size, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=split_test_size, random_state=42) 
                            # test_size = 0.3 is 30%, 42 is the answer to everything
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=split_test_size, random_state=42) 
                            # test_size = 0.3 is 30%, 42 is the answer to everything

In [49]:
def show_results(model, X_tr, X_te, y_tr, y_te):
    model.fit(X_tr, y_tr)
    
    y_p = model.predict(X_te)
    
    print("Accuracy Score: {}".format(accuracy_score(y_p, y_te)))
    
    print("\nPrecision/Recall/F-Score/Support: \n")
    pprint(precision_recall_fscore_support(y_te, y_p))
    
    print("\nConfusion Matrix:\n")
    pprint(confusion_matrix(y_te, y_p))

### Baseline Model

Baseline model is a multinomial Naive Bayes classifier using Count vectorization for feature extraction.

In [50]:
mnbModel = MultinomialNB()

show_results(mnbModel, X_train, X_test, y_train, y_test)

Accuracy Score: 0.7890583903208838

Precision/Recall/F-Score/Support: 

(array([ 0.78899083,  0.78913043]),
 array([ 0.79958678,  0.77813505]),
 array([ 0.79425346,  0.78359417]),
 array([968, 933]))

Confusion Matrix:

array([[774, 194],
       [207, 726]])


#### Baseline Model using TfidfVectorizer

In [51]:
show_results(mnbModel, X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)

Accuracy Score: 0.7859021567596002

Precision/Recall/F-Score/Support: 

(array([ 0.78593272,  0.78586957]),
 array([ 0.7964876 ,  0.77491961]),
 array([ 0.79117496,  0.78035618]),
 array([968, 933]))

Confusion Matrix:

array([[771, 197],
       [210, 723]])


#### Baseline Model using Doc2Vec Model

In [1]:
# show_results(mnbModel, X_tr_d2v, X_te_d2v, y_tr_d2v, y_te_d2v)

### Logistic Regression Model

#### Penalized Logistic Regression with CountVectorizer

In [53]:
logisticModel = LogisticRegression()

show_results(logisticModel, X_train, X_test, y_train, y_test)

Accuracy Score: 0.8043135192004208

Precision/Recall/F-Score/Support: 

(array([ 0.80595483,  0.802589  ]),
 array([ 0.81095041,  0.79742765]),
 array([ 0.8084449,  0.8      ]),
 array([968, 933]))

Confusion Matrix:

array([[785, 183],
       [189, 744]])


#### Penalized Logistic Regression with TfidfVectorizer

In [54]:
logisticModel = LogisticRegression()

show_results(logisticModel, X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)

Accuracy Score: 0.8001052077853761

Precision/Recall/F-Score/Support: 

(array([ 0.79458918,  0.80620155]),
 array([ 0.81921488,  0.78027867]),
 array([ 0.80671414,  0.79302832]),
 array([968, 933]))

Confusion Matrix:

array([[793, 175],
       [205, 728]])


#### Penalized Logistic Regression with Doc2Vec Model

In [55]:
logisticModel = LogisticRegression()

show_results(logisticModel, X_tr_d2v, X_te_d2v, y_tr_d2v, y_te_d2v)

Accuracy Score: 0.7622304050499737

Precision/Recall/F-Score/Support: 

(array([ 0.76875   ,  0.75557917]),
 array([ 0.76239669,  0.76205788]),
 array([ 0.76556017,  0.7588047 ]),
 array([968, 933]))

Confusion Matrix:

array([[738, 230],
       [222, 711]])


### Random Forest Model

#### Random Forest with CountVectorizer

In [46]:
forestModel = RandomForestClassifier(n_estimators = 100)

show_results(forestModel, X_train, X_test, y_train, y_test)

Accuracy Score: 0.7901104681746449

Precision/Recall/F-Score/Support: 

(array([ 0.79543094,  0.78464819]),
 array([ 0.79132231,  0.78885316]),
 array([ 0.79337131,  0.78674506]),
 array([968, 933]))

Confusion Matrix:

array([[766, 202],
       [197, 736]])


#### Random Forest with TfidfVectorizer

In [58]:
forestModel = RandomForestClassifier(n_estimators = 100)

show_results(forestModel, X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)

Accuracy Score: 0.7837980010520779

Precision/Recall/F-Score/Support: 

(array([ 0.7696031 ,  0.80069124]),
 array([ 0.82128099,  0.7449089 ]),
 array([ 0.7946027 ,  0.77179345]),
 array([968, 933]))

Confusion Matrix:

array([[795, 173],
       [238, 695]])


#### Random Forest with Doc2Vec

In [59]:
forestModel = RandomForestClassifier(n_estimators = 100)

show_results(forestModel, X_tr_d2v, X_te_d2v, y_tr_d2v, y_te_d2v)

Accuracy Score: 0.769594950026302

Precision/Recall/F-Score/Support: 

(array([ 0.76606426,  0.77348066]),
 array([ 0.78822314,  0.75026795]),
 array([ 0.77698574,  0.7616975 ]),
 array([968, 933]))

Confusion Matrix:

array([[763, 205],
       [233, 700]])


### API Architecture for class prediction of single claim

In [60]:
logisticModel = LogisticRegression()
logisticModel.fit(X_train, y_train)
label = {0:'FAKE', 1:'REAL'}
test = ['Loretta Lynch becomes first African-American woman AG.']
test_vector = count_vectorizer.transform(test)
pred = logisticModel.predict(test_vector)
print(label[pred[0]])
proba = np.max(logisticModel.predict_proba(test_vector)) * 100
print(proba)

FAKE
59.5274171211
