# Fake News Detection

**Author**: Marcelo Scatena
***
March 2022

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [13]:
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [25]:
from wordcloud import WordCloud

In [76]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
def print_cm_with_labels(y_true, 
                         y_pred):
    '''
    Takes the true values and predicted values of a classifier and 
    plots a confusion matrix (normalized by predictions) using 
    a list of given display labels.
    '''
    disp_labels = ['True', 'Fake']
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=disp_labels)

    fig, ax = plt.subplots(figsize=(6,6))
    disp.plot(ax=ax)
    ax.grid(False)
    disp.ax_.set_xticklabels(disp_labels)
    return None

In [343]:
from sklearn.metrics import accuracy_score
import numpy as np

## Vectorization

In [2]:
%store -r df_train_clean
%store -r df_val_clean
%store -r df_test_clean

In [3]:
X_train_clean = df_train_clean['statement']
y_train_clean = df_train_clean['label']
X_val_clean = df_val_clean['statement']
y_val_clean = df_val_clean['label']
X_test_clean = df_test_clean['statement']
y_test_clean = df_test_clean['label']

In [4]:
# Import Word2Vec from Python library gensim and apply to processed train data
from gensim.models import Word2Vec
from nltk import word_tokenize

X_train_w2v = X_train_clean.map(word_tokenize)

w2v_model = Word2Vec(X_train_w2v, window=5, min_count=1, workers=4)
w2v_model.train(X_train_w2v, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)

wv = w2v_model.wv

In [5]:
# Create a function to show most similar words, or are closest to the given word
def show_most_similar(word, length=10):
    return wv.most_similar(word, topn=length)

#### Examinig Vectors

In [57]:
show_most_similar('trump')

[('writes', 0.9119263291358948),
 ('payne', 0.8876219987869263),
 ('analyst', 0.8782433271408081),
 ('saysabout', 0.8718438148498535),
 ('insulted', 0.8634847402572632),
 ('affordability', 0.8589801788330078),
 ('sterling', 0.8494797348976135),
 ('clipper', 0.8407131433486938),
 ('zandi', 0.8053166270256042),
 ('said', 0.8003408908843994)]

In [7]:
show_most_similar('war')

[('iraq', 0.9710062742233276),
 ('sept', 0.9674686789512634),
 ('modern', 0.9636507630348206),
 ('longest', 0.9611484408378601),
 ('troop', 0.9578167796134949),
 ('admit', 0.9552557468414307),
 ('afghanistan', 0.9550725817680359),
 ('1800s', 0.9549057483673096),
 ('producing', 0.9547255635261536),
 ('reagan', 0.9537568092346191)]

In [8]:
show_most_similar('election')

[('confirmed', 0.976656436920166),
 ('nominated', 0.9764069318771362),
 ('vacancy', 0.9699544906616211),
 ('november', 0.9676817655563354),
 ('elected', 0.9673665761947632),
 ('beat', 0.9670543074607849),
 ('came', 0.9668200612068176),
 ('2000', 0.9646468162536621),
 ('victor', 0.9632003903388977),
 ('reed', 0.9631231427192688)]

In [9]:
show_most_similar('president')

[('barack', 0.9574108123779297),
 ('administration', 0.9403015375137329),
 ('george', 0.9343157410621643),
 ('bush', 0.9321780800819397),
 ('obama', 0.9295011758804321),
 ('w', 0.92638099193573),
 ('rescind', 0.920987606048584),
 ('august', 0.9138213396072388),
 ('unilateral', 0.913422167301178),
 ('obamas', 0.9127466082572937)]

In [10]:
show_most_similar('health')

[('preventive', 0.9731147885322571),
 ('uncompensated', 0.9614614248275757),
 ('affordable', 0.9382889866828918),
 ('ration', 0.9203252792358398),
 ('affairshealth', 0.9118868708610535),
 ('act', 0.9105225205421448),
 ('slapped', 0.9076265096664429),
 ('wallace', 0.8982751965522766),
 ('soar', 0.8945285677909851),
 ('premium', 0.8909482955932617)]

In [61]:
wv.most_similar(positive=['health'], negative=['health'])

[('shortcoming', 0.0),
 ('frisked', 0.0),
 ('ernesto', 0.0),
 ('smear', 0.0),
 ('bled', 0.0),
 ('bound', 0.0),
 ('gemma', 0.0),
 ('stranded', 0.0),
 ('materialize', 0.0),
 ('mature', 0.0)]

In [19]:
X_train_w2v

0.0        [say, annies, list, political, group, support,...
1.0        [decline, coal, start, started, natural, gas, ...
2.0        [hillary, clinton, agrees, john, mccain, votin...
3.0        [health, care, reform, legislation, likely, ma...
4.0               [economic, turnaround, started, end, term]
                                 ...                        
10264.0    [larger, number, shark, attack, florida, case,...
10265.0    [democrat, become, party, atlanta, metro, area...
10266.0    [say, alternative, social, security, operates,...
10267.0    [lifting, u, cuban, embargo, allowing, travel,...
10268.0    [department, veteran, affair, manual, telling,...
Name: statement, Length: 10240, dtype: object

In [23]:
from nltk.collocations import *

In [25]:
# from nltk.collocations import *
# from nltk import TweetTokenizer

# tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

# X_train_twt = tweet_tokenizer.tokenize(' '.join(X_train_w2v.apply(lambda x: ' '.join(x))))

In [31]:
X_train_twt2 = word_tokenize(' '.join(X_train_w2v.apply(lambda x: ' '.join(x))))

In [35]:
# Use BigramAssocMeasures from nltk
bigram_measures = nltk.collocations.BigramAssocMeasures()

finder = BigramCollocationFinder.from_words(X_train_twt2)
bigrams = finder.score_ngrams(bigram_measures.raw_freq)

In [36]:
# Display the top 50 bigrams
bigrams[:50]

[(('health', 'care'), 0.0035441821349192046),
 (('barack', 'obama'), 0.002574466441829931),
 (('united', 'state'), 0.002565884887023831),
 (('hillary', 'clinton'), 0.0016562400775772554),
 (('president', 'barack'), 0.0013473041045576639),
 (('donald', 'trump'), 0.0011585098988234688),
 (('social', 'security'), 0.0011241836795990697),
 (('president', 'obama'), 0.00111560212479297),
 (('scott', 'walker'), 0.0010898574603746707),
 (('health', 'insurance'), 0.0009611341382831742),
 (('mitt', 'romney'), 0.000926807919058775),
 (('rhode', 'island'), 0.0008238292613855779),
 (('tax', 'cut'), 0.0008066661517733783),
 (('say', 'president'), 0.0007980845969672786),
 (('last', 'year'), 0.0007895030421611787),
 (('year', 'say'), 0.0007551768229367797),
 (('say', 'hillary'), 0.0007294321585184804),
 (('new', 'jersey'), 0.0007208506037123806),
 (('federal', 'government'), 0.0006951059392940813),
 (('illegal', 'immigrant'), 0.0006521981652635824),
 (('income', 'tax'), 0.0006521981652635824),
 (('say'

In [44]:
w2v_model

<gensim.models.word2vec.Word2Vec at 0x1dfe432d160>

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(w2v_model.wv, y_train_clean)
Accuracy = logreg.score(X_val_clean, y_val_clean)
print(Accuracy*100)

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(w2v_model.wv, y_train_clean)
Accuracy = logreg.score(X_val_clean, y_val_clean)
print(Accuracy*100)

In [53]:
len(wv.vectors)

10639

In [54]:
len(y_train_clean)

10240

In [62]:
X_train_clean

0.0        say annies list political group support third ...
1.0        decline coal start started natural gas took st...
2.0        hillary clinton agrees john mccain voting give...
3.0        health care reform legislation likely mandate ...
4.0                     economic turnaround started end term
                                 ...                        
10264.0    larger number shark attack florida case voter ...
10265.0       democrat become party atlanta metro area black
10266.0    say alternative social security operates galve...
10267.0         lifting u cuban embargo allowing travel cuba
10268.0    department veteran affair manual telling veter...
Name: statement, Length: 10240, dtype: object

In [63]:
target = y_train_clean
data = X_train_clean.map(word_tokenize).values

In [66]:
total_vocabulary = set(word for headline in data for word in headline)

In [68]:
len(total_vocabulary)
print('There are {} unique tokens in the dataset.'.format(len(total_vocabulary)))

There are 10639 unique tokens in the dataset.


In [72]:
import numpy as np

In [73]:
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [75]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [110]:
w2vect = W2vVectorizer(glove)

In [115]:
a = w2vect.transform(df_train_clean['statement'])

In [117]:
a.shape

(10240, 50)

In [93]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rf =  Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
              ('Random Forest', RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=22, ccp_alpha=0.1, verbose=True))])
svc = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
              ('Logistic Regression', LogisticRegression(C=0.1, solver='liblinear'))])
sgd = Pipeline([('Vectorizer', W2vVectorizer(glove)),
                ('Stochastic Gradient Descent', SGDClassifier(fit_intercept=False, early_stopping=True, alpha=0.01, loss='log'))])

In [94]:
models = [('Random Forest', rf),
          ('Support Vector Machine', svc),
          ('Logistic Regression', lr),
          ('Stochastic Gradient Descent', sgd)]

In [95]:
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [96]:
scores

[('Random Forest', 0.6447265625),
 ('Support Vector Machine', 0.64541015625),
 ('Logistic Regression', 0.6480468749999999),
 ('Stochastic Gradient Descent', 0.64658203125)]