# Fake news detection
## Logistic regression, Naive Bayes & Support Vector Machine with TF-IDF

### Setup

In [1]:
import pandas as pd
import numpy as np
import time
import eli5
import string
import re

from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train.columns = ["text", "label"]
test.columns = ["label", "text"]

train = train[train['label'] != "Label"]
test = test[test['label'] != "Label"]

test = test[["text", "label"]]

In [3]:
# Remove non-ACSII characters

train["text"] = train["text"].apply(lambda x: x.encode("ascii", errors="ignore").decode())
test["text"] = test["text"].apply(lambda x: x.encode("ascii", errors="ignore").decode())

In [4]:
train.head()

Unnamed: 0,text,label
0,Report: War Looms - Hundreds of American Troop...,0
1,I walked through a preview of the artist Pedro...,1
2,Neo-Con Hypocrites Leverage Human Rights Again...,0
3,"A giant beef roast for the holidays is, for ma...",1
4,Seeking to appeal to guests desire for new exp...,1


In [5]:
x_trainval, y_trainval = train["text"], train["label"]
x_test, y_test = test["text"], test["label"]

In [6]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(x_trainval, y_trainval, test_size = 0.2, random_state = 123)

In [7]:
print(len(x_train), len(y_train))
print(len(x_val), len(y_val))
print(len(x_test), len(y_test))

63617 63617
15905 15905
7011 7011


### Naive-Bayes classification

In [8]:
pipe1 = Pipeline([('vect', TfidfVectorizer()), ('model', MultinomialNB())])
start_time = time.time()
print("Fitting started...")
model_nb = pipe1.fit(x_train, y_train)
print("Fitting took")
print("--- %s seconds ---" % (time.time() - start_time))

Fitting started...
Fitting took
--- 78.26141571998596 seconds ---


In [9]:
nb_pred = model_nb.predict(x_val)
target = y_val

print("Accuracy of Naive Bayes Classifier: {}%".format(round(accuracy_score(target, nb_pred) * 100, 2)))
print("\nConfusion Matrix of Naive Bayes Classifier:\n")
print(confusion_matrix(target, nb_pred))
print("\nClassification Report of Naive Bayes Classifier:\n")
print(classification_report(target, nb_pred, digits = 4))

Accuracy of Naive Bayes Classifier: 88.85%

Confusion Matrix of Naive Bayes Classifier:

[[6272 1520]
 [ 254 7859]]

Classification Report of Naive Bayes Classifier:

              precision    recall  f1-score   support

           0     0.9611    0.8049    0.8761      7792
           1     0.8379    0.9687    0.8986      8113

    accuracy                         0.8885     15905
   macro avg     0.8995    0.8868    0.8873     15905
weighted avg     0.8983    0.8885    0.8876     15905



In [10]:
nb_pred = model_nb.predict(x_test)
target = y_test

print("Accuracy of Naive Bayes Classifier: {}%".format(round(accuracy_score(target, nb_pred) * 100, 2)))
print("\nConfusion Matrix of Naive Bayes Classifier:\n")
print(confusion_matrix(target, nb_pred))
print("\nClassification Report of Naive Bayes Classifier:\n")
print(classification_report(target, nb_pred, digits = 4))

Accuracy of Naive Bayes Classifier: 84.55%

Confusion Matrix of Naive Bayes Classifier:

[[1584  658]
 [ 425 4344]]

Classification Report of Naive Bayes Classifier:

              precision    recall  f1-score   support

           0     0.7885    0.7065    0.7452      2242
           1     0.8685    0.9109    0.8892      4769

    accuracy                         0.8455      7011
   macro avg     0.8285    0.8087    0.8172      7011
weighted avg     0.8429    0.8455    0.8431      7011



In [11]:
print(pipe1)
clf = pipe1['model']
vec = pipe1['vect']

eli5.show_weights(clf, vec=vec, top=20)

Pipeline(steps=[('vect', TfidfVectorizer()), ('model', MultinomialNB())])


### Support Vector Classification

In [12]:
pipe2 = Pipeline([('vect', TfidfVectorizer()), ('model', LinearSVC())])

print("Fitting started...")
model_svc = pipe2.fit(x_train, y_train)
print("Fitting took")
print("--- %s seconds ---" % (time.time() - start_time))

Fitting started...
Fitting took
--- 211.14234328269958 seconds ---


In [13]:
svc_pred = model_svc.predict(x_val)
target = y_val

print("Accuracy of SVM Classifier: {}%".format(round(accuracy_score(target, svc_pred) * 100, 2)))
print("\nConfusion Matrix of SVM Classifier:\n")
print(confusion_matrix(target, svc_pred))
print("\nClassification Report of SVM Classifier:\n")
print(classification_report(target, svc_pred, digits=4))

Accuracy of SVM Classifier: 97.04%

Confusion Matrix of SVM Classifier:

[[7584  208]
 [ 263 7850]]

Classification Report of SVM Classifier:

              precision    recall  f1-score   support

           0     0.9665    0.9733    0.9699      7792
           1     0.9742    0.9676    0.9709      8113

    accuracy                         0.9704     15905
   macro avg     0.9703    0.9704    0.9704     15905
weighted avg     0.9704    0.9704    0.9704     15905



In [14]:
svc_pred = model_svc.predict(x_test)
target = y_test

print("Accuracy of SVM Classifier: {}%".format(round(accuracy_score(target, svc_pred) * 100, 2)))
print("\nConfusion Matrix of SVM Classifier:\n")
print(confusion_matrix(target, svc_pred))
print("\nClassification Report of SVM Classifier:\n")
print(classification_report(target, svc_pred, digits=4))

Accuracy of SVM Classifier: 87.66%

Confusion Matrix of SVM Classifier:

[[1849  393]
 [ 472 4297]]

Classification Report of SVM Classifier:

              precision    recall  f1-score   support

           0     0.7966    0.8247    0.8104      2242
           1     0.9162    0.9010    0.9086      4769

    accuracy                         0.8766      7011
   macro avg     0.8564    0.8629    0.8595      7011
weighted avg     0.8780    0.8766    0.8772      7011



In [15]:
print(pipe2)
clf_svc = pipe2['model']
vec_svc = pipe2['vect']

eli5.show_weights(clf_svc, vec=vec_svc, top=30)

Pipeline(steps=[('vect', TfidfVectorizer()), ('model', LinearSVC())])


Weight?,Feature
+6.220,related
+4.673,photograph
+4.623,but
+4.035,editor
+3.480,theguardian
+3.456,mr
+3.364,taiwan
+3.064,calif
+2.970,its
+2.887,guardian


In [16]:
test['predicted_label_svc'] = svc_pred

svc_misclassified_true_as_fake = test[(test['label'] != test['predicted_label_svc']) & (test['label'] == "1")]
svc_misclassified_fake_as_true = test[(test['label'] != test['predicted_label_svc']) & (test['label'] == "0")]

In [17]:
svc_misclassified_true_as_fake.head()

Unnamed: 0,text,label,predicted_label_svc
2,"Debt: $20, 000, Source: College, credit cards,...",1,0
27,When Elon Musk isnt outlining plans to use his...,1,0
48,Jurors have awarded a University of Virginia a...,1,0
57,Authorities perplexed as more than 100 members...,1,0
59,Researchers said they believe they have locate...,1,0


In [18]:
svc_misclassified_fake_as_true.head()

Unnamed: 0,text,label,predicted_label_svc
4774,In the aftermath of Scotlands no vote in the ...,0,1
4782,After being banned by the N.B.A. Tuesday afte...,0,1
4787,Fresh from the 2012 Republican National Conven...,0,1
4794,Moments after President Obama said he would a...,0,1
4797,Speaker of the House John Boehner (R-Ohio) sai...,0,1


### Logistic Regression classification

In [19]:
pipe3 = Pipeline([('vect', TfidfVectorizer()), ('model', LogisticRegression())])

print("Fitting started...")
model_lr = pipe3.fit(x_train, y_train)
print("Fitting took")
print("--- %s seconds ---" % (time.time() - start_time))

Fitting started...
Fitting took
--- 363.49492835998535 seconds ---


In [20]:
lr_pred = model_lr.predict(x_val)
target = y_val

print("Accuracy of Logistic Regression Classifier: {}%".format(round(accuracy_score(target, lr_pred) * 100, 2)))
print("\nConfusion Matrix of Logistic Regression Classifier:\n")
print(confusion_matrix(target, lr_pred))
print("\nCLassification Report of Logistic Regression Classifier:\n")
print(classification_report(target, lr_pred, digits=4))

Accuracy of Logistic Regression Classifier: 96.04%

Confusion Matrix of Logistic Regression Classifier:

[[7533  259]
 [ 371 7742]]

CLassification Report of Logistic Regression Classifier:

              precision    recall  f1-score   support

           0     0.9531    0.9668    0.9599      7792
           1     0.9676    0.9543    0.9609      8113

    accuracy                         0.9604     15905
   macro avg     0.9603    0.9605    0.9604     15905
weighted avg     0.9605    0.9604    0.9604     15905



In [21]:
lr_pred = model_lr.predict(x_test)
target = y_test

print("Accuracy of Logistic Regression Classifier: {}%".format(round(accuracy_score(target, lr_pred) * 100, 2)))
print("\nConfusion Matrix of Logistic Regression Classifier:\n")
print(confusion_matrix(target, lr_pred))
print("\nCLassification Report of Logistic Regression Classifier:\n")
print(classification_report(target, lr_pred, digits=4))

Accuracy of Logistic Regression Classifier: 87.25%

Confusion Matrix of Logistic Regression Classifier:

[[1859  383]
 [ 511 4258]]

CLassification Report of Logistic Regression Classifier:

              precision    recall  f1-score   support

           0     0.7844    0.8292    0.8062      2242
           1     0.9175    0.8928    0.9050      4769

    accuracy                         0.8725      7011
   macro avg     0.8509    0.8610    0.8556      7011
weighted avg     0.8749    0.8725    0.8734      7011



In [22]:
print(pipe3)
clf_lr = pipe3['model']
vec_lr = pipe3['vect']

eli5.show_weights(clf_lr, vec=vec_lr, top=30)

Pipeline(steps=[('vect', TfidfVectorizer()), ('model', LogisticRegression())])


Weight?,Feature
+11.106,but
+10.288,mr
+8.828,related
+8.062,in
+7.509,photograph
+7.054,its
+7.010,on
+6.770,taiwan
+6.286,says
+5.681,guardian


In [23]:
test['predicted_label_lr'] = lr_pred

lr_misclassified_true_as_fake = test[(test['label'] != test['predicted_label_lr']) & (test['label'] == "1")]
lr_misclassified_fake_as_true = test[(test['label'] != test['predicted_label_lr']) & (test['label'] == "0")]

In [24]:
lr_misclassified_true_as_fake.head()

Unnamed: 0,text,label,predicted_label_svc,predicted_label_lr
2,"Debt: $20, 000, Source: College, credit cards,...",1,0,0
8,The FBI has arrested a National Security Agenc...,1,1,0
11,Name: Pamela Anderson. Age: 49. Occupation: De...,1,1,0
24,What does Gary Johnson know? After yet another...,1,1,0
27,When Elon Musk isnt outlining plans to use his...,1,0,0


In [25]:
lr_misclassified_fake_as_true.head()

Unnamed: 0,text,label,predicted_label_svc,predicted_label_lr
4774,In the aftermath of Scotlands no vote in the ...,0,1,1
4782,After being banned by the N.B.A. Tuesday afte...,0,1,1
4784,Former Vice-President Dick Cheney broke his si...,0,0,1
4787,Fresh from the 2012 Republican National Conven...,0,1,1
4788,In the latest publicity coup for the Afghan in...,0,0,1


### Visualisation

In [26]:
# Example where SVC labelled a True article as Fake

eli5.show_prediction(clf_svc, svc_misclassified_true_as_fake['text'].values[0], vec=vec_svc)

Contribution?,Feature
0.546,<BIAS>
-0.196,Highlighted in text (sum)


In [27]:
# Example where LR labelled True article as Fake

eli5.show_prediction(clf_lr, lr_misclassified_true_as_fake['text'].values[1], vec=vec_lr)

Contribution?,Feature
1.571,<BIAS>
-1.335,Highlighted in text (sum)


In [28]:
# Example where SVC labelled a Fake article as True

eli5.show_prediction(clf_svc, svc_misclassified_fake_as_true['text'].values[3], vec=vec_svc)

Contribution?,Feature
0.847,Highlighted in text (sum)
-0.546,<BIAS>


In [29]:
# Example where LR labelled Fake article as True

eli5.show_prediction(clf_lr, lr_misclassified_fake_as_true['text'].values[4], vec=vec_lr)

Contribution?,Feature
2.147,Highlighted in text (sum)
-1.571,<BIAS>
