# Fake News Detection

In [1]:
# Importing necessary libraries

import os
import pandas as pd
import numpy as np
import sklearn
import nltk
import warnings
warnings.filterwarnings("ignore")

### Importing the datasets

In [2]:
df = os.chdir("D:\\Naveen\\Interview Assignments\\TomTom")

In [3]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [5]:
#check shape

print(train.shape, test.shape)

(20800, 5) (5200, 4)


In [6]:
#Checking for missing values

print(train.info())
print("-------------------")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB
None
-------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB
None


There are a lot of missing values. But they are all unique headlines and names of authors so we can fill all of them with empty values as they won't help in prediction. 

### Data Pre-processing

In [7]:
test=test.fillna(' ')
train=train.fillna(' ')
test['total']=test['title']+' '+test['author']+test['text']
train['total']=train['title']+' '+train['author']+train['text']

In [28]:
print(train.info())
print("-------------------")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20800 non-null  object
 2   author  20800 non-null  object
 3   text    20800 non-null  object
 4   label   20800 non-null  int64 
 5   total   20800 non-null  object
dtypes: int64(2), object(4)
memory usage: 975.1+ KB
None
-------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5200 non-null   object
 2   author  5200 non-null   object
 3   text    5200 non-null   object
 4   total   5200 non-null   object
dtypes: int64(1), object(4)
memory usage: 203.2+ KB
None


There are no missing values now.

### TF-IDF

This is used to determine how important a word is to the document. Here we use "smooth_idf = False" because the terms that occur in all the records in the training set should not be entirely ignored. 

Reference: 

1. https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

### CountVectorizer

This is used to encode words into integers. This converts the text records into token counts

Reference: 

1. https://www.educative.io/edpresso/countvectorizer-in-python
2. https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)

In [9]:
targets = train['label'].values
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = transformer.fit_transform(test_counts)

In [10]:
#splitting into training and test set 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)

### Fitting into a model

#### Logistic Regression 

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[2398  166]
 [ 120 2516]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      2564
           1       0.94      0.95      0.95      2636

    accuracy                           0.94      5200
   macro avg       0.95      0.94      0.94      5200
weighted avg       0.95      0.94      0.94      5200



In [12]:
#cross validation - 5 fold

from sklearn.model_selection import cross_val_score
print(cross_val_score(logreg, X_train, y_train, cv=5))

[0.93814103 0.94551282 0.94839744 0.94134615 0.94647436]


#### Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train, y_train)

y_pred = dec_tree.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[2460  104]
 [  89 2547]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2564
           1       0.96      0.97      0.96      2636

    accuracy                           0.96      5200
   macro avg       0.96      0.96      0.96      5200
weighted avg       0.96      0.96      0.96      5200



In [14]:
#cross validation - 5 fold

print(cross_val_score(dec_tree, X_train, y_train, cv=5))

[0.96057692 0.9625     0.96153846 0.96346154 0.96570513]


#### Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 200, min_samples_leaf = 3, max_features = 0.5, n_jobs = -1)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[2490   74]
 [  63 2573]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2564
           1       0.97      0.98      0.97      2636

    accuracy                           0.97      5200
   macro avg       0.97      0.97      0.97      5200
weighted avg       0.97      0.97      0.97      5200



In [16]:
#cross validation - 5 fold

print(cross_val_score(rfc, X_train, y_train, cv=5))

[0.97403846 0.96826923 0.97307692 0.97628205 0.975     ]


#### Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

y_pred = mnb.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[2561    3]
 [1126 1510]]
              precision    recall  f1-score   support

           0       0.69      1.00      0.82      2564
           1       1.00      0.57      0.73      2636

    accuracy                           0.78      5200
   macro avg       0.85      0.79      0.77      5200
weighted avg       0.85      0.78      0.77      5200



In [18]:
#cross validation - 5 fold

print(cross_val_score(mnb, X_train, y_train, cv=5))

[0.77532051 0.75833333 0.77532051 0.77211538 0.77307692]


#### KNN Classifier

In [19]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1780  784]
 [ 354 2282]]
              precision    recall  f1-score   support

           0       0.83      0.69      0.76      2564
           1       0.74      0.87      0.80      2636

    accuracy                           0.78      5200
   macro avg       0.79      0.78      0.78      5200
weighted avg       0.79      0.78      0.78      5200



In [20]:
#cross validation - 5 fold

print(cross_val_score(knn, X_train, y_train, cv=5))

[0.76442308 0.78012821 0.79935897 0.7849359  0.78365385]


### Checking accuracy scores of test set

In [21]:
#Logistic Regression
acc_logreg = logreg.score(X_test, y_test)*100
print(acc_logreg)

94.5


In [22]:
#Decision Tree Classifier
acc_dec_tree = dec_tree.score(X_test, y_test)*100
print(acc_dec_tree)

96.28846153846153


In [23]:
#Random Forest Classifier
acc_rfc = rfc.score(X_test, y_test)*100
print(acc_rfc)

97.36538461538461


In [24]:
#Naive Bayes
acc_mnb = mnb.score(X_test, y_test)*100
print(acc_mnb)

78.28846153846155


In [25]:
#KNN Classifier
acc_knn = knn.score(X_test, y_test)*100
print(acc_knn)

78.11538461538461


### Sorting according to test data accuracy

In [26]:
models = pd.DataFrame({
'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'KNN', 'Naive Bayes'],
'Score': [acc_logreg, acc_dec_tree, acc_rfc, acc_knn, acc_mnb]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,Random Forest,97.365385
1,Decision Tree,96.288462
0,Logistic Regression,94.5
4,Naive Bayes,78.288462
3,KNN,78.115385


### Predicted output

In [27]:
test_id = test["id"]
submission = pd.DataFrame({"id": test_id, "label": rfc.predict(X_test)})
submission.to_csv('D:\\Naveen\\Interview Assignments\\TomTom\\submission.csv', index=False)