### Fake News Classification using TF-IDF

### Please use Google Colab to run thsi code.

### Here, we are using 'Text' paramaeter which has big paragraphs, so it will take huge time to clean the text. 

In [2]:
import pandas as pd
import numpy as np
import nltk

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_csv('/content/drive/MyDrive/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
df = df.dropna()

In [8]:
## Dependent and Independent features

X= df.drop('label', axis=1)
y = df['label']

In [9]:
X.shape

(18285, 4)

In [10]:
y.shape

(18285,)

In [11]:
# copy the data
news = df.copy()

In [12]:
news.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [13]:
news.reset_index(inplace=True)

In [14]:
news['text'][2]

'Why the Truth Might Get You Fired October 29, 2016 \nThe tension between intelligence analysts and political policymakers has always been between honest assessments and desired results, with the latter often overwhelming the former, as in the Iraq War, writes Lawrence Davidson. \nBy Lawrence Davidson \nFor those who might wonder why foreign policy makers repeatedly make bad choices, some insight might be drawn from the following analysis. The action here plays out in the United States, but the lessons are probably universal. \nBack in the early spring of 2003, George W. Bush initiated the invasion of Iraq. One of his key public reasons for doing so was the claim that the country’s dictator, Saddam Hussein, was on the verge of developing nuclear weapons and was hiding other weapons of mass destruction. The real reason went beyond that charge and included a long-range plan for “regime change” in the Middle East. President George W. Bush and Vice President Dick Cheney receive an Oval Off

In [17]:
## Now we will clean the data
import re
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

ps = PorterStemmer()

corpus = []

for i in range(0, len(news)):
    review = re.sub('[^a-zA-Z]', ' ', news['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
corpus[0]

'hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel case know anthoni weiner sext teenag appar littl thing fact matter ch

In [19]:
## Now create the count vectorizer using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()

In [20]:
X.shape

(18285, 5000)

In [21]:
y = news['label']

In [22]:
## Divide the data into train and test 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [23]:
cv.get_feature_names()

['aaron',
 'abandon',
 'abc',
 'abe',
 'abedin',
 'abil',
 'abl',
 'abort',
 'abroad',
 'absenc',
 'absolut',
 'absorb',
 'absurd',
 'abu',
 'abus',
 'academ',
 'academi',
 'acceler',
 'accept',
 'access',
 'access pipelin',
 'accid',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'accord report',
 'account',
 'accumul',
 'accur',
 'accus',
 'achiev',
 'acid',
 'acknowledg',
 'acquir',
 'acr',
 'across',
 'across countri',
 'act',
 'action',
 'activ',
 'activist',
 'actor',
 'actress',
 'actual',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'address',
 'adjust',
 'administr',
 'administr offici',
 'admir',
 'admiss',
 'admit',
 'adopt',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'adversari',
 'advertis',
 'advic',
 'advis',
 'advisor',
 'advisori',
 'advoc',
 'advocaci',
 'affair',
 'affect',
 'affili',
 'affirm',
 'afford',
 'afford care',
 'afford care act',
 'afghan',
 'afghanistan',
 'afraid',
 'africa',
 'african',
 'african american',
 'aftermath',
 'aftern

In [24]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [25]:
count_df = pd.DataFrame(X_train, columns=cv.get_feature_names())
count_df.head()

Unnamed: 0,aaron,abandon,abc,abe,abedin,abil,abl,abort,abroad,absenc,absolut,absorb,absurd,abu,abus,academ,academi,acceler,accept,access,access pipelin,accid,accommod,accompani,accomplish,accord,accord report,account,accumul,accur,accus,achiev,acid,acknowledg,acquir,acr,across,across countri,act,action,...,written,wrong,wrongdo,wrote,wrote twitter,www,xi,yahoo,yard,ye,yeah,year,year ago,year later,year mr,year old,year said,year sinc,yell,yellow,yemen,yesterday,yet,yet anoth,yiannopoulo,yield,york,york citi,york time,yorker,young,young peopl,younger,youth,youtub,zero,zika,zionist,zone,zu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03168,0.0,0.0,0.0,0.0,0.0,0.014363,0.022439,0.013583,0.014503,...,0.018694,0.0,0.0,0.014552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040828,0.0,0.0,0.02411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.017174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016251,0.0,0.0,0.032821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025582,0.0,0.0,0.0247,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.023192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026189,0.0,0.0,0.0,0.0,0.019139,0.020436,...,0.0,0.024738,0.0,0.020504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033653,0.0,0.0,0.0,0.0,0.018248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Machine Learning Model

### Naive Bayes

In [26]:
## Now our dataset is ready. We can apply the algorithms

## Naive Bayes

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

In [27]:
model = classifier.fit(X_train, y_train)

In [28]:
pred = model.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)

In [30]:
print('accuracy is:\n ', accuracy)
print('\n confusion matrix is: ', cm)

accuracy is:
  0.9017499088589136

 confusion matrix is:  [[2949  135]
 [ 404 1998]]


In [31]:
feature_names = cv.get_feature_names()

In [32]:
## Most real words
sorted(zip(model.coef_[0], feature_names), reverse=True)[:20]

[(-5.399972283587459, 'clinton'),
 (-5.510391828260856, 'trump'),
 (-5.692841393345777, 'hillari'),
 (-5.925169399964115, 'us'),
 (-5.945808691430057, 'elect'),
 (-6.143696264294611, 'peopl'),
 (-6.166923671496486, 'vote'),
 (-6.192249745630834, 'state'),
 (-6.216918333668217, 'email'),
 (-6.232457956448439, 'one'),
 (-6.309866492145156, 'hillari clinton'),
 (-6.327324781938726, 'fbi'),
 (-6.3275587604552666, 'would'),
 (-6.334290204768882, 'like'),
 (-6.373370496860565, 'american'),
 (-6.396252660014878, 'time'),
 (-6.411463428528187, 'war'),
 (-6.421828575711264, 'world'),
 (-6.454918767155662, 'year'),
 (-6.4642060350936665, 'octob')]

In [33]:
## Most fake
sorted(zip(model.coef_[0], feature_names))[:5000]

[(-10.93712300355258, 'brief post'),
 (-10.93712300355258, 'follow pam'),
 (-10.93712300355258, 'follow pam key'),
 (-10.93712300355258, 'gold medal'),
 (-10.93712300355258, 'gorsuch'),
 (-10.93712300355258, 'judg gorsuch'),
 (-10.93712300355258, 'key twitter'),
 (-10.93712300355258, 'key twitter pamkeynen'),
 (-10.93712300355258, 'morn brief'),
 (-10.93712300355258, 'mr ail'),
 (-10.93712300355258, 'mr bannon'),
 (-10.93712300355258, 'mr castro'),
 (-10.93712300355258, 'mr christi'),
 (-10.93712300355258, 'mr cruz'),
 (-10.93712300355258, 'mr de'),
 (-10.93712300355258, 'mr flynn'),
 (-10.93712300355258, 'mr kushner'),
 (-10.93712300355258, 'mr roof'),
 (-10.93712300355258, 'mr sander'),
 (-10.93712300355258, 'mr scott'),
 (-10.93712300355258, 'mr session'),
 (-10.93712300355258, 'mr spicer'),
 (-10.93712300355258, 'mr xi'),
 (-10.93712300355258, 'ms trump'),
 (-10.93712300355258, 'pam key'),
 (-10.93712300355258, 'pam key twitter'),
 (-10.93712300355258, 'pamkeynen'),
 (-10.937123003

### Random forest classifier

In [34]:
### Random forest

from sklearn.ensemble import RandomForestClassifier
Rf = RandomForestClassifier(n_estimators=200, criterion='entropy')

In [35]:
rf_model = Rf.fit(X_train, y_train)

In [36]:
rf_pred = rf_model.predict(X_test)

In [37]:
accuracy_rf = accuracy_score(y_test, rf_pred)
cm_rf = confusion_matrix(y_test, rf_pred)

In [38]:
print('accuracy is:\n ', accuracy_rf)
print('\n confusion matrix is: ', cm_rf)

accuracy is:
  0.943127962085308

 confusion matrix is:  [[2989   95]
 [ 217 2185]]
