# Fake News Classifier Using LSTM

In [1]:
import pandas as pd

In [2]:
# Read the training CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific, so the notebook
# only runs where this exact location exists.
df = pd.read_csv(filepath_or_buffer='C:/Users/User/Desktop/19Nov/21/NLP/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


We use 'title' as the independent variable and predict 'label', which indicates whether the news is fake or not.

In [3]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [4]:
# Preview the first ten remaining rows.
df.head(n=10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0
10,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0
11,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0


In [5]:
# Rows containing NaN (e.g. row 8) were dropped, leaving gaps in the index,
# so renumber it from 0. The previous index is kept as a new 'index' column.
df = df.reset_index()

In [6]:
# Preview the first ten rows with the renumbered index.
df.head(n=10)

Unnamed: 0,index,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
7,9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0
8,10,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0
9,11,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0


In [7]:
# Separate the target column (y) from the remaining feature columns (X).

y = df['label']
X = df.drop(columns='label')

In [8]:
# Show the feature and target shapes, one per line.
print(X.shape, y.shape, sep='\n')

(18285, 5)
(18285,)


In [9]:
# Work on an independent (deep) copy so X itself stays untouched.
msg = X.copy(deep=True)

# 1. Data Preprocessing - Data cleaning

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [11]:
# Clean every title: replace non-letters with spaces, lower-case, tokenise on
# whitespace, drop English stop words and stem what remains. Each cleaned
# title is stored in `corpus` as one space-joined string.
# Requires the NLTK 'stopwords' corpus to be available locally.
ps = PorterStemmer()

# Load the stop words once and keep them in a set: calling
# stopwords.words('english') for every token re-reads the corpus each time
# and does a linear list search, which dominates the run time of this cell.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the title values directly so the loop does not depend on the
# index labels being exactly 0..n-1.
for title in msg['title']:
    tokens = non_letters.sub(' ', title).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

# 2. One hot representation

In [12]:
from tensorflow.keras.preprocessing.text import one_hot

In [13]:
# Vocabulary size handed to one_hot and to the Embedding layer.
vocab_size = 5000

In [14]:
# Encode each cleaned title as a list of integer word ids below vocab_size.
# NOTE(review): one_hot appears to rely on Python's built-in hash by default,
# so the ids may differ between interpreter sessions — confirm if stable ids matter.
onehot_repr = [one_hot(sentence, vocab_size) for sentence in corpus]

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# Sequence length passed to pad_sequences (maxlen) and to the Embedding layer (input_length).
set_length = 20

In [17]:
# Left-pad every encoded title with zeros so all sequences have set_length ids.
embedded = pad_sequences(
    onehot_repr,
    maxlen=set_length,
    padding='pre',
)
embedded

array([[   0,    0,    0, ...,   88, 3771, 3070],
       [   0,    0,    0, ..., 3700, 3302, 1779],
       [   0,    0,    0, ..., 1653, 1102, 2723],
       ...,
       [   0,    0,    0, ..., 4727, 1662, 1288],
       [   0,    0,    0, ..., 2563, 2489, 4458],
       [   0,    0,    0, ...,  651,  451,   40]])

In [18]:
embedded[0]  # the first title is now a fixed-length sequence of 20 ids

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 4507,
       2289, 4460, 2021, 3592,  976, 2439,   88, 3771, 3070])

In [19]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [20]:
# Build the classifier: Embedding -> LSTM -> Dense(sigmoid).

dim = 40  # length of the vector learned for each word id
model = Sequential()
# Turns each of the set_length word ids of a title into a dim-dimensional vector.
model.add(Embedding(vocab_size, dim, input_length = set_length))
# A single LSTM layer with 100 units consumes the embedded sequence.
model.add(LSTM(100))
# One sigmoid unit: this is binary classification, so the single output is
# the predicted probability of label 1.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit an extra "None" line).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Display the target Series (pandas truncates the output for long Series).
y

0        1
1        0
2        1
3        1
4        1
        ..
18280    0
18281    0
18282    0
18283    1
18284    1
Name: label, Length: 18285, dtype: int64

In [22]:
import numpy as np

# Copy the padded sequences and the labels into plain NumPy arrays.
X_final, y_final = np.array(embedded), np.array(y)

In [23]:
# Show the shapes of the final feature matrix and label vector, one per line.
print(X_final.shape, y_final.shape, sep='\n')

(18285, 20)
(18285,)


In [24]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    random_state=42,
    test_size=0.33,
)

## Model Training

In [25]:
# Train for 10 epochs with mini-batches of 64 samples.
# NOTE(review): the held-out test split is also used as validation data here.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1cd6b54b808>

In [26]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a model with a
# single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5, which yields an int32 array of 0/1 labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")



In [27]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[3053,  366],
       [ 203, 2413]], dtype=int64)

In [28]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9057166528583265