<a href="https://colab.research.google.com/github/mddanish72/Fake_News_Classifier_Using_LSTM/blob/main/Fake_News_Classifier_Using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Dataset: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification

Steps:

1. Dataset
2. Independent and dependent features
3. Cleaning the data
4. Fix a sentence length to fix the input
5. One hot representation, Embedding layer
6. LSTM neural Network

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df=pd.read_csv('/content/drive/My Drive/Datasets/WELFake_Dataset.csv')
df.head()

Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [2]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,558
text,39
label,0


In [3]:
df.shape

(72134, 4)

In [4]:
# Drop the null values

df=df.dropna()

In [5]:
# Get the independent features

x=df.drop('label',axis=1)
x.head()

Unnamed: 0.1,Unnamed: 0,title,text
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ..."
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will..."
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...


In [6]:
# Get the dependent feature

y=df['label']
y.head()

Unnamed: 0,label
0,1
2,1
3,0
4,1
5,1


In [7]:
print(x.shape,y.shape)

(71537, 3) (71537,)


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [9]:
# Vocabulary Size
voc_size=5000

In [10]:
messages=x.copy()
messages.reset_index(drop=True,inplace=True)
messages.head()

Unnamed: 0.1,Unnamed: 0,title,text
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...
1,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ..."
2,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...
3,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will..."
4,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...


In [11]:
!pip install nltk



In [12]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
## Data preprocessing

from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
corpus=[]
for i in range(0,len(messages)):
  review=re.sub('[^a-zA-Z]',' ',messages['title'][i])
  review=review.lower()
  review=review.split()
  clean_words=[]
  for word in review:
    if word not in set(stopwords.words('english')):
      root=ps.stem(word)
      clean_words.append(root)
  review=' '.join(clean_words)
  corpus.append(review)

corpus


['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video',
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video',
 'bobbi jindal rais hindu use stori christian convers woo evangel potenti bid',
 'satan russia unv imag terrifi new supernuk western world take notic',
 'time christian group sue amazon splc design hate group',
 'dr ben carson target ir never audit spoke nation prayer breakfast',
 'hous intel chair trump russia fake stori evid anyth video',
 'sport bar owner ban nfl game show true american sport like speak rural america video',
 'latest pipelin leak underscor danger dakota access pipelin',
 'gop senat smack punchabl alt right nazi internet',
 'may brexit offer would hurt cost eu citizen eu parliament',
 'schumer call trump appoint offici overse puerto rico relief',
 'watch hilari ad call question health age clinton crime famili boss',
 'chang expect espn polit agenda despit huge subscrib declin breitbart'

In [14]:
# One hot encoding

from tensorflow.keras.preprocessing.text import one_hot

onehot_repr=[]
for i in corpus:
  encoded=one_hot(i,voc_size)
  onehot_repr.append(encoded)

onehot_repr

[[2466, 4195, 2911, 1856, 1408, 3330, 3785, 3906, 413, 1191, 4613, 423],
 [340,
  4315,
  3074,
  3639,
  4107,
  3787,
  1710,
  1909,
  1617,
  1290,
  3691,
  96,
  2267,
  423],
 [3690, 4001, 3481, 2178, 2446, 59, 4748, 1316, 1924, 1682, 3335, 138],
 [1175, 3564, 366, 574, 4503, 1319, 4822, 2796, 2251, 2437, 4326],
 [2181, 4748, 870, 3777, 2171, 4415, 2985, 380, 870],
 [1218, 2467, 1659, 1010, 2884, 950, 4948, 2959, 136, 1052, 2503],
 [4945, 569, 483, 4307, 3564, 816, 59, 3131, 1411, 423],
 [501,
  652,
  4498,
  2240,
  3266,
  1236,
  4806,
  2995,
  1718,
  501,
  2261,
  810,
  1944,
  1877,
  423],
 [1167, 4246, 3067, 242, 3876, 3580, 3652, 4246],
 [3890, 4054, 3846, 1996, 255, 2137, 1901, 187],
 [1411, 1010, 4536, 747, 1891, 4598, 2533, 1223, 2533, 4006],
 [1220, 1634, 4307, 4160, 1467, 433, 3513, 376, 4341],
 [573, 1133, 1479, 1634, 2057, 2055, 4439, 783, 2349, 3964, 4230],
 [506, 3437, 4525, 2978, 4351, 1582, 3341, 2988, 4916, 3218],
 [4634, 593, 1276, 1847, 2266, 4945, 467

In [15]:
## pre padding

sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 1191 4613  423]
 [   0    0    0 ...   96 2267  423]
 [   0    0    0 ... 1682 3335  138]
 ...
 [   0    0    0 ... 2244 1757  441]
 [   0    0    0 ... 4546 2589 4514]
 [   0    0    0 ...   78  783 3555]]


In [16]:
# Creating the model

embedding_vector_features=40 # Feature representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_shape=(sent_length,)))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

  super().__init__(**kwargs)


None


In [17]:
print(embedded_docs.shape,y.shape)

(71537, 20) (71537,)


In [18]:
import numpy as np
x_final=np.array(embedded_docs)
y_final=np.array(y)
print(embedded_docs.shape,y.shape)

(71537, 20) (71537,)


In [19]:
# Train Test Split

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_final,y_final,train_size=0.8,random_state=0)

In [20]:
# Training the model

model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 31ms/step - accuracy: 0.8216 - loss: 0.3777 - val_accuracy: 0.8980 - val_loss: 0.2454
Epoch 2/10
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 30ms/step - accuracy: 0.9170 - loss: 0.2079 - val_accuracy: 0.8986 - val_loss: 0.2393
Epoch 3/10
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 31ms/step - accuracy: 0.9270 - loss: 0.1823 - val_accuracy: 0.8998 - val_loss: 0.2420
Epoch 4/10
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 32ms/step - accuracy: 0.9374 - loss: 0.1583 - val_accuracy: 0.8945 - val_loss: 0.2567
Epoch 5/10
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 32ms/step - accuracy: 0.9465 - loss: 0.1377 - val_accuracy: 0.8944 - val_loss: 0.2770
Epoch 6/10
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 31ms/step - accuracy: 0.9566 - loss: 0.1138 - val_accuracy: 0.8936 - val_loss: 0.2897
Epoch 7/10
[1m8

<keras.src.callbacks.history.History at 0x785046dc9b10>

In [21]:
# Make prediction

y_pred=model.predict(x_test)
y_pred=np.where(y_pred>0.5,1,0)
y_pred

[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


array([[1],
       [1],
       [1],
       ...,
       [1],
       [0],
       [0]])

In [22]:
y_test

array([1, 1, 1, ..., 1, 1, 0])

In [23]:
# Confusion metrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[6026,  937],
       [ 619, 6726]])

In [24]:
#   Calculate accuracy

from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_test,y_pred)

0.8912496505451496

In [25]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      6963
           1       0.88      0.92      0.90      7345

    accuracy                           0.89     14308
   macro avg       0.89      0.89      0.89     14308
weighted avg       0.89      0.89      0.89     14308

