In [1]:
# import all the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# load the data
data = pd.read_csv('fake-news-train.csv')

In [3]:
# display the first five rows of the dataset
print(data.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [4]:
# display the last five rows of the dataset
print(data.tail())

          id                                              title  \
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
20795                              Jerome Hudson   
20796                           Benjamin Hoffman   
20797  Michael J. de la Merced and Rachel Abrams   
20798                                Alex Ansary   
20799                              David Swanson   

                                                    text  label  
20795  Rapper T. I. unloaded on black celebrities who...      0  
20796  When the Green Bay Packers lost to the Washing...      0  
20797  The Macy’s of today grew from the union of sev...      0  
20798  NATO, Russia 

In [5]:
# display the shape of the dataset
print(data.shape)

(20800, 5)


In [6]:
# depict the detailed information about all the columns in the entire dataset
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB
None


In [7]:
# To depict the descriptive summary for the entire dataset
print(data.describe())

                 id         label
count  20800.000000  20800.000000
mean   10399.500000      0.500625
std     6004.587135      0.500012
min        0.000000      0.000000
25%     5199.750000      0.000000
50%    10399.500000      1.000000
75%    15599.250000      1.000000
max    20799.000000      1.000000


In [8]:
# To check if there are any NULL Values in the dataset
print(data.isnull().sum())

id           0
title      558
author    1957
text        39
label        0
dtype: int64


# Observation:

From the above observation, it is clear that there are 3 columns i.e. title, author and text columns in the dataset that are not null.

In [9]:
# Drop the NAN Values
data = data.dropna()

In [10]:
print(data.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [11]:
# To check if there are any NULL Values in the dataset
print(data.isnull().sum())

id        0
title     0
author    0
text      0
label     0
dtype: int64


# Observation:

There are now no NULL Values in the dataset.

In [12]:
# Get all the independent features
X = data.drop(columns = 'label', axis = 1)
print(X)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [13]:
Y = data['label']

In [14]:
print(Y)

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64


In [15]:
print(X.shape)

(18285, 4)


In [16]:
print(Y.shape)

(18285,)


In [17]:
import tensorflow as tf

In [18]:
tf.__version__

'2.7.0'

In [19]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [20]:
# Vocabulary size
voc_size = 5000

# One Hot Representation

In [21]:
messages = X.copy()

In [22]:
messages

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...
...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [23]:
# place the indexes in the messages dataset
messages.reset_index(inplace = True)

In [24]:
messages.head()

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [25]:
answer = messages.head(100)

In [26]:
# Performing the Data Preprocessing on the Corpus text
import nltk
import re
from nltk.corpus import stopwords

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# Preprocessing of the data
from nltk.stem.porter import PorterStemmer

# create the object for porter stemmer
ps = PorterStemmer()

corpus = []

for i in range(0, len(answer)):
    print(i)
    # remove all the special characters with ' '
    review = re.sub('[^a-zA-Z]', ' ', answer['title'][i])
    # convert the text into lower case
    review = review.lower()
    # split the text into words
    review = review.split()
    
    
    # extract all the non stop words and perform the stemming operation on it
    review = [ps.stem(word) for word in review if(word not in stopwords.words('english'))]
    # join all the words in the review
    review = ' '.join(review)
    # append all the words in the corpus
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [29]:
print(corpus)

['hous dem aid even see comey letter jason chaffetz tweet', 'flynn hillari clinton big woman campu breitbart', 'truth might get fire', 'civilian kill singl us airstrik identifi', 'iranian woman jail fiction unpublish stori woman stone death adulteri', 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart', 'beno hamon win french socialist parti presidenti nomin new york time', 'back channel plan ukrain russia courtesi trump associ new york time', 'obama organ action partner soro link indivis disrupt trump agenda', 'bbc comedi sketch real housew isi caus outrag', 'russian research discov secret nazi militari base treasur hunter arctic photo', 'us offici see link trump russia', 'ye paid govern troll social media blog forum websit', 'major leagu soccer argentin find home success new york time', 'well fargo chief abruptli step new york time', 'anonym donor pay million releas everyon arrest dakota access pipelin', 'fbi close hillari', 'chuck todd

In [30]:
# convert the words into one hot representation
onehot_representation = [one_hot(words, voc_size) for words in corpus]

In [31]:
print(onehot_representation)

[[4307, 14, 480, 4130, 2480, 1239, 549, 620, 1205, 3360], [950, 1462, 3580, 651, 4968, 4606, 3631], [329, 4157, 1938, 3790], [658, 4199, 1460, 3864, 3607, 178], [1858, 4968, 3634, 1475, 2453, 2395, 4968, 3491, 3263, 534], [652, 4449, 4257, 3528, 2892, 4867, 1153, 4339, 3567, 2434, 1849, 4284, 4951, 2831, 3631], [3008, 3969, 338, 185, 1124, 3905, 3952, 3100, 1933, 1714, 1344], [4201, 4430, 4587, 2496, 3688, 4838, 4867, 3908, 1933, 1714, 1344], [4883, 1217, 2312, 2882, 4502, 766, 3394, 3, 4867, 2346], [2333, 2649, 2173, 1391, 4349, 4275, 1327, 2491], [2229, 4750, 3790, 3042, 1595, 115, 4434, 1155, 1474, 1057, 1817], [3864, 2322, 2480, 766, 4867, 3688], [1456, 3200, 4939, 3553, 1348, 4233, 4503, 4083, 959], [1416, 93, 3728, 833, 3161, 4086, 2914, 1933, 1714, 1344], [3669, 1298, 1824, 2738, 341, 1933, 1714, 1344], [4502, 1560, 2544, 764, 524, 1752, 466, 281, 2826, 607], [3479, 3574, 1462], [4626, 1168, 1366, 981, 4867, 220, 4846, 3631], [3179, 3803, 3580, 2794, 4699, 2514, 1718, 4526, 2395

In [32]:
# Embedding Representation

In [33]:
sent_length = 20
embedding_padding_representation = pad_sequences(onehot_representation, padding = 'pre', maxlen = sent_length)

In [34]:
print(embedding_padding_representation)

[[   0    0    0 ...  620 1205 3360]
 [   0    0    0 ... 4968 4606 3631]
 [   0    0    0 ... 4157 1938 3790]
 ...
 [   0    0    0 ... 1933 1714 1344]
 [   0    0    0 ... 2813  704 3135]
 [   0    0    0 ...  903 1316 2269]]


In [35]:
# padding representation at index 0 
print(embedding_padding_representation[0])

[   0    0    0    0    0    0    0    0    0    0 4307   14  480 4130
 2480 1239  549  620 1205 3360]


# Create the LSTM Model for the embedding layer

In [40]:
embedding_vector_feature = 40
# initialize the sequential layer
model = Sequential()
# add the embedding layer into the model
model.add(Embedding(voc_size, embedding_vector_feature, input_length = sent_length))
# add the LSTM Layer into the model
model.add(LSTM(100))
# Add the dense layer into the model
model.add(Dense(1, activation = 'sigmoid'))
# compile the model
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# get the summary of the model
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_2 (LSTM)               (None, 100)               56400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [48]:
len(embedding_padding_representation)

100

In [49]:
print(Y)

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64


In [50]:
Y.shape

(18285,)

In [51]:
Y = Y.head(100)

In [52]:
Y

0      1
1      0
2      1
3      1
4      1
      ..
108    0
109    0
110    0
111    0
112    1
Name: label, Length: 100, dtype: int64

In [54]:
Y.shape

(100,)

In [55]:
# convere the independent and dependent features into array
X_final = np.array(embedding_padding_representation)
Y_final = np.array(Y)

In [57]:
print(X_final.shape)
print(Y_final.shape)

(100, 20)
(100,)


In [59]:
# Divide the independent and dependent features into training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size = 0.33, random_state = 42)

# Model Training

In [60]:
model.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x224f1cb05e0>

# Adding the Dropout layer

In [61]:
from tensorflow.keras.layers import Dropout

embedding_vector_feature = 40
# initialize the sequential layer
model = Sequential()
# add the embedding layer into the model
model.add(Embedding(voc_size, embedding_vector_feature, input_length = sent_length))
# add the LSTM Layer into the model
model.add(LSTM(100))
# add the dropout layer into the LSTM Model to reduce the number of neurons
model.add(Dropout(0.3))
# Add the dense layer into the model
model.add(Dense(1, activation = 'sigmoid'))
# compile the model
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# get the summary of the model
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_3 (LSTM)               (None, 100)               56400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


# Performance Metrics and Accuracy

In [62]:
from sklearn.metrics import confusion_matrix

In [63]:
Y_pred = model.predict(X_test)

In [81]:
cm = confusion_matrix(Y_test, Y_pred.round())

In [82]:
print(cm)

[[ 4 14]
 [ 4 11]]


In [83]:
from sklearn.metrics import accuracy_score
ac = accuracy_score(Y_test,Y_pred.round(), normalize = False)

In [84]:
print(ac)

15


In [87]:
from sklearn.metrics import classification_report
cr = classification_report(Y_test, Y_pred.round())
print(cr)

              precision    recall  f1-score   support

           0       0.50      0.22      0.31        18
           1       0.44      0.73      0.55        15

    accuracy                           0.45        33
   macro avg       0.47      0.48      0.43        33
weighted avg       0.47      0.45      0.42        33

