In [82]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [72]:
df = pd.read_csv('fake-news/train.csv')
df.head()
df = df.dropna()
df.reset_index(inplace=True)
X = df.drop('label', axis=1)
y = df['label']

display(X.shape)
display(y.shape)

(18285, 5)

(18285,)

In [73]:
# Defining the vocabulary size
vocabulary_size = 5000
messages = X.copy()

# Data cleaning/pre-processing - stemming
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    msg = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    msg = msg.lower()
    msg = msg.split()
    msg = [ps.stem(word) for word in msg if word not in stopwords.words('english')]
    msg = ' '.join(msg)
    corpus.append(msg)

In [74]:
# Applying one_hot representation
corpus_one_hot = [one_hot(words,vocabulary_size) for words in corpus]
display(corpus_one_hot)

[[3805, 1382, 2999, 4787, 2418, 3756, 328, 4567, 2826, 1666],
 [3315, 1773, 234, 3507, 1264, 3465, 4258],
 [1894, 1660, 1154, 3389],
 [1491, 1066, 517, 4044, 3354, 1188],
 [2245, 1264, 3041, 2186, 4970, 4824, 1264, 2626, 2870, 1573],
 [782,
  2664,
  957,
  3821,
  1577,
  992,
  1571,
  3649,
  4269,
  2725,
  2233,
  3610,
  4550,
  260,
  4258],
 [2196, 1061, 3016, 3965, 14, 2057, 3478, 1149, 4707, 2767, 4675],
 [406, 4776, 4270, 3713, 74, 3148, 992, 365, 4707, 2767, 4675],
 [215, 1314, 2380, 1290, 2446, 635, 1570, 1014, 992, 3101],
 [2252, 4702, 707, 2595, 4408, 628, 4114, 1288],
 [575, 3445, 2196, 4996, 3859, 1920, 3951, 4461, 1561, 899, 4849],
 [4044, 21, 2418, 635, 992, 74],
 [3473, 1391, 3753, 1818, 1639, 442, 2837, 1080, 571],
 [3879, 20, 3942, 1988, 1420, 1472, 4844, 4707, 2767, 4675],
 [4899, 3195, 882, 2307, 672, 4707, 2767, 4675],
 [127, 3574, 1314, 2044, 3600, 598, 1043, 1771, 2793, 4537],
 [3197, 1041, 1773],
 [1502, 3231, 346, 3130, 992, 2923, 1931, 4258],
 [1196, 4459,

In [75]:
# Applying padding to all the sentences to a max-length of 20.
# This means word is embedding 

sent_length = 20
corpus_pad = pad_sequences(corpus_one_hot, padding='pre', maxlen=sent_length)

In [83]:
emb_vec_features = 40
model = Sequential()
# input_dim, output_dim, and input_length
model.add(Embedding(vocabulary_size, emb_vec_features, input_length=sent_length))
model.add(Dropout(0.3))
# 1 LSTM layer with 100 neurons
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
display(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 20, 40)            0         
                                                                 
 lstm_2 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


None

In [84]:
# Saving the embedded array into X_final and the dependent feature to y_final
X_final = np.array(corpus_pad)
y_final = np.array(y)

In [85]:
# Performing train-test-split for model prediction & evaluation
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [86]:
# Fitting the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f12340cae50>

In [88]:
# Model evalutaion
y_pred = (model.predict(X_test) > 0.5).astype("int32")

cm = confusion_matrix(y_test, y_pred)
display('Confusion Matrix:\n\n', cm)

ac = accuracy_score(y_test, y_pred)
display('\n\nAccuracy Score:', ac)

'Confusion Matrix:\n\n'

array([[3115,  304],
       [ 210, 2406]])

'\n\nAccuracy Score:'

0.9148301574150787