In [1]:
## Fake news classifier using bi-directional LSTM
import pandas as pd
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [2]:
df = df.dropna()

In [4]:
## get the independent and dependent features
X = df.drop('label', axis=1)
y = df['label']

In [5]:
import tensorflow as tf

In [6]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional

In [9]:
## Vocabulary size
voc_size = 5000
## One hot Representation
messages = X.copy()

In [10]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Data Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []

for i in range(0, len(messages)):
    # Remove non-alphabetic characters
    review = re.sub('[^a-zA-Z]', " ", messages['title'].iloc[i])
    # Convert to lowercase
    review = review.lower()
    # Tokenize
    review = review.split()
    # Remove stopwords and apply stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    # Rejoin tokens
    review = ' '.join(review)
    corpus.append(review)

In [12]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
onehot_repr

[[1107, 2779, 2715, 379, 4233, 3851, 4773, 375, 4820, 2623],
 [868, 4618, 240, 56, 3674, 306, 826],
 [1000, 735, 3310, 2234],
 [126, 4383, 4931, 1466, 1010, 160],
 [3549, 3674, 240, 2173, 2766, 1878, 3674, 4849, 3896, 2716],
 [2834,
  3659,
  880,
  4599,
  4549,
  2804,
  3898,
  2019,
  3766,
  4876,
  1082,
  2030,
  1929,
  911,
  826],
 [3288, 4571, 434, 3361, 1937, 4401, 3311, 4086, 4807, 4981, 1569],
 [2384, 2474, 969, 1942, 616, 2640, 2804, 1984, 4807, 4981, 1569],
 [1420, 4927, 2185, 587, 491, 2025, 4988, 4826, 2804, 2331],
 [2801, 1915, 3139, 2943, 3713, 1307, 2685, 4280],
 [2690, 1855, 1959, 490, 2698, 4008, 2089, 3771, 4325, 3779, 1288],
 [1466, 1405, 4233, 2025, 2804, 616],
 [2579, 1808, 4949, 266, 4553, 2405, 2182, 4587, 1980],
 [3944, 922, 3955, 3119, 2997, 705, 1382, 4807, 4981, 1569],
 [4846, 3375, 1487, 3135, 2629, 4807, 4981, 1569],
 [4317, 621, 4981, 3288, 2694, 2645, 3297, 4350, 2039, 3625],
 [4722, 2433, 4618],
 [528, 687, 229, 900, 2804, 3421, 4937, 826],
 [2330,

In [13]:
## Word2Vec to Embedding layer
sent_length = 30
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...  375 4820 2623]
 [   0    0    0 ... 3674  306  826]
 [   0    0    0 ...  735 3310 2234]
 ...
 [   0    0    0 ... 4807 4981 1569]
 [   0    0    0 ... 4984 3128 2739]
 [   0    0    0 ... 1638 1181 2446]]


In [14]:
##Creating the model
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())



None


In [15]:
import numpy as np
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)


In [17]:
##fittinf the training
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 53ms/step - accuracy: 0.7384 - loss: 0.4555 - val_accuracy: 0.9092 - val_loss: 0.1995
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 69ms/step - accuracy: 0.9403 - loss: 0.1484 - val_accuracy: 0.9168 - val_loss: 0.1958
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 62ms/step - accuracy: 0.9568 - loss: 0.1072 - val_accuracy: 0.9123 - val_loss: 0.2222
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 45ms/step - accuracy: 0.9723 - loss: 0.0737 - val_accuracy: 0.9148 - val_loss: 0.2487
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 44ms/step - accuracy: 0.9839 - loss: 0.0474 - val_accuracy: 0.9095 - val_loss: 0.2936
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 62ms/step - accuracy: 0.9895 - loss: 0.0333 - val_accuracy: 0.9102 - val_loss: 0.3574
Epoch 7/10
[1m192

<keras.src.callbacks.history.History at 0x207d4aba150>

In [18]:
y_pred = model.predict(X_test)
y_pred = np.where(y_pred>0.5, 1, 0)

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[3104,  315],
       [ 249, 2367]], dtype=int64)

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9065451532725767

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3419
           1       0.88      0.90      0.89      2616

    accuracy                           0.91      6035
   macro avg       0.90      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035

