### BiDirectional LSTM RNN FakeNews Classifier

In [1]:
import pandas as pd
import tensorflow as tf
import re

In [2]:
df = pd.read_csv('DataSets/FakeNewsDatas/train/train.csv')
# If Data Set Have Some Bad Lines IN Data
# df=pd.read_csv('DataSets/FakeNewsDatas/train/train.csv', engine='python', error_bad_lines=False)

In [3]:
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [6]:
nltk.download('punkt')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# So We have 20800 of records and we cant replace text data so droping 2000 null values will be good decision
# Drop Nan Values

df = df.dropna()


In [9]:
df.shape

(18285, 5)

In [10]:
# Get independent Features
X = df.drop('label', axis=1)

In [11]:
# Get The Dependent Feature
y = df['label']

In [12]:
X.shape

(18285, 4)

In [13]:
y.shape

(18285,)

In [14]:
# Vocabulary Size is for no. of unique words in corpus
voc_size = 5000

In [15]:
messages = X.copy()

In [16]:
messages.reset_index(inplace=True)

In [17]:
stemmer = PorterStemmer()
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]',' ', messages['title'][i]) # removing special char other than a,A to z,Z, 0-9 
    review = review.lower()
    words = review.split() # sent tokenization
    review = [stemmer.stem(word) for word in words if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
#     messages.loc[i,'title']= review
    corpus.append(review)
    

In [18]:
# one hot 
onehot_repr = [one_hot(words, voc_size ) for words in corpus]
onehot_repr

[[4530, 666, 4696, 1415, 2982, 4700, 2219, 320, 998, 3501],
 [2873, 382, 2840, 695, 4791, 2265, 3025],
 [3592, 155, 1658, 948],
 [2142, 3794, 1091, 4895, 971, 2774],
 [1879, 4791, 1937, 791, 2628, 1147, 4791, 4528, 1078, 4540],
 [1369,
  2759,
  4574,
  4026,
  1419,
  2775,
  2276,
  1748,
  4034,
  2805,
  1266,
  4153,
  1078,
  1182,
  3025],
 [871, 981, 1148, 839, 3352, 2860, 888, 534, 25, 964, 2686],
 [383, 4959, 2551, 4217, 4272, 2486, 2775, 4117, 25, 964, 2686],
 [2242, 4506, 1997, 508, 3913, 4, 2814, 1893, 2775, 519],
 [1617, 1751, 2681, 97, 3304, 267, 3386, 3077],
 [3573, 3740, 4768, 3787, 1953, 3809, 3196, 3273, 3013, 2565, 311],
 [4895, 3570, 2982, 4, 2775, 4272],
 [3403, 4973, 2699, 613, 1845, 2742, 4387, 3738, 475],
 [2922, 3817, 276, 193, 3482, 4627, 2783, 25, 964, 2686],
 [3879, 1604, 2038, 2179, 3423, 25, 964, 2686],
 [1910, 2236, 3597, 181, 427, 1571, 45, 2770, 2054, 4279],
 [3676, 3624, 382],
 [3095, 2996, 3402, 4338, 2775, 3923, 967, 3025],
 [4250, 4669, 2840, 2737,

In [19]:
### Embedding Representation


In [20]:
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

In [21]:
print(embedded_docs)

[[   0    0    0 ...  320  998 3501]
 [   0    0    0 ... 4791 2265 3025]
 [   0    0    0 ...  155 1658  948]
 ...
 [   0    0    0 ...   25  964 2686]
 [   0    0    0 ... 4920 4652 2683]
 [   0    0    0 ... 3452 2718 2688]]


In [22]:
## creating model
voc_size = 5000
embedding_vector_features = 40 #Feature Representation
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features))
model.add(Bidirectional(LSTM(100))) # LSTM with 100 neurons
# output is binary we will use sigmoid activation function
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

None


In [23]:
import numpy as np
X_final = np.array(embedded_docs)
y_final=np.array(y)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [25]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.7786 - loss: 0.4126 - val_accuracy: 0.9171 - val_loss: 0.1973
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9467 - loss: 0.1408 - val_accuracy: 0.9185 - val_loss: 0.2032
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9679 - loss: 0.0926 - val_accuracy: 0.9135 - val_loss: 0.2093
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9775 - loss: 0.0674 - val_accuracy: 0.9135 - val_loss: 0.2509
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9847 - loss: 0.0470 - val_accuracy: 0.9047 - val_loss: 0.2987
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9901 - loss: 0.0335 - val_accuracy: 0.9095 - val_loss: 0.3066
Epoch 7/10
[1m192/192

<keras.src.callbacks.history.History at 0x166eae1f810>

In [26]:
y_pred=model.predict(X_test)

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [27]:
y_pred=np.where(y_pred > 0.5, 1,0) ##AUC ROC Curve

In [28]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[3149,  270],
       [ 336, 2280]], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8995857497928749

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      3419
           1       0.89      0.87      0.88      2616

    accuracy                           0.90      6035
   macro avg       0.90      0.90      0.90      6035
weighted avg       0.90      0.90      0.90      6035

