In [1]:
import pandas as pd

In [22]:
# loading a dataset:
df = pd.read_csv('dataset/fake_news/train.csv')

In [23]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [24]:
# dropping Nan values
df = df.dropna()
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [32]:
# getting independent feature
X = df.drop('label', axis=1)

# getting dependent feature
y = df['label']
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64

In [36]:
y.value_counts()

label
0    10361
1     7924
Name: count, dtype: int64

In [38]:
X.shape

(18285, 4)

In [40]:
y.shape

(18285,)

In [45]:
import tensorflow as tf

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import (Dense, LSTM, Bidirectional)

In [47]:
# vocabulary size
voc_size = 5000

# OneHot Representation

In [50]:
messages = X.copy()

In [56]:
messages['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [58]:
messages.reset_index(inplace=True)

In [60]:
import nltk
import re
from nltk.corpus import stopwords

In [62]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data PreProcessing

In [67]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
corpus = []

for i in range(0, len(messages)):
    review = re.sub('^a-zA-Z', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [68]:
onehot_rep = [one_hot(words, voc_size) for words in corpus]


# Embedding Representation

In [74]:
sent_length = 20
embedded_docs = pad_sequences(onehot_rep, padding='pre', maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 2659 2347 3918]
 [   0    0    0 ... 1113 1881 4417]
 [   0    0    0 ... 3588  731 3317]
 ...
 [   0    0    0 ... 1896 1281 3460]
 [   0    0    0 ... 1604 4665 4364]
 [   0    0    0 ... 4821 4868 2593]]


In [80]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0, 2339, 3631,
        148, 1957, 2834,  955, 4174,  176, 2659, 2347, 3918])

In [82]:
# creating model:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [84]:
len(embedded_docs), y.shape

(18285, (18285,))

# Train Test Split

In [87]:
import numpy as np
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [89]:
X_final.shape, y_final.shape

((18285, 20), (18285,))

In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

# Model Training

In [95]:
# training
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 85ms/step - accuracy: 0.7835 - loss: 0.4330 - val_accuracy: 0.9137 - val_loss: 0.1987
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 68ms/step - accuracy: 0.9472 - loss: 0.1313 - val_accuracy: 0.9072 - val_loss: 0.2074
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 63ms/step - accuracy: 0.9705 - loss: 0.0852 - val_accuracy: 0.9162 - val_loss: 0.2169
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 72ms/step - accuracy: 0.9828 - loss: 0.0572 - val_accuracy: 0.9133 - val_loss: 0.2966
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 78ms/step - accuracy: 0.9881 - loss: 0.0409 - val_accuracy: 0.9085 - val_loss: 0.3366
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 54ms/step - accuracy: 0.9926 - loss: 0.0250 - val_accuracy: 0.9120 - val_loss: 0.4503
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x257562a6270>

# Performance Metrics And Accuracy

In [98]:
y_predict = model.predict(X_test)

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step


In [105]:
print("y_test:", y_test[:5])
print("y_predict:", y_predict[:5])
print("y_test shape:", y_test.shape)
print("y_predict shape:", y_predict.shape)


y_test: [1 0 0 0 1]
y_predict: [[9.9993408e-01]
 [4.9391692e-05]
 [5.3062621e-03]
 [9.9550271e-01]
 [9.9959135e-01]]
y_test shape: (6035,)
y_predict shape: (6035, 1)


In [107]:
# y_test contains binary class labels (e.g., 0 and 1), while y_predict contains continuous probability values (ranging between 0 and 1)
# To resolve the issue, you need to threshold y_predict to convert it into binary class labels before computing the confusion matrix.

# Converting the continuous probabilities in y_predict into binary class labels using a threshold (commonly 0.5)
y_predict = (y_predict > 0.5).astype("int32").flatten()


In [109]:
from sklearn.metrics import confusion_matrix

# calculating confusion matrix
confusion_matrix(y_test, y_predict)

array([[3127,  292],
       [ 287, 2329]], dtype=int64)

In [113]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_test, y_predict)

0.904059652029826

In [119]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3419
           1       0.89      0.89      0.89      2616

    accuracy                           0.90      6035
   macro avg       0.90      0.90      0.90      6035
weighted avg       0.90      0.90      0.90      6035

