In [1]:
import pandas as pd

In [5]:
# loading data from kaggle:
# link: https://www.kaggle.com/competitions/fake-news/data

df = pd.read_csv('dataset/fake_news/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# dropping or removing nan values
df = df.dropna()

In [9]:
# geting independent feature
X = df.drop('label', axis=1)

# getting dependent feature
y = df['label']

In [11]:
X.shape

(18285, 4)

In [13]:
y.shape

(18285,)

In [15]:
# importing teensorflow
import tensorflow as tf

In [17]:
tf.__version__

'2.18.0'

In [19]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense

In [21]:
# vocabulary size
voc_size = 5000

# One hot representation

In [24]:
messages = X.copy()

In [26]:
messages.reset_index(inplace=True)

In [28]:
messages

Unnamed: 0,index,id,title,author,text
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...
...,...,...,...,...,...
18280,20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...
18281,20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...
18282,20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...
18283,20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [30]:
import nltk
import re
from nltk.corpus import stopwords

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# Data Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [56]:
# corpus

In [58]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
# onehot_repr

# Embedding Representation

In [51]:
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 3338 2368 3052]
 [   0    0    0 ... 1590 4265 1669]
 [   0    0    0 ... 4668  739  145]
 ...
 [   0    0    0 ... 1964 3934  783]
 [   0    0    0 ... 3660  863  171]
 [   0    0    0 ...  911 4779  440]]


In [53]:
len(embedded_docs)

18285

# creating model

In [60]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())




None


In [62]:
len(embedded_docs), y.shape

(18285, (18285,))

In [65]:
import numpy as np
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [67]:
X_final.shape, y_final.shape

((18285, 20), (18285,))

In [69]:
# Splitting into training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

# Model Training

In [74]:
# finally training
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 45ms/step - accuracy: 0.7858 - loss: 0.4241 - val_accuracy: 0.9173 - val_loss: 0.1960
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.9451 - loss: 0.1396 - val_accuracy: 0.9171 - val_loss: 0.2010
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 44ms/step - accuracy: 0.9625 - loss: 0.1012 - val_accuracy: 0.9143 - val_loss: 0.2223
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.9755 - loss: 0.0715 - val_accuracy: 0.9147 - val_loss: 0.2424
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 0.9840 - loss: 0.0526 - val_accuracy: 0.9094 - val_loss: 0.2939
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 35ms/step - accuracy: 0.9862 - loss: 0.0429 - val_accuracy: 0.9049 - val_loss: 0.3554
Epoch 7/10
[1m192/

<keras.src.callbacks.history.History at 0x28f0d391400>

# Performance Metrics And Accuracy

In [87]:
y_predict = model.predict(X_test)


[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step


In [93]:
print("y_test:", y_test[:5])
print("y_predict:", y_predict[:5])
print("y_test shape:", y_test.shape)
print("y_predict shape:", y_predict.shape)


y_test: [1 0 0 0 1]
y_predict: [[9.9999380e-01]
 [1.7169224e-07]
 [6.8982114e-04]
 [5.8142090e-05]
 [9.9998832e-01]]
y_test shape: (6035,)
y_predict shape: (6035, 1)


In [101]:
# y_test contains binary class labels (e.g., 0 and 1), while y_predict contains continuous probability values (ranging between 0 and 1)
# To resolve the issue, you need to threshold y_predict to convert it into binary class labels before computing the confusion matrix.

# Converting the continuous probabilities in y_predict into binary class labels using a threshold (commonly 0.5)
y_predict = (y_predict > 0.5).astype("int32").flatten()


In [97]:
from sklearn.metrics import confusion_matrix

In [103]:
# calculating confusion matrix
confusion_matrix(y_test, y_predict)

array([[3094,  325],
       [ 246, 2370]], dtype=int64)

In [105]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9053852526926264