In [173]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix

In [80]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [147]:
df = pd.read_csv('/content/WELFake_Dataset.csv', on_bad_lines='skip', engine="python")

In [148]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [149]:
df.shape

(72154, 4)

In [150]:
df.dtypes

Unnamed: 0    object
title         object
text          object
label         object
dtype: object

In [151]:
df = df[df['label'].isin(['1', '0'])]
df['label'] = df['label'].astype(int)

In [152]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [153]:
df.dropna(inplace=True)

In [154]:
X = df.drop(['label', 'Unnamed: 0'], axis=1)
X.reset_index(inplace=True)

In [155]:
df['label'].unique()

array([1, 0])

In [156]:
y = df['label']

In [157]:
# Text Preprocessing

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0,len(X)):
  review = re.sub('[^a-zA-Z]',' ',X['title'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)


In [158]:
corpus[1]

'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'

In [159]:
# onehot representation
voc_size = 5000 #number of unique words
onehot_rep = [one_hot(words,voc_size)for words in corpus]

In [160]:
onehot_rep[1]

[2912,
 3711,
 888,
 3270,
 1375,
 3892,
 1600,
 1193,
 510,
 3502,
 4184,
 3307,
 2910,
 2330]

In [161]:
# Padding
input_len = 20
embedded_doc = pad_sequences(onehot_rep,padding='post',maxlen=input_len)

In [162]:
embedded_doc[0]

array([4497, 1999, 4821, 3279, 4892, 2228,  338, 3883,  102, 3372, 1209,
       2330,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [163]:
# Model Building
model = Sequential()
model.add(Embedding(input_dim=voc_size,output_dim=40,input_length=input_len))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())



Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_2 (LSTM)               (None, 100)               56400     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256501 (1001.96 KB)
Trainable params: 256501 (1001.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [164]:
X_final = np.array(embedded_doc)
y_final = np.array(y)

In [165]:
X_final.shape,y_final.shape

((71535, 20), (71535,))

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [167]:
# Model Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x781401e0a140>

In [168]:
# Performance Metrics And Accuracy
y_pred = model.predict(X_test)



In [169]:
y_pred

array([[1.6433645e-04],
       [3.0428752e-01],
       [9.9686199e-01],
       ...,
       [9.9984705e-01],
       [9.9751365e-01],
       [9.7766364e-01]], dtype=float32)

In [170]:
y_pred=np.where(y_pred > 0.6, 1,0)

In [171]:
y_pred

array([[0],
       [0],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [174]:
confusion_matrix(y_test,y_pred)

array([[10219,  1416],
       [ 1325, 10647]])

In [175]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8838903714999788

In [176]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88     11635
           1       0.88      0.89      0.89     11972

    accuracy                           0.88     23607
   macro avg       0.88      0.88      0.88     23607
weighted avg       0.88      0.88      0.88     23607

