In [None]:
!pip install numpy
!pip install pandas

In [1]:
import pandas as pd
import nltk , string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



In [2]:
# pd.set_option('display.max_colwidth', None)
dataset = pd.read_csv('/kaggle/input/fake-news-classification/WELFake_Dataset.csv')
dataset = dataset.rename(columns={'Unnamed: 0':'index'})
dataset.head()

Unnamed: 0,index,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
# dropping null values

dataset = dataset.dropna(axis=0)

In [6]:
# preparing independent feature set for training and testing

X = dataset.drop(columns=['label'] , axis=1)
X.head()

Unnamed: 0,index,title,text
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ..."
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will..."
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...


In [7]:
# preparing dependent target variable for training and testing
y = dataset['label']
y

0        1
2        1
3        0
4        1
5        1
        ..
72129    0
72130    1
72131    0
72132    0
72133    1
Name: label, Length: 71537, dtype: int64

In [8]:
# we are taking only titles of news article to predict fake or real news
titles = X['title']
titles

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
5        About Time! Christian Group Sues Amazon and SP...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: title, Length: 71537, dtype: object

In [27]:
# Initialize the stemmer
stemmer = PorterStemmer()

# Get English stopwords
stop_words = set(stopwords.words('english'))

processed_titles = []

for sentence in titles:
    # Tokenize the sentence
    words = word_tokenize(sentence)

    # Remove stopwords and apply stemming
    cleaned_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Remove punctuation and lowercase the words
    cleaned_words = [word.lower() for word in cleaned_words if word not in string.punctuation]

    # Rejoin the cleaned words to form a sentence
    cleaned_sentence = ' '.join(cleaned_words)
    
    processed_titles.append(cleaned_sentence)



In [34]:
processed_titles_list = pd.Series(processed_titles)

In [30]:
import tensorflow as tf
tf.__version__

from tensorflow.keras.layers import Embedding , LSTM , Dense , Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [38]:
voc_size = 5000

# Assuming processed_titles_list is a list of strings
one_hot_representation = [one_hot(title, voc_size) for title in processed_titles_list]

print(one_hot_representation)

[[1899, 2743, 4907, 662, 477, 1098, 3844, 3087, 1326, 4459, 3841, 1674, 2511, 4564], [2428, 2439, 4035, 375, 4078, 3812, 1474, 4485, 3201, 1696, 4080, 3881, 406, 1880, 2358, 4999, 4564], [3078, 889, 1036, 2651, 507, 2838, 973, 405, 426, 205, 643, 4181, 2496], [3970, 298, 2609, 3335, 2610, 777, 413, 1408, 4696, 4035, 4712, 2159, 2689, 4661, 1400], [2768, 973, 641, 1551, 3440, 1600, 2098, 2958, 641], [24, 687, 2612, 963, 3306, 3201, 2672, 2284, 2159, 4756, 3234, 3288, 4080], [1493, 742, 145, 3510, 2609, 4597, 2838, 3201, 1718, 2803, 4080, 4564], [1373, 3837, 12, 2548, 513, 2783, 3571, 1408, 1147, 4188, 4035, 1373, 1408, 4035, 4877, 2775, 2071, 452, 4035, 4564], [1272, 1612, 2339, 4903, 1895, 1311, 449, 1612], [531, 901, 1550, 2543, 4492, 1010, 650, 2874], [952, 3849, 159, 4475, 3388, 4522, 4255, 2059, 4255, 787], [2481, 4014, 3510, 3076, 3521, 2120, 116, 2887, 2575], [3991, 2224, 2247, 4014, 3729, 1104, 3313, 1630, 3938, 2761, 1328], [4538, 1052, 4959, 720, 157, 4950, 3993, 3302, 1187, 4

In [39]:
# padding
sentence_length = 20

padded_one_hot_sentences = pad_sequences(one_hot_representation,padding='pre',maxlen=sentence_length)

padded_one_hot_sentences

array([[   0,    0,    0, ..., 1674, 2511, 4564],
       [   0,    0,    0, ..., 2358, 4999, 4564],
       [   0,    0,    0, ...,  643, 4181, 2496],
       ...,
       [   0,    0,    0, ..., 4132, 3112, 2811],
       [   0,    0,    0, ..., 3062, 3517, 4025],
       [   0,    0,    0, ..., 1006, 1630, 2581]], dtype=int32)

In [42]:
embedding_vector_dimentions = 50

# model architecture
model = Sequential()
model.add(Embedding(voc_size,embedding_vector_dimentions,input_length=sentence_length))
model.add(LSTM(100))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 50)            250000    
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 310,501
Trainable params: 310,501
Non-trainable params: 0
_________________________________________________________________
None


In [49]:
import numpy as np

X_final = np.array(padded_one_hot_sentences)
y_final = np.array(y)

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)



In [51]:
# model training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7cc00773f820>

In [58]:
y_pred_prob=model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype('int')



In [59]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[10317,  1376],
       [ 1115, 10800]])

In [60]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8944849203659776