<a href="https://colab.research.google.com/github/lokesharma-dev/Fake-News-Detection/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import re
import time
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Load data from local file system

In [15]:
from google.colab import files
uploaded = files.upload()

Saving celebrityDataset.csv to celebrityDataset.csv


In [0]:
import io
df = pd.read_csv(io.StringIO(uploaded['celebrityDataset.csv'].decode('utf-8')))

In [0]:
# Extract relevant features
df.nunique()
df.isna().sum()
df['Subject'].fillna('',inplace=True) # Replace all missing values
x = df['Subject'] + " " + df['Content']
#y = pd.get_dummies(df['Label'])
y = [1 if row == 'Fake' else 0 for row in df['Label']]
y = np.array(y) # Dummy Encoding

Define Clean corpus funcion and calls

In [22]:
# Clean the texts
def clean_text(text, remove_stopwords=True):
    output = ""
    text = str(text).replace(r'http[\w:/\.]+', '') # removing urls
    text = str(text).replace(r'[^\.\w\s]', '') # removing everything but characters and punctuation
    text = str(text).replace(r'\.\.+', '.') # replace multiple periods with a single one
    text = str(text).replace(r'\.', ' . ') # replace periods with a single one
    text = str(text).replace(r'\s\s++', ' ') # replace multiple whitespace with one
    text = str(text).replace(r'\n', '') # removing line break
    text = re.sub(r'[^\w\s]', '', text.lower()) # lower texts
    if remove_stopwords:
        text = text.split(" ")
        for word in text:
            if word not in stopwords.words('english'):
                output = output + " " + word
    return output

# Clean the corpus
start = time.time()
docs = [clean_text(row) for row in x]
end = time.time()
print("Cleaning the document took {} seconds".format(round(end - start)))

Cleaning the document took 19 seconds


Parameter value declaration

In [0]:
# Parameters
MAX_VOCAB_SIZE = 1000000 # maximum no of unique words
MAX_DOC_LENGTH = 500 # maximum no of words in each sentence
EMBEDDING_DIM = 50 # Embeddings dimension from Glove directory
GLOVE_DIR = 'models/glove.6B/glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'
TEST_SPLIT = 0.2

Tokenize and Pad Sequences


In [58]:
# Tokenize & pad sequences
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
word_index = tokenizer.word_index
print('Vocabulary size :', len(word_index))
sequences = pad_sequences(encoded_docs, padding='post', maxlen=MAX_DOC_LENGTH)
print('Shape of data tensor:', sequences.shape)
print('Shape of label tensor', y.shape)

Vocabulary size : 20567
Shape of data tensor: (500, 500)
Shape of label tensor (500,)


Split dataset into train and test


In [0]:
# Shuffle data random before splitting
indices = np.arange(sequences.shape[0])
np.random.shuffle(indices)
data = sequences[indices]
labels = y[indices]

In [0]:
# Split into test set
num_test_samples = int(TEST_SPLIT*data.shape[0])
x_train = data[:-num_test_samples]
y_train = labels[:-num_test_samples]
x_test = data[-num_test_samples:]
y_test = labels[-num_test_samples:]

LSTM

In [200]:
model = Sequential()
model.add(Embedding(input_dim=len(word_index)+1, output_dim=EMBEDDING_DIM, input_length=MAX_DOC_LENGTH))
model.add(LSTM(units=50))
model.add(Dense(units=1, activation='softmax'))
model.summary()
# Train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # only compilation
history = model.fit(x_train, y_train, epochs=5, batch_size=10, validation_split=0.2)

Model: "sequential_44"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_42 (Embedding)     (None, 500, 50)           1028400   
_________________________________________________________________
lstm_40 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 51        
Total params: 1,048,651
Trainable params: 1,048,651
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Test Accuracy

In [201]:
score, acc = model.evaluate(x_test, y_test, batch_size=10)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 7.929604530334473
Test accuracy: 0.47999998927116394


In [195]:
test = x_test[1].reshape(1,len(x_test[1]))
test.shape
y_hat = model.predict(test)
len(y_hat), y_hat, y_test[1]

(1, 500)