## LSTM and BiLSTM

In [None]:
#import the necessary libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
import string 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing import text,sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout

#nltk.download("stopwords")
from nltk.corpus import stopwords

nltk.download('all')
from tensorflow import keras
from tensorflow.keras import layers

### Exploratory data analysis

In [None]:
# Load the true-news articles from a CSV path relative to the working directory
# and preview the first rows.
print("True news")
true = pd.read_csv("Downloads/true.csv")
true.head()

In [None]:
# Load the fake-news articles from a CSV path relative to the working directory
# and preview the first rows.
print("Fake news")
fake = pd.read_csv("Downloads/fake.csv")
fake.head()

### Add a new `label` column marking each article as true (1) or fake (0)

In [None]:
# Binary target: 1 marks rows from the true-news frame, 0 marks rows from the fake-news frame.
true['label'] = 1
fake['label'] = 0

In [None]:
# Preview the true-news frame to confirm the new label column is present.
true.head()

In [None]:
# Preview the fake-news frame to confirm the new label column is present.
fake.head()

### Joining true and fake news to make the complete dataset to train the model 

In [None]:
# Stack true rows first, then fake rows; ignore_index=True renumbers the rows
# 0..n-1 so index labels are unique across the combined frame.
df = pd.concat([true, fake], ignore_index = True, sort = False)
df.head()

In [None]:
# The last rows come from the fake-news frame, since it was concatenated second.
df.tail()

In [None]:
# Summary statistics of the combined frame.
df.describe()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Report how many articles each source frame contributes.
print(f"Number of true news articles: {len(true)}")
print(f"Number of fake news articles: {len(fake)}")

In [None]:
# Distinct news subjects and how many articles fall under each.
subject_counts = df['subject'].value_counts()
print(subject_counts)

In [None]:
# Fold subject and title into the text column (space-separated) so the model
# sees them as one document, then drop the columns that are not used for
# the classification.
df['text'] = df['subject'] + " " + df['title'] + " " + df['text']
df.drop(columns=['title', 'subject', 'date'], inplace=True)
df.head()

In [None]:
#take the article at index label 5 (a fixed row, not a random one) as a worked
#example for the cleaning steps that follow
example = df.text[5]
example

### Removal of HTML content if any from the example initially

In [None]:
from bs4 import BeautifulSoup

# Parse the example with the "html.parser" backend and keep only its text content.
soup = BeautifulSoup(example, "html.parser")
example = soup.get_text()
example

### Removal of bracketed text, punctuation and other non-letter characters from the example

In [None]:
# Replace square-bracketed spans such as "[video]" with a space. The pattern is
# a raw string so that "\[" reaches the regex engine as written instead of
# being an invalid string escape.
example = re.sub(r'\[[^]]*\]', ' ', example)
example = re.sub(r'[^a-zA-Z]', ' ', example)  # replaces non-alphabets with spaces
example = example.lower()  # Converting from uppercase to lowercase
example

### Removal of stopwords from the example initially

In [None]:
nltk.download("stopwords")
from nltk.corpus import stopwords

# Build the stopword set once; constructing it inside the comprehension would
# re-read the stopword list for every single token.
stop_words = set(stopwords.words("english"))
example = nltk.word_tokenize(example)
example = [word for word in example if word not in stop_words]

### Lemmatizing the example

In [None]:
# Lemmatize every remaining token and stitch the tokens back into one string.
lemma = nltk.WordNetLemmatizer()
example = " ".join(map(lemma.lemmatize, example))
example

### Applying the same cleaning steps to all of the data

In [None]:
#Removal of HTML Contents
def remove_html(text):
    """Return the text content of *text* after parsing it with "html.parser"."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removal of square-bracketed spans (punctuation itself is handled by remove_characters)
def remove_punctuations(text):
    """Delete every ``[...]`` span from *text*.

    Despite its name, this step only strips square-bracketed fragments; each
    match runs from a ``[`` to the nearest following ``]``. The pattern is a
    raw string so the backslashes reach the regex engine unchanged instead of
    being treated as invalid string escapes.
    """
    return re.sub(r'\[[^]]*\]', '', text)

# Removal of Special Characters
def remove_characters(text):
    """Replace every character that is not an ASCII letter with a single space."""
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)

#Removal of stopwords and lemmatization of the remaining words
def remove_stopwords_and_lemmatization(text):
    """Lower-case *text*, drop English stopwords and lemmatize what is left.

    Returns the surviving lemmas joined by single spaces.

    The stopword set and the lemmatizer are created once per call rather than
    once per word; the per-word version re-read the stopword list for every
    token of every article.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

#Combined cleaning function
def cleaning(text):
    """Run *text* through every cleaning step, in order, and return the result.

    Order: HTML stripping, bracketed-span removal, non-letter removal, then
    stopword removal with lemmatization.
    """
    pipeline = (
        remove_html,
        remove_punctuations,
        remove_characters,
        remove_stopwords_and_lemmatization,
    )
    for step in pipeline:
        text = step(text)
    return text

#Apply the cleaning function to every article in the text column, replacing the raw text
df['text'] = df['text'].apply(cleaning)

In [None]:
# Preview the frame after cleaning.
df.head()

In [None]:
# Horizontal bar chart of the class balance (label 1 = true news, 0 = fake news).
df['label'].value_counts().plot.barh(title= 'Frequency of true and fake news')

### Number of words in fake and true news

In [None]:
# Side-by-side histograms of words per article for each class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
panels = (
    (ax1, 1, 'Green', 'True news'),
    (ax2, 0, 'Red', 'Fake news'),
)
for axis, label_value, colour, heading in panels:
    # Word count = number of whitespace-separated tokens in the cleaned text.
    text_len = df[df['label'] == label_value]['text'].str.split().map(len)
    axis.hist(text_len, color=colour)
    axis.set_title(heading)
plt.show()

### Wordcloud of fake and true news

In [None]:
from wordcloud import WordCloud,STOPWORDS
print("TRUE NEWS")
plt.figure(figsize=(15, 15))
# Word cloud over all true-news articles (label 1) joined into one string.
wc = WordCloud(
    max_words=100,
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(" ".join(df.loc[df.label == 1, 'text']))
plt.imshow(wc, interpolation='bilinear')

In [None]:
from wordcloud import WordCloud,STOPWORDS
print("FAKE NEWS")
plt.figure(figsize=(15, 15))
# Word cloud over all fake-news articles (label 0) joined into one string.
wc = WordCloud(
    max_words=100,
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(" ".join(df.loc[df.label == 0, 'text']))
plt.imshow(wc, interpolation='bilinear')

In [None]:
#splitting the data: cleaned text is the feature, label is the target.
#random_state=0 fixes the shuffle; no test_size is passed, so the
#train_test_split default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state=0)

In [None]:
max_features = 10000 # vocabulary cap: passed to the tokenizer as num_words and to the Embedding layers as input size
maxlen = 300 #max number of words allowed per news

# Fit the vocabulary on the training texts only, then reuse the same tokenizer
# to encode both splits so the test set does not influence the word index.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
# Bring every sequence to length maxlen; X_train / X_test are rebound from text to padded arrays here.
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [None]:
batch_size = 256  # batch size passed to both fit() calls
epochs = 10  # intended number of training epochs
embed_size = 100  # output dimension of the LSTM model's embedding layer

In [None]:
model = Sequential()
#Non-trainable embedding layer
# NOTE(review): no pretrained weights are loaded in the visible code, so with
# trainable=False the embedding presumably stays at its random initialisation.
# Confirm this is intended.
model.add(Embedding(max_features, output_dim=embed_size, input_length=maxlen, trainable=False))
#LSTM: the first layer returns the full sequence so the second LSTM can consume it
model.add(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
model.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model.add(Dense(units = 32 , activation = 'relu'))
# Single sigmoid unit: output is the probability of label 1
model.add(Dense(1, activation='sigmoid'))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.01), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the LSTM model.
model.summary()

In [None]:
# Train the LSTM, holding out 30% of the training data for validation. The epoch
# count comes from the `epochs` hyperparameter instead of a hard-coded literal,
# so changing it in one place takes effect here.
history = model.fit(X_train, y_train, validation_split=0.3, epochs=epochs, batch_size=batch_size, shuffle=True, verbose = 1)

In [None]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy] and index 1 is the accuracy.
print("Accuracy of LSTM on Training Data is - " , model.evaluate(X_train,y_train)[1]*100 , "%")
print("Accuracy of LSTM on Testing Data is - " , model.evaluate(X_test,y_test)[1]*100 , "%")

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure()
plt.plot(history.history["accuracy"], label = "Train")
# val_accuracy is measured on fit()'s validation_split hold-out, not on the
# test set, so the curve is labelled accordingly.
plt.plot(history.history["val_accuracy"], label = "Validation")
plt.title("Accuracy")
plt.ylabel("Acc")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
plt.figure()
plt.plot(history.history["loss"], label = "Train")
# val_loss is measured on fit()'s validation_split hold-out, not on the test set.
plt.plot(history.history["val_loss"], label = "Validation")
plt.title("Loss")
# The y-axis shows loss values here, not accuracy.
plt.ylabel("Loss")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# Sequential.predict_classes() is no longer available in current tf.keras, so
# threshold the sigmoid probabilities at 0.5 to obtain hard 0/1 labels.
# ravel() flattens the (n, 1) prediction column to match y_test.
pred = (model.predict(X_test) > 0.5).astype("int32").ravel()
# target_names follow label order: 0 -> Fake, 1 -> True
print(classification_report(y_test, pred, target_names = ['Fake','True']))

In [None]:
# Functional-API BiLSTM classifier over variable-length integer sequences.
# Intermediate tensors are held in `x` so the global `model` (the LSTM
# classifier) is not overwritten by a layer output.
inputs = keras.Input(shape=(None,), dtype="int32")
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs; the first returns the full sequence for the second
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
models = keras.Model(inputs, outputs)
models.summary()

In [None]:
# NOTE(review): at this point `models` has not been compiled or fitted (that
# happens in the following cell), so these predictions come from freshly
# initialised weights; y_pred is also not read by any later visible cell.
y_pred = models.predict(X_test)

# Backend tensor ops used by the custom metric functions below.
from keras import backend as K

def recall_m(y_test, y_pred):
    """Recall: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    K.epsilon() in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_test * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_test, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_test, y_pred):
    """Precision: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    K.epsilon() in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_test * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    predicted_count = K.sum(predicted_mask)
    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_test, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator avoids division by zero when both
    precision and recall are 0.
    NOTE(review): when passed to compile(metrics=...), this is presumably
    evaluated per batch and averaged, which can differ from a dataset-level
    F1. Confirm before reporting it.
    """
    p = precision_m(y_test, y_pred)
    r = recall_m(y_test, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Compile with accuracy first so evaluate() returns
# [loss, accuracy, f1, precision, recall] in that order.
models.compile("adam", "binary_crossentropy", metrics=["accuracy", f1_m, precision_m, recall_m])
# Train with 30% of the training data held out for validation; the epoch count
# comes from the `epochs` hyperparameter rather than a hard-coded literal.
history = models.fit(X_train, y_train, validation_split=0.3, batch_size=batch_size, epochs=epochs, shuffle=True)

In [None]:
# "accuracy" is the first entry of the compiled metrics list, so index 1 of
# evaluate()'s result (index 0 is the loss) is the accuracy.
print("Accuracy of the BiLSTM on Training Data is - " , models.evaluate(X_train,y_train)[1]*100 , "%")
print("Accuracy of the BiLSTM on Testing Data is - " , models.evaluate(X_test,y_test)[1]*100 , "%")

In [None]:
# Training vs. validation accuracy per epoch for the most recent fit()
# (the BiLSTM when the cells run in order).
plt.figure()
plt.plot(history.history["accuracy"], label = "Train")
# val_accuracy is measured on fit()'s validation_split hold-out, not on the
# test set, so the curve is labelled accordingly.
plt.plot(history.history["val_accuracy"], label = "Validation")
plt.title("Accuracy")
plt.ylabel("Acc")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch for the most recent fit()
# (the BiLSTM when the cells run in order).
plt.figure()
plt.plot(history.history["loss"], label = "Train")
# val_loss is measured on fit()'s validation_split hold-out, not on the test set.
plt.plot(history.history["val_loss"], label = "Validation")
plt.title("Loss")
# The y-axis shows loss values here, not accuracy.
plt.ylabel("Loss")
plt.xlabel("epochs")
plt.legend()
plt.show()

In [None]:
# evaluate() returns the loss followed by the compiled metrics in order
# (accuracy, f1_m, precision_m, recall_m). All five are unpacked, but only
# loss and accuracy are printed.
loss, accuracy, f1_score, precision, recall = models.evaluate(X_test, y_test, verbose=0)
print("Loss: ", loss)
print("Accuracy: ", accuracy)