# Importing the required Libraries

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow import expand_dims
from tensorflow.math import reduce_sum
from tensorflow.nn import tanh, softmax
from tensorflow.keras.layers import Dense, Embedding, Conv1D, MaxPool1D, Input, LSTM, Bidirectional, Layer,Dot, Multiply, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
# from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.dtypes import uint8, float32
import pickle

In [None]:
!pip install -q keras-tcn
from tcn import TCN

In [None]:
from tensorflow.random import set_seed

# Seed every random source this notebook relies on, not only TensorFlow:
# NumPy's global generator is what pandas falls back to when `sample` is
# called without a `random_state`, and it also produces the random initial
# values of the embedding matrix.
SEED = 5
np.random.seed(SEED)
set_seed(SEED)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Data Loading and Preprocessing

In [None]:
# Read the True.csv and Fake.csv article tables from the mounted Drive folder.
DATASET_DIR = '/content/drive/MyDrive/data_set_1/ISOT Fake News Dataset'
true, fake = (pd.read_csv(f'{DATASET_DIR}/{csv_name}') for csv_name in ('True.csv', 'Fake.csv'))

In [None]:
# Tag every article with its class: 1 for true news, 0 for fake news
for frame, class_id in ((true, 1), (fake, 0)):
    frame["label"] = class_id

In [None]:
# Stack the two labelled frames, then shuffle the rows (frac=1 keeps every row)
input_data = pd.concat([true, fake]).sample(frac=1)

In [None]:
# Strip website URLs and IP addresses from the article text.
# Each statement rewrites input_data['text'] by deleting whatever its regex matches.

# 1) Deletes every http://, https:// or www. style URL found anywhere in the text.
input_data['text']= input_data['text'].apply(lambda x: re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", "", x))
# 2) NOTE(review): this is a raw string with doubled backslashes, so `\\S`, `\\d`
#    and `\\.` match a literal backslash followed by a character rather than the
#    usual character classes. The pattern is also anchored with ^...$, so it can
#    only match when the whole text is a single URL. It looks like this
#    substitution never fires on real articles - confirm and fix if intended.
input_data['text']= input_data['text'].apply(lambda x: re.sub(r"^(?!mailto:)(?:(?:http|https|ftp)://)(?:\\S+(?::\\S*)?@)?(?:(?:(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}(?:\\.(?:[0-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))|localhost)(?::\\d{2,5})?(?:(/|\\?|#)[^\\s]*)?$", "", x))
# 3) NOTE(review): anchored with ^...$, so a dotted IPv4 address is removed only
#    when it is the entire text; addresses embedded in a sentence are kept.
input_data['text']= input_data['text'].apply(lambda x: re.sub(r"^((25[0-5]|(2[0-4]|1[0-9]|[1-9]|)[0-9])(\.(?!$)|$)){4}$", "", x))

In [None]:
# Remove Stopwords
import nltk
nltk.download('stopwords')
# Keep the word list under its own name: rebinding `stopwords` would shadow the
# imported nltk corpus reader and make this cell fail when it is run a second
# time. A set gives O(1) membership tests instead of scanning a list per word.
english_stopwords = set(stopwords.words('english'))
input_data['text'] = input_data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in english_stopwords))

In [None]:
#STEMMING
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
nltk.download('punkt')

In [None]:
porter = PorterStemmer()


def _stem_text(text):
    """Return `text` with each whitespace-separated token replaced by its Porter stem."""
    return ' '.join(map(porter.stem, text.split()))


input_data['text'] = input_data['text'].apply(_stem_text)

# Mapping Text to Vectors

In [None]:
pip install keras-preprocessing

In [None]:
# Tokenization
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding

In [None]:
# num_words=None keeps the whole vocabulary; it replaces an arbitrary huge
# number that was only there to mean "no limit".
tokenizer = Tokenizer(num_words=None)
# Build the vocabulary from the cleaned articles ...
tokenizer.fit_on_texts(input_data['text'])
# ... then turn every article into a list of integer token ids.
sequences = tokenizer.texts_to_sequences(input_data['text'])
# word -> integer id mapping, used below to build the embedding matrix
word_index = tokenizer.word_index

In [None]:
len(sequences)

In [None]:
import tensorflow as tf

# Force every token sequence to exactly 100 ids: longer ones lose their leading
# tokens (truncating='pre'), shorter ones get trailing fill values (padding='post').
pad_options = dict(maxlen=100, dtype='int32', padding='post', truncating='pre', value=0.0)
sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, **pad_options)

In [None]:
sequences

In [None]:
# Folder holding the pre-trained GloVe vectors. os.path.join drops every
# component that precedes an absolute path, so the directory has to be the real
# location and the file name has to be relative for GLOVE_DIR to have any effect.
GLOVE_DIR = '/content/drive/MyDrive/data_set_1/ISOT Fake News Dataset'
EMBEDDING_DIM = 300  # vector size of glove.6B.300d

# word -> float32 vector, one entry per line of the GloVe text file
embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line fails
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors in Glove.' % len(embeddings_index))

# One row per token id; the +1 leaves room for id 0, which no word in
# word_index maps to (presumably the padding value - confirm against the
# padding cell).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words missing from GloVe keep their random initial row
        embedding_matrix[i] = embedding_vector

# input_length must equal the padded sequence length (maxlen=100 in the
# padding cell), not the embedding dimension.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=100)

# SPLITTING THE DATA


In [None]:
data = sequences
label = input_data["label"]

# 80 % goes to training; the remaining 20 % is halved into test and validation.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    data, label, test_size=0.20, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.50, random_state=42)
print('Size of train, validation, test:', len(y_train), len(y_val), len(y_test))

# Labels are 0/1, so each sum is the number of label-1 (true) articles in that split.
print('real & fake news in train,valt,test:')
for split_labels in (y_train, y_val, y_test):
    print(split_labels.sum(axis=0))

# MODEL

In [None]:
path = '/content/drive/MyDrive/Colab Notebooks/Project_Models/Processed_Data/'
path1 = '/content/drive/MyDrive/Colab Notebooks/Project_Models/'


def _load_pickle(file_path):
    """Unpickle and return the object stored at `file_path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Reload the previously saved splits; this overwrites any same-named variables
# produced earlier in the session.
x_train = _load_pickle(path + 'x_train.pkl')
y_train = _load_pickle(path + 'y_train.pkl')
y_test = _load_pickle(path + 'y_test.pkl')
x_test = _load_pickle(path + 'x_test.pkl')
x_val = _load_pickle(path + 'x_val.pkl')
y_val = _load_pickle(path + 'y_val.pkl')
embedding_layer = _load_pickle(path1 + 'i100embedding_layer.pkl')

In [None]:
class Attention(Layer):
    """Attention weighting along axis 1 of a 3-D input.

    For an input ``x`` the layer computes ``e = tanh(x . W + b)``, normalises
    ``e`` with a softmax along axis 1 and multiplies ``x`` by the resulting
    weights.

    Args:
        return_sequences: if True, return the weighted input with its shape
            unchanged; if False, return its sum over axis 1.
        **kwargs: standard Keras ``Layer`` arguments such as ``name`` or
            ``dtype``; accepting them lets Keras rebuild the layer from its
            config.
    """

    def __init__(self, return_sequences=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the score weights (one per feature) and biases (one per axis-1 position)."""
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        # The bias is sized by input_shape[1], so that dimension must be known.
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")

        super(Attention, self).build(input_shape)

    def call(self, x):
        """Apply the attention weights to `x` and optionally sum over axis 1."""
        # Contract the last axis of x with W to get one score per position.
        # A plain tensor op is used so that no new Keras layer object is
        # created on every call.
        e = tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        a = softmax(e, axis=1)
        output = x * a

        if self.return_sequences:
            return output

        return reduce_sum(output, axis=1)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and reloaded."""
        config = super(Attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

In [None]:
# Model input: 100 token ids per article. Token ids run up to len(word_index),
# and uint8 can only represent 0-255, so a wider integer dtype is required for
# ids above 255 to reach the embedding layer unchanged.
i = Input([100], dtype='int32')
x = embedding_layer(i)

max_len = 200      # not used in this cell
rnn_cell_size = 128
vocab_size = 250   # not used in this cell

# Bidirectional LSTM -> dropout -> attention (sequence kept) -> TCN -> sigmoid
x = Bidirectional(LSTM(rnn_cell_size,
                       return_sequences=True), name="bi_lstm_0")(x)
x = Dropout(0.30)(x)
x = Attention(return_sequences=True)(x)
# The TCN collapses the sequence (return_sequences=False) before the classifier.
x = TCN(return_sequences=False)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=i, outputs=output)
model.compile(optimizer=Adam(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# lstm, forward_h, forward_c, backward_h, backward_c =Bidirectional \
#     (LSTM(rnn_cell_size,
#       return_sequences=True))(x)

In [None]:
model.summary()

In [None]:
class myCallback(Callback):
    """Keras callback that saves the model being trained at every fifth epoch index."""

    def on_epoch_end(self, epoch, logs=None):
        """Save the model when `epoch` is a non-zero multiple of 5.

        Args:
            epoch: epoch index handed to the hook by Keras; it is also used in
                the saved model's name (``BLSTM-ATT-TCN_e<epoch>``).
            logs: metrics reported for the epoch; not used here. The default is
                None rather than a shared mutable dict.
        """
        if (epoch % 5 == 0) and (epoch != 0):
            model_name = f'BLSTM-ATT-TCN_e{epoch}'  # add model name (name_) as required
            model_path = '/content/drive/MyDrive/Colab Notebooks/Project_Models/BLSTM_ATT_TCN_Models'  # add model path as required
            self.model.save(os.path.join(model_path, model_name))


callback = myCallback()

callbacks = [callback]

In [None]:
BATCH_SIZE = 64
EPOCHS = 40
# Number of full batches per epoch for training and validation
TRAINING_STEPS = len(x_train) // BATCH_SIZE
VALIDATION_STEPS = len(x_val) // BATCH_SIZE

# batch_size must be passed explicitly: the step counts above assume batches of
# BATCH_SIZE, whereas fit() would otherwise fall back to its default batch size
# and each epoch would cover only part of the data.
# validation_data is given as the (inputs, targets) tuple that fit() documents.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    steps_per_epoch=TRAINING_STEPS,
                    validation_data=(x_val, y_val),
                    validation_steps=VALIDATION_STEPS,
                    epochs=EPOCHS,
                    callbacks=[callback],
                    verbose='auto')

In [None]:
path = '/content/drive/MyDrive/Colab Notebooks/Project_Models/Processed_Data/'
# NOTE(review): the model-comparison cell further down reads the history files
# from the Project_Models/ folder (path1), not from Processed_Data/ - confirm
# which folder is intended.
# `with` closes the file so the pickle is fully flushed to Drive.
with open(path + 'history_BLSTM_ATT_TCN.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)

# Metrics and Graphs


In [None]:
# Training History
print(history.history.keys())

# Accuracy per epoch: training curve first, validation curve second, so the
# two curves line up with the ['train', 'test'] legend entries.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# Loss per epoch, same curve order as above
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Reload the checkpoint named BLSTM-ATT-TCN_e35 from the checkpoint folder.
checkpoint_dir = '/content/drive/MyDrive/Colab Notebooks/Project_Models/BLSTM_ATT_TCN_Models'
model = load_model(os.path.join(checkpoint_dir, 'BLSTM-ATT-TCN_e35'))

In [None]:
# Model outputs for the test split, with singleton axes removed
y_pred = np.squeeze(model.predict(x_test))
y_pred

In [None]:
def p(t):
    """Map a score to a class id: 1 when it is at least 0.5, otherwise 0."""
    return 1 if t >= 0.5 else 0


# Apply the threshold element-wise to the predicted scores
y_pred = np.vectorize(p)(y_pred)

In [None]:
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each count in the middle of its cell (x is the column, y is the row)
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

In [None]:
# Report each classification metric on the test split, three decimals each
for template, metric in (('Accuracy: %.3f', accuracy_score),
                         ('Precision: %.3f', precision_score),
                         ('Recall: %.3f', recall_score),
                         ('F1 Score: %.3f', f1_score)):
    print(template % metric(y_test, y_pred))

In [None]:
model.evaluate(x_test, y_test, verbose = 0)

In [None]:
dot_img_file = '/tmp/model_1.png'
tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)

In [None]:
# Accumulators for the per-model metric curves; four independent empty lists
accuracy, val_accuracy, loss, val_loss = [], [], [], []

In [None]:
# File names of the pickled training histories, one per model being compared
final = [f'history_{model_tag}.pkl'
         for model_tag in ('CNN_LSTM', 'CNN_BLSTM_ATT', 'BLSTM_ATT_TCN')]

In [None]:
from nltk.tag.hunpos import find_binary
# Collect the four metric curves of every saved training history.
for history_file_name in final:
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself. `with` closes each file once it has been read.
    with open(path1 + history_file_name, 'rb') as history_file:
        history = pickle.load(history_file)
    accuracy.append(history.history['accuracy'])
    val_accuracy.append(history.history['val_accuracy'])
    loss.append(history.history['loss'])
    val_loss.append(history.history['val_loss'])

In [None]:
print (*accuracy)

In [None]:
# Create the figure before drawing: calling plt.figure() after the curves have
# been plotted opens a second, empty figure.
f = plt.figure()
# One training-accuracy curve per model
for curve in accuracy:
    plt.plot(curve)
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['Model 1', 'Model 2', 'Model 3'], loc='lower right')
plt.show()

In [None]:
# One validation-accuracy curve per model
for curve in val_accuracy:
    plt.plot(curve)
plt.ylabel('val_accuracy')
plt.xlabel('epoch')
# One legend entry per plotted curve; 'lower right' is the valid location name.
plt.legend([f'Model{k}' for k in range(1, len(val_accuracy) + 1)], loc='lower right')
plt.show()

In [None]:
# One training-loss curve per model
for curve in loss:
    plt.plot(curve)
plt.ylabel('loss')
plt.xlabel('epoch')
# One legend entry per plotted curve, so labels and curves cannot get out of step
plt.legend([f'Model{k}' for k in range(1, len(loss) + 1)], loc='upper right')
plt.show()

In [None]:
# One validation-loss curve per model
for curve in val_loss:
    plt.plot(curve)
plt.ylabel('val_loss')
plt.xlabel('epoch')
# One legend entry per plotted curve, so labels and curves cannot get out of step
plt.legend([f'Model{k}' for k in range(1, len(val_loss) + 1)], loc='upper right')
plt.show()