<a href="https://colab.research.google.com/github/lokesharma-dev/Fake-News-Detection/blob/master/VAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import Libararies**

In [0]:
import numpy as np
import re
import random
import time
#------------------- Text preprocessing
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#------------------- Tensorflow
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Bidirectional

## **spaCy Preprocessing (Lokesh)**

### Class Spacy

In [0]:
class Spacy(object):

    def __init__(self):
        # python -m spacy download en_core_web_lg
        self.nlp = spacy.load("en_core_web_lg")
        pass

    def deNoise(self, text):
        text = re.sub(r'[“”""]', '', text) # removes quotes
        text = text.replace("'s", '')
        text = re.sub(r'[-]', ' ', text) # helps in splitting doc into sentences
        text = re.sub(r'http[\w:/\.]+', '', text) # removing urls
        text = re.sub(r'[^\.\w\s]', '', text) # removing everything but characters and punctuation
        text = re.sub(r'\.', '.', text) # replace periods with a single one
        text = re.sub(r'\n', ' ', text) # removing line break
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'\s\s+', ' ', text)  # replace multiple whitespace with one
        return text

    def stopWords(self, text):
        tokens = ""
        spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
        text = text.split(" ")
        for word in text:
            if word not in spacy_stopwords:
                tokens = tokens + " " + word
        return tokens

    def lemmatize(self, tokens):
        lemma_token = ""
        tokens_object = self.nlp(tokens)
        # lemma_token = [token.lemma_ for token in tokens_object]
        # lemma_token = ''.join(lemma_token) # converts list to string
        for token in tokens_object:
            lemma_token = lemma_token + " " + token.lemma_
        lemma_token = re.sub(r'\s\s+', ' ', lemma_token)  # replace multiple whitespace with one
        lemma_token = lemma_token.strip() # removes trailing whitespaces
        return  lemma_token

    def set_custom_boundaries(self, doc):
        for token in doc[:-1]:
            if token.text == '--':
                doc[token.i+1].is_sent_start = True
        return doc

    def sentence_detect(self, text):
        self.nlp.add_pipe(self.set_custom_boundaries, before='parser')
        doc = self.nlp(text)
        sentences = list(doc.sents)
        for sentence in sentences:
            print(sentence)

    def tokenize(self, text):
        doc = self.nlp(text)
        print([token.text for token in doc])

    def orchestrate(self, text):
        return self.lemmatize(self.stopWords(self.deNoise(text)))

### Class Embedding

In [0]:
class Embedding(object):

    def __init__(self):
        # Parameters
        self.MAX_VOCAB_SIZE = 1000000  # maximum no of unique words
        self.MAX_DOC_LENGTH = 500  # maximum no of words in each sentence
        self.EMBEDDING_DIM = 300  # Embeddings dimension from Glove directory
        self.GLOVE_DIR = '/content/drive/My Drive/Colab Notebooks/glove/glove.6B/glove.6B.' + str(self.EMBEDDING_DIM) + 'd.txt'

    def tokenize_padding(self, docs):
        # Tokenize & pad sequences
        tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE, oov_token='-EOS-')
        tokenizer.fit_on_texts(docs)
        encoded_docs = tokenizer.texts_to_sequences(docs)
        word_index = tokenizer.word_index
        print('Vocabulary size :', len(word_index))
        sequences = pad_sequences(encoded_docs, padding='post', maxlen=self.MAX_DOC_LENGTH)
        return [word_index, sequences]

    def load_glove(self):
        embeddings_index = {}
        f = open(self.GLOVE_DIR, encoding='utf-8')
        print('Loading Glove from: ', self.GLOVE_DIR, '...', end='')
        for line in f:
            values = line.split()
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        f.close()
        print('Found %s word vectors.' % len(embeddings_index))
        print('\nDone.\nProcedding with Embedded Matrix...', end='')
        return embeddings_index

    def embedding_matrix(self, word_index, embeddings_index):
        # Create an embedding matrix
        # first create a matrix of zeros, this is our embedding matrix
        embeddings_matrix = np.zeros((len(word_index) + 1, self.EMBEDDING_DIM))
        # embeddings_matrix = np.random.random(((20568),EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector
            else:
                # doesn't exist, assign a random vector
                embeddings_matrix[i] = np.random.random(self.EMBEDDING_DIM)
        print('\nCompleted')
        return embeddings_matrix

### __main__()

In [0]:
# Load dataset
df = pd.read_csv('GBVAT/data/processed_datasets/celebrityDataset.csv')

# Feature Engineering
df.nunique()
df.isna().sum()
df.Subject.fillna('', inplace=True)

x = df.Subject + " " + df.Content
y = pd.Series([0 if row == 'Fake' else 1 for row in df.Label])  # Series is 1D array but with same dtype

S = Spacy()
start = time.time()
docs = [S.orchestrate(row) for row in x]
end = time.time()
print("Cleaning the document took {} seconds".format(round(end - start)))

E = Embedding()
sequences = E.tokenize_padding(docs)
word_index = sequences[0]
sequences = sequences[1]
print('Shape of data tensor:', sequences.shape)
print('Shape of label tensor', y.shape)

embeddings_index = E.load_glove()




## **Load & Split Dataset**

In [0]:
# data = np.load('/content/data.npy',allow_pickle=True)
# label = np.load('/content/label.npy',allow_pickle=True)

# Shuffle data random before splitting
indices = np.arange(sequences.shape[0])
random.Random(1).shuffle(indices)
data = sequences[indices]
labels = y[indices]

num_test_samples = int(0.2 * data.shape[0])
x_train = data[:-num_test_samples]
y_train = labels[:-num_test_samples]
x_test = data[-num_test_samples:]
y_test = labels[-num_test_samples:]

## **VAT Model (Lokesh)**

### Configuration

In [0]:
MAX_VOCAB_SIZE = 1000000 # maximum no of unique words
MAX_DOC_LENGTH = 500 # maximum no of words in each sentence
EMBEDDING_DIM = 300 # Embeddings dimension from Glove directory

inputs = Input(shape=(MAX_DOC_LENGTH,)) # TensorShape([None, 200])

### Network Architecture

In [0]:
def embeddingLayer():
  network = Sequential()
  network.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM))
  p_logit = network(inputs) # TensorShape([None, 200, 128])
  return [network, p_logit]

### User defined functions

In [0]:
def compute_kld(p_logit, q_logit):
  p = tf.nn.softmax(p_logit)
  q = tf.nn.softmax(q_logit)
  # kl_score = tf.reduce_sum(tf.where(condition=(p==0), x=tf.zeros(p.shape, tf.float64),y = p * tf.log(p/q)))
  kl_score = tf.reduce_sum( p * (tf.math.log(p+1e-16) - tf.math.log(q+1e-16)), axis = 1)
  return kl_score # lower kl means closer the distributions are
# Plot p | q | kl
# make sure p is not zero

def make_unit_norm(x):
  x_norm = x/(tf.reshape(tf.sqrt(tf.reduce_sum(tf.pow(x,2), axis=1)), [-1,1]) + 1e-16)
  return x_norm

### Vat Loss

In [0]:
def vatLoss():
  returnList = embeddingLayer()
  network = returnList[0]
  p_logit = returnList[1]

  # Random noise to be substituted in future with AM
  r = tf.random.uniform(shape=tf.shape(inputs)) # TensorShape([None, 200])
  r = make_unit_norm(r)
  p_logit_r = network(inputs + 10*r) # Perturbations added just after Embedding layer 
  # TensorShape([None, 200, 128])

  # with tf.GradientTape() as tape:
  #   tape.watch(r)
  #   r = tf.reduce_sum(r)
  #   # p_logit_r = model_vat(inputs + 10*r)
  #   kl_score = compute_kld(p_logit, p_logit_r)
  # grads = tape.gradient(kl_score, r)
  # grads.shape

  kl_score = tf.reduce_mean(compute_kld(p_logit, p_logit_r)) # reduce_mean because kl scores have 128 dimensions from Embedding layer
  tf.compat.v1.disable_eager_execution() # Fix this for future with tape
  grads = tf.gradients(kl_score, r) # list
  kl_grads = [grad if grad is not None else tf.zeros_like(r)for r, grad in zip([r], grads)][0] # TensorShape([None, 200])

  # Adversarial perturbation
  r_vadv = tf.stop_gradient(kl_grads)
  r_vadv = make_unit_norm(r_vadv) # TensorShape([None, 200])
  r_vadv = inputs  + r_vadv

  # During GD don't train your logits
  p_logit_no_gradient = tf.stop_gradient(p_logit) # same dimention as p_logit # TensorShape([None, 200, 128])

  p_logit_r_adv = network(r_vadv) 
  
  # KLD(p_logit_no_gradient|p_logit_r_adv)
  vat_loss = tf.reduce_mean(compute_kld(p_logit_no_gradient, p_logit_r_adv)) # Scalar
  return vat_loss

### Variant 1

In [0]:
returnList = embeddingLayer()
p_logit = returnList[1]
layer1 = Bidirectional(LSTM(units=128))(p_logit)
layer2 = Dense(units=2, activation='sigmoid')(layer1)
output_layer = Dense(units=1)(layer2)
model = Model(inputs, output_layer)
vat_loss = vatLoss()
model.add_loss(vat_loss)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True )

### Cross-Validation

In [0]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),loss= 'binary_crossentropy',metrics=['accuracy'])
model.metrics_names.append('vat_loss')
model.metrics.append(vat_loss)
model.fit(x_train, y_train, epochs= 3, validation_split=0.2, shuffle= True, batch_size=32)

In [92]:
score, acc = model.evaluate(x_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 7.777111825942993
Test accuracy: 0.49
