<a href="https://colab.research.google.com/github/lokesharma-dev/Fake-News-Detection/blob/master/08_07_20_VAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import random
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Pre-processing: placeholder, to be replaced; this is just a sample version taken from GitHub.

In [None]:
def clean_text(text):
    """Normalise a raw document into a string of stemmed, space-separated tokens.

    The text is lower-cased and split on whitespace, English stop words and
    tokens shorter than three characters are dropped, contractions and
    punctuation are rewritten by a fixed sequence of regular-expression
    substitutions, and every remaining token is Snowball-stemmed.

    Args:
        text: Raw input string.

    Returns:
        str: The cleaned text, with tokens joined by single spaces.
    """
    # The original began with ``text.translate(string.punctuation)``. A plain
    # string used as a translation table is indexed by code point, so that call
    # left printable characters untouched and rewrote control characters
    # instead (newline -> "+", tab -> "*"). It is dropped: punctuation is
    # handled by the substitutions below, which need apostrophes intact in
    # order to expand contractions.

    ## Convert words to lower case, split them and remove stop words
    stops = set(stopwords.words("english"))
    kept = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(kept)

    ## Clean the text (order matters: contractions are expanded before the
    ## bare apostrophe is blanked out)
    # NOTE(review): "+-=" inside the first character class is a range from "+"
    # to "=", so ":", ";" and "<" are kept as well - confirm this is intended.
    substitutions = (
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # Was r"\0s": in a regex "\0" is the NUL character, so the rule could
        # never fire. The literal "0s" (as in "1990s") is what gets collapsed.
        (r"0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    ## Stemming
    stemmer = SnowballStemmer('english')
    return " ".join(stemmer.stem(word) for word in text.split())

If the input is the original text file, load and clean it here; the rest of the pipeline has also worked with pre-tokenized `.npy` inputs.

In [None]:
# Read the raw reviews: one document per line of the text file.
path = '/content/drive/My Drive/Colab Notebooks/Imdb/x_train.txt'
with open(path, 'r') as file:
  x_train = list(file)

# A single 'Subject' column holds the review text.
df = pd.DataFrame(x_train, columns=['Subject'])
print('Before cleaning',df.head())

# Run every review through the text-cleaning function.
df['Subject'] = df['Subject'].map(clean_text)
print('After cleaning', df.head())

Before cleaning                                              Subject
0  Working with one of the best Shakespeare sourc...
1  Well...tremors I, the original started off in ...
2  Ouch! This one was a bit painful to sit throug...
3  I've seen some crappy movies in my life, but t...
4  "Carriers" follows the exploits of two guys an...
After cleaning                                              Subject
0  work one best shakespear sourc film manag cred...
1  well tremor origin start 1990 found movi quit ...
2  ouch ! one bit pain sit through cute amus prem...
3  i have seen crappi movi life one must among wo...
4  carrier follow exploit two guy two gal stolen ...


In [None]:
# --- Tokenise: cleaned reviews -> integer sequences padded/truncated to 300 ---
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Subject'])
train_sequences = tokenizer.texts_to_sequences(df['Subject'])
x_train = pad_sequences(train_sequences, maxlen=300, padding='post')
vocab = tokenizer.word_index
print('Length of Vocabulary: ',len(vocab))

# --- Labels ---
path = '/content/drive/My Drive/Colab Notebooks/Imdb/y_train.npy'
y_train = np.load(path).astype('int32')
print('Labels:', type(y_train), len(y_train))

# --- Deterministic shuffle (fixed seed) applied identically to inputs and labels ---
inds = np.arange(len(x_train))
random.Random(1).shuffle(inds)
data, labels = x_train[inds], y_train[inds]

# --- Hold out the last 20% of the shuffled rows for testing ---
num_test_samples = int(0.2 * len(data))
print('Split ratio {}/{}:'.format(num_test_samples, len(data)))
x_train, x_test = data[:-num_test_samples], data[-num_test_samples:]
y_train, y_test = labels[:-num_test_samples], labels[-num_test_samples:]
print("Training size:", x_train.shape, y_train.shape)
print("Testing size:", x_test.shape, y_test.shape)

# --- Batched tf.data pipelines, reshuffled on every pass ---
BATCH_SIZE = 1024
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(len(x_train), seed=1, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
)

test_dataset = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .shuffle(len(x_test), seed=1, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
)

Length of Vocabulary:  52207
Labels: <class 'numpy.ndarray'> 24999
Split ratio 4999/24999:
Training size: (20000, 300) (20000,)
Testing size: (4999, 300) (4999,)


Create an embedding matrix

In [None]:
EMBEDDING_FILE = '/content/drive/My Drive/Colab Notebooks/Imdb/glove.6B.50d.txt'

def get_coefs(word, *arr):
  """Return ``(word, vector)`` for one already-split GloVe line.

  ``arr`` holds the coefficient strings; they are converted to a float32 array.
  """
  return word, np.asarray(arr, dtype='float32')

# Read inside a ``with`` block so the file handle is closed once the index is
# built (the original left it open), and pin the encoding instead of relying on
# the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
  embeddings_index = dict(get_coefs(*line.rstrip('\n').split(" ")) for line in glove_file)

# Row i holds the vector for the word whose tokenizer index is i; row 0 stays
# zero (the tokenizer's word_index starts at 1).
embedding_matrix = np.zeros((len(vocab) + 1, 50))
invalid = 0
# Seeded generator so the random vectors for out-of-vocabulary words are
# reproducible across runs.
oov_rng = np.random.RandomState(1)
# The original loop stopped at ``index > len(vocab) - 1``, which skipped the
# word with the highest index and left its row all-zero even though the matrix
# has a row for it. Every vocabulary entry is filled now.
for word, index in vocab.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[index] = embedding_vector
  else:
    embedding_matrix[index] = oov_rng.uniform(low=0.01, high=0.05, size=50)
    invalid += 1
print('Words not found in glove: ', invalid)


Words not found in glove:  20951


EagerTensor VAT Model

In [None]:
def compute_kld(p_logit, q_logit):
  """Per-example KL divergence KL(p || q) between sigmoid-activated logits.

  Each logit is passed through a sigmoid and treated as the parameter of an
  independent Bernoulli variable; the divergences of all units are summed over
  axis 1.

  The original only summed ``p * (log p - log q)``. Without the complementary
  ``(1 - p)`` term that quantity is not a KL divergence and can be negative.

  Args:
    p_logit: Reference logits, indexed as (example, unit).
    q_logit: Logits to compare against, same shape as ``p_logit``.

  Returns:
    Tensor with one non-negative score per example (axis 1 reduced).
  """
  tiny = 1e-16  # keeps log() finite when a probability saturates at 0 or 1
  p = tf.nn.sigmoid(p_logit)
  q = tf.nn.sigmoid(q_logit)
  on_term = p * (tf.math.log(p + tiny) - tf.math.log(q + tiny))
  off_term = (1.0 - p) * (tf.math.log(1.0 - p + tiny) - tf.math.log(1.0 - q + tiny))
  kl_score = tf.reduce_sum(on_term + off_term, axis = 1)
  return kl_score

def createEmbedding(features):
  """Map a batch of padded token-id sequences to their embedding vectors.

  A small Keras model consisting of a single non-trainable ``Embedding`` layer,
  initialised from ``embedding_matrix``, is built and applied to ``features``.

  Args:
    features: Batch of token-id sequences of length 300.

  Returns:
    The embedded batch produced by the lookup model.
  """
  token_ids = Input(shape=(300,))
  frozen_lookup = Embedding(input_dim=len(vocab)+1,
                            output_dim = 50,
                            weights = [embedding_matrix],
                            trainable=False)
  lookup_model = Model(token_ids, frozen_lookup(token_ids))
  return lookup_model(features)

def createModel(embedding_features):
  """Run ``embedding_features`` through the shared LSTM encoder.

  The encoder (LSTM(128) followed by Dense(32, relu)) is built on the first
  call and reused afterwards. The original built a new, randomly initialised
  network on every call, so logits returned for clean and perturbed inputs came
  from unrelated networks and could not be meaningfully compared.

  Re-running the cell that defines this function discards the cached encoder.

  Args:
    embedding_features: Embedded batch matching the (300, 50) input shape.

  Returns:
    Tuple ``(emb_tensor, output, logits)``: the encoder's symbolic input
    tensor, its symbolic output tensor, and the encoder applied to
    ``embedding_features``.
  """
  shared = getattr(createModel, '_shared', None)
  if shared is None:
    emb_tensor = Input(shape=(300,50,))
    hidden = LSTM(units=128)(emb_tensor)
    output = Dense(units=32, activation='relu')(hidden)
    model = Model(inputs=emb_tensor, outputs=output)
    shared = (emb_tensor, output, model)
    createModel._shared = shared
  emb_tensor, output, model = shared
  logits = model(embedding_features)
  return emb_tensor, output, logits

def calculateGradient(clean_features, noised_features):
  """Gradient of KL(clean || noised) with respect to the noised embeddings.

  Both forward passes go through the same network, so the divergence reflects
  the perturbation only. The original called ``createModel`` twice and
  therefore compared the outputs of two separately built networks.

  Args:
    clean_features: Embedded batch without perturbation.
    noised_features: Perturbed embedded batch; the only tensor watched by the
      tape.

  Returns:
    Gradient tensor with the same shape as ``noised_features``.
  """
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(noised_features)
    emb_tensor, output, p_logit = createModel(clean_features)
    # Wrap the layers that produced p_logit so the perturbed batch is scored by
    # exactly the same weights.
    shared_net = Model(inputs=emb_tensor, outputs=output)
    p_logit_r = shared_net(noised_features)
    kl_score = compute_kld(p_logit, p_logit_r)
  grads = tape.gradient(kl_score, noised_features)
  return grads

def custom_loss(vat_loss):
  """Build a Keras loss that adds ``vat_loss`` to the binary cross-entropy.

  Args:
    vat_loss: Tensor captured by the returned closure and added to the
      cross-entropy on every call.

  Returns:
    A ``loss(true, pred)`` callable suitable for ``model.compile``.
  """
  def loss(true, pred):
    supervised_term = binary_crossentropy(true, pred)
    return tf.math.add(vat_loss, supervised_term)
  return loss


In [None]:
# Take one batch and push it through the embedding and the encoder, once clean
# and once with a constant offset of `epsilon` added to every embedding value.
epsilon = 0.01
features, labels = next(iter(train_dataset))

clean_features = createEmbedding(features)
noised_features = clean_features + epsilon
for _title, _tensor in (('Clean Embedding: ', clean_features),
                        ('Noised Embedding: ', noised_features)):
  print(_title, type(_tensor), _tensor.shape)

clean_ip_tensor, clean_op_tensor, p_logit = createModel(clean_features)
noise_ip_tensor, noise_op_tensor, p_logit_r = createModel(noised_features)
for _title, _tensor in (('P_Logit: ', p_logit), ('P_Logit_R: ', p_logit_r)):
  print(_title, type(_tensor), _tensor.shape)

Clean Embedding:  <class 'tensorflow.python.framework.ops.EagerTensor'> (1024, 300, 50)
Noised Embedding:  <class 'tensorflow.python.framework.ops.EagerTensor'> (1024, 300, 50)
P_Logit:  <class 'tensorflow.python.framework.ops.EagerTensor'> (1024, 32)
P_Logit_R:  <class 'tensorflow.python.framework.ops.EagerTensor'> (1024, 32)


Calculate Adversary

In [None]:
grads = calculateGradient(clean_features, noised_features)
# tf.math.l2_normalize returns g / ||g|| (the unit direction), not the norm.
# The original then computed grads / norm_ball, which collapses to the scalar
# ||g|| in every position (NaN wherever the gradient is 0) and loses the
# direction entirely. Normalise per example over the (time, embedding) axes
# and scale by epsilon instead.
norm_ball = tf.math.l2_normalize(tf.stop_gradient(grads), axis=[1, 2], epsilon=1e-12)
# r_vadv = epsilon * g / ||g||: step along the gradient that increases the
# divergence (Miyato et al.); the original's extra "* -1" pointed the other way.
rvadv = epsilon * norm_ball
vadv_features = tf.add(clean_features, rvadv)
vat_ip_tensor, vat_op_tensor, q_logit = createModel(vadv_features)

print('Adversarial Embedding: ', type(vadv_features), vadv_features.shape) 
print('Q_logit: ',type(q_logit), q_logit.shape)

Adversarial Embedding:  <class 'tensorflow.python.framework.ops.EagerTensor'> (1024, 300, 50)
Q_logit:  <class 'tensorflow.python.framework.ops.EagerTensor'> (1024, 32)


Build Model

In [None]:
# Binary classification head. A softmax over a single unit always outputs 1.0,
# so the original head could not learn anything; sigmoid gives a probability
# that pairs with binary cross-entropy.
p = Dense(units=1, activation='sigmoid')(clean_op_tensor) # Tensor
print('Prediction: ', type(p), p)
model = Model(inputs=clean_ip_tensor, outputs=p)
model.summary()

# VAT term: divergence between the clean prediction (held fixed) and the
# prediction on the adversarially perturbed input, as in the training loop.
# The original compared against p_logit_r, the probe perturbation that is only
# meant for estimating the gradient.
vat_loss = compute_kld(tf.stop_gradient(p_logit), q_logit)
model.compile(optimizer='Adam', loss= custom_loss(vat_loss), metrics=['accuracy'])

print('VAT Loss : ', type(vat_loss), vat_loss)
model.fit(clean_features, labels, batch_size=BATCH_SIZE)

Prediction:  <class 'tensorflow.python.framework.ops.Tensor'> Tensor("dense_5/Identity:0", shape=(None, 1), dtype=float32)
Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 300, 50)]         0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 95,809
Trainable params: 95,809
Non-trainable params: 0
_________________________________________________________________
VAT Loss :  <class 'tensorflow.python.framework.ops.EagerTensor'> tf.Tensor(
[-0.02090424 -0.02090424 -0.02090424 ... -0.02090424 

<tensorflow.python.keras.callbacks.History at 0x7f97d29e9908>

In [None]:
N_Epochs = 1
epsilon = 0.1

# NOTE(review): a new Dense head and Model are created and compiled for every
# batch, so whatever the head learns in one batch is not carried to the next.
# vat_loss is also computed outside model.fit and captured by the loss closure,
# so it enters the Keras loss as a constant. Both need a proper custom training
# step; they are left as they were here.
for epoch in range(N_Epochs):
  print('Epoch No {} ------'.format(epoch+1))
  for batch_no, (features, labels) in enumerate(train_dataset, start=1):
    print('Batch No: ', batch_no)
    clean_emb = createEmbedding(features)
    noised_emb = tf.add(clean_emb, epsilon)

    clean_ip_tensor, clean_op_tensor, p_logit = createModel(clean_emb)
    _, _, p_logit_r = createModel(noised_emb)

    grads = calculateGradient(clean_emb, noised_emb)
    # l2_normalize already yields the unit direction g / ||g||. The original
    # divided grads by it, which collapses to the scalar ||g|| everywhere (NaN
    # where the gradient is 0). Normalise per example and scale by epsilon.
    norm_ball = tf.math.l2_normalize(tf.stop_gradient(grads), axis=[1, 2], epsilon=1e-12)
    # r_vadv = epsilon * g / ||g|| (Miyato et al.): move in the direction that
    # increases the divergence; the original's "* -1" reversed it.
    rvadv = epsilon * norm_ball
    vadv_features = tf.add(clean_emb, rvadv)
    vat_ip_tensor, vat_op_tensor, q_logit = createModel(vadv_features)
    # Sigmoid, not softmax: a softmax over one unit is constantly 1.0.
    p = Dense(units=1, activation='sigmoid')(clean_op_tensor) # Tensor
    model = Model(inputs=clean_ip_tensor, outputs=p)
    p_logit_no_gd = tf.stop_gradient(p_logit)
    vat_loss = compute_kld(p_logit_no_gd, q_logit)
    model.compile(optimizer='Adam', loss = custom_loss(vat_loss), metrics=['accuracy'])
    model.fit(clean_emb, labels, batch_size=BATCH_SIZE)

Epoch No 1 ------
Batch No:  1
Batch No:  2
Batch No:  3
Batch No:  4
Batch No:  5
Batch No:  6
Batch No:  7
Batch No:  8
Batch No:  9
Batch No:  10
Batch No:  11
Batch No:  12
Batch No:  13
Batch No:  14
Batch No:  15
Batch No:  16
Batch No:  17
Batch No:  18
Batch No:  19
Batch No:  20


Performance is not a concern at this stage: the model architecture is only a placeholder, and this code block merely demonstrates the VAT procedure.

To Dos: 

Once the Mean-Teacher code is ready and works for unlabelled examples, we will move on to the final integration.