In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset and GloVe paths
# used later are relative ('drive/My Drive/...'), so this assumes the notebook's
# working directory is /content -- TODO confirm when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import re
import gc
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
import nltk
import tensorflow.compat.v1 as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, GRU
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn
from tensorflow.keras.layers import GRUCell
from keras.engine.topology import Layer
from keras.models import Model
# Download NLTK's 'punkt' tokenizer data (presumably required by the
# sent_tokenize / word_tokenize helpers imported above).
nltk.download('punkt')
# The model below is written against the TF1 graph/session API
# (tf.placeholder, tf.Session), so TF2 behaviour is switched off up front.
tf.disable_v2_behavior()

In [None]:
# Hyper-parameters shared by the preprocessing and model cells.
maxlen = 80              # word slots kept per sentence (last axis of the token tensor)
max_sentences = 15       # sentence slots kept per review (middle axis of the token tensor)
max_words = 20000        # vocabulary cap handed to Tokenizer(num_words=...)
embedding_dim = 100      # width of each row of the embedding matrix
validation_split = 0.2   # fraction of samples held out for validation
hidden_size=150          # units per direction of the word-level GRU
attention_size = 50      # size of the attention projection

In [None]:
def clean_text(text):
  '''
  Normalise a raw string for tokenization.

  Applies three regex substitutions in order: characters outside the
  7-bit ASCII range are removed, every newline becomes a single space,
  and runs of spaces are collapsed to one space. Returns the new string.
  '''
  substitutions = (
      (r'[^\x00-\x7f]', r''),  # strip non-ASCII characters
      (r'\n', ' '),            # newline -> space
      (r" +", " "),            # squeeze repeated spaces
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return text

Clean the text and generate numpy arrays for instances and their labels

In [None]:
# Load the CSV and derive three parallel structures, one entry per row:
#   X       -- cleaned review text (column 1)
#   reviews -- the same text split into sentences; the tokenization cell fills
#              its (sample, sentence, word) tensor from this list
#   y       -- label columns (column 2 onwards)
data = pd.read_csv('drive/My Drive/TDL/Data/train.csv') # path to dataset

# Explicit positional access (.iloc): integer keys on a string-labelled Series
# are deprecated in recent pandas.
cleaned_texts = [clean_text(text) for text in data.iloc[:, 1]]

# `reviews` was previously left empty, so every sample ended up as all-zero
# padding downstream; populate it with the sentence split of each review.
reviews = [sent_tokenize(text) for text in cleaned_texts]

# dtype=object stores references to the strings instead of a fixed-width
# unicode array padded to the longest review.
X = np.array(cleaned_texts, dtype=object)
y = np.array(data.iloc[:, 2:].values.tolist())

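Tokenize each review into a fixed-size (sentences × words) grid of word indices, then shuffle the samples and split them into training and validation sets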

In [None]:
# Fit the vocabulary on the cleaned review texts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
word_to_id = tokenizer.word_index

# data[i, j, k] = vocabulary id of the k-th kept word of sentence j in review i.
# Unused slots stay 0 (padding).
data = np.zeros((X.shape[0], max_sentences, maxlen), dtype='int32')
for i, review in enumerate(reviews):
    for j, sentence in enumerate(review[:max_sentences]):
        k = 0
        for word in text_to_word_sequence(sentence):
            if k >= maxlen:
                break
            # .get(): a token missing from the vocabulary is skipped instead of
            # raising KeyError. Ids at or above max_words are dropped as before.
            word_id = word_to_id.get(word)
            if word_id is not None and word_id < max_words:
                data[i, j, k] = word_id
                k += 1

# Shuffle samples and labels with the SAME permutation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]
nb_validation_samples = int(validation_split * data.shape[0])
# Split on an explicit index: `[:-0]` would yield an empty training set.
split_at = data.shape[0] - nb_validation_samples

# Labels are sliced from the shuffled `labels`; slicing the unshuffled `y`
# (as before) pairs each sample with another sample's labels.
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [None]:
# Drop the names bound to the raw text array and the shuffled label array,
# then ask the garbage collector to run.
del X, labels
gc.collect()

Load the pre-trained GloVe vectors and build the embedding matrix for the words in the vocabulary

In [None]:
# Parse the pre-trained vectors into {token: float32 vector}.
# Each line is a token followed by its coefficients, separated by whitespace.
embeddings_index = {}
# `with` closes the file even if parsing fails. The encoding is set explicitly
# (GloVe files are distributed as UTF-8) so the platform default is not used.
with open('drive/My Drive/TDL/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f: # path to pre trained embeddings
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
# The token tensor only ever contains ids below max_words, so rows beyond that
# can never be looked up; do not allocate them.
num_rows = min(max_words, len(word_index) + 1)
# Rows start random, so words without a pre-trained vector keep a random embedding.
embedding_matrix = np.random.random((num_rows, embedding_dim))
for word, i in word_index.items():
    if i >= num_rows:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

del embeddings_index
gc.collect()

Define the word and sentence level attention layers

In [None]:
def word_attention(inputs, att_size):
    """Attention pooling over the time axis of a bidirectional RNN output.

    Args:
        inputs: sequence of tensors to be joined along axis 2; the caller
            passes the (forward, backward) outputs of a bidirectional RNN.
        att_size: width of the intermediate attention projection.

    Returns:
        The attention-weighted sum of the joined inputs over axis 1, i.e. a
        tensor of shape (n, D) where D is the joined feature size.

    Every call creates its own projection matrix, bias and context vector.
    """
    # Join the per-direction outputs along the feature axis -> (n, T, D)
    inputs = tf.concat(inputs, 2)
    feature_dim = inputs.shape[2].value  # D, read from the static shape

    # Trainable parameters
    projection = tf.Variable(tf.random_normal([feature_dim, att_size], stddev=0.1))  # (D, att_size)
    bias = tf.Variable(tf.random_normal([att_size], stddev=0.1))                      # (att_size,)
    context = tf.Variable(tf.random_normal([att_size], stddev=0.1))                   # (att_size,)

    # hidden = tanh(inputs . projection + bias) -> (n, T, att_size)
    hidden = tf.tanh(tf.tensordot(inputs, projection, axes=1) + bias)
    # One unnormalised score per time step -> (n, T)
    step_scores = tf.tensordot(hidden, context, axes=1, name='vu')
    # Normalise across the time steps -> (n, T)
    scores = tf.nn.softmax(step_scores)

    # Weight every time step by its score and sum the steps away -> (n, D)
    return tf.reduce_sum(inputs * tf.expand_dims(scores, -1), 1)

def sentence_attention(inputs, att_size):
    """Attention pooling over the sentence axis of a bidirectional RNN output.

    The computation is the same as in `word_attention` (tanh projection,
    dot product with a context vector, softmax over axis 1, weighted sum),
    so this delegates to it instead of keeping a copy. The call creates its
    own trainable variables, so sentence-level parameters stay separate
    from the word-level ones.

    Args:
        inputs: (forward, backward) outputs of the sentence-level
            bidirectional RNN, each (n, sentences, units).
        att_size: width of the intermediate attention projection.

    Returns:
        Tensor of shape (n, D), D being the concatenated feature size.
    """
    return word_attention(inputs, att_size)

Define the Hierarchical Attention Network 

In [None]:
# Build the Hierarchical Attention Network graph:
#   word ids -> embeddings -> word-level BiGRU -> word attention   (one vector per sentence)
#            -> sentence-level BiGRU -> sentence attention         (one vector per review)
#            -> dense layer -> 6 independent sigmoid outputs
tf.reset_default_graph()

n_labels = 6
word_feature_dim = 2 * hidden_size           # forward + backward GRU outputs joined
sent_hidden_size = round(hidden_size * 1.5)

sentence_input = tf.placeholder(tf.int32, [None, max_sentences, maxlen])
# Batch size. Defaults to the leading dimension of `sentence_input`, so feeding
# it is optional; feed_dicts that still pass it keep working.
input_len = tf.placeholder_with_default(tf.shape(sentence_input)[0], shape=[])
# Float placeholder: integer 0/1 label arrays are converted when fed.
output_x = tf.placeholder(tf.float32, [None, n_labels])

with tf.variable_scope("word") as scope:
  # ONE embedding table and ONE word encoder for all sentence slots. Building
  # them inside a per-sentence loop created max_sentences independent copies,
  # because tf.Variable ignores scope.reuse_variables().
  embeddings_var = tf.Variable(embedding_matrix.astype('float32'), trainable=True)
  # Fold the sentence axis into the batch axis so every sentence runs through
  # the same encoder in a single pass: (n, S, W) -> (n*S, W)
  words_flat = tf.reshape(sentence_input, [-1, maxlen])
  embeddings = tf.nn.embedding_lookup(embeddings_var, words_flat, partition_strategy='div')
  rnn_outputs, _ = bidirectional_dynamic_rnn(GRUCell(hidden_size, dtype=tf.float32), GRUCell(hidden_size, dtype=tf.float32), inputs=embeddings, dtype=tf.float32)
  weighted_inputs = word_attention(rnn_outputs, attention_size)   # (n*S, 2*hidden_size)

# Unfold the sentence axis again: (n*S, D) -> (n, S, D)
after_word_attention = tf.reshape(weighted_inputs, [input_len, max_sentences, word_feature_dim])
rnn_outputs_sent, _ = bidirectional_dynamic_rnn(GRUCell(sent_hidden_size, dtype=tf.float32), GRUCell(sent_hidden_size, dtype=tf.float32), inputs=after_word_attention, dtype=tf.float32)
weighted_inputs_sent = sentence_attention(rnn_outputs_sent, attention_size)

# The loss op applies the sigmoid itself, so it must receive raw logits.
# Passing already-squashed probabilities applied sigmoid twice.
logits = tf.keras.layers.Dense(units=n_labels)(weighted_inputs_sent)
fc = tf.sigmoid(logits)   # per-label probabilities
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=output_x))
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

# Element-wise accuracy over all (sample, label) pairs at a 0.5 threshold.
pred = tf.round(fc)
n_correct = tf.equal(pred, output_x)
accuracy = tf.reduce_mean(tf.cast(n_correct, tf.float32))


In [None]:
# Train for a few epochs on a capped prefix of the training set, then measure
# accuracy on a capped prefix of the validation set.
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())

  # Training
  epochs = 3
  batch_size = 200
  # Same 20000-sample cap as before, but never past the end of the data.
  train_limit = min(20000, len(x_train))
  for epoch in range(epochs):
    acc_sum = 0
    n_batches = 0
    print("\nEpoch: ",epoch+1)
    # Stride equals the batch size. The old stride of 2000 with 200-row slices
    # silently skipped 90% of the samples.
    for start in range(0, train_limit, batch_size):
      # Slicing past the end just yields a shorter batch, so no try/except is
      # needed. Batch-local names keep the label array `y` intact.
      x_batch = x_train[start:start + batch_size]
      y_batch = y_train[start:start + batch_size]
      _, acc = sess.run([optimizer,accuracy], feed_dict={sentence_input:x_batch, output_x:y_batch, input_len:len(y_batch)})
      acc_sum += acc
      n_batches += 1
      print(acc)
    if n_batches:
      print('==> Train Accuracy: ', acc_sum/n_batches)

  # Validation
  val_batch_size = 1000
  # At most 10 chunks as before, but only chunks that actually contain data:
  # an empty chunk makes the accuracy op return NaN.
  val_limit = min(10 * val_batch_size, len(x_val))
  val_accs=[]
  for start in range(0, val_limit, val_batch_size):
    x_batch = x_val[start:start + val_batch_size]
    y_batch = y_val[start:start + val_batch_size]
    val_accs.append(sess.run(accuracy, feed_dict={sentence_input:x_batch, output_x:y_batch, input_len:len(y_batch)}))


In [None]:
# Average of the per-batch validation accuracies gathered in `val_accs`.
np.asarray(val_accs).mean()

In [None]:
# Smoke test: run every intermediate tensor (and an optimizer step) on the
# first two training samples in a freshly initialised session.
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  x_debug = x_train[0:2]
  y_debug = y_train[0:2]
  for i in range(10):
    # input_len must be fed too: the graph's reshapes depend on it.
    s = sess.run([embeddings, rnn_outputs, weighted_inputs, after_word_attention, rnn_outputs_sent, weighted_inputs_sent, fc, loss, accuracy, optimizer ], feed_dict={sentence_input:x_debug, output_x:y_debug, input_len:len(y_debug)})