In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): later cells use relative paths such as "drive/My Drive/...",
# which presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gc
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.layers import GRUCell
from tensorflow.keras.layers import TimeDistributed as TD
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn
# Download nltk's 'punkt' resource (presumably for the sent_tokenize/word_tokenize
# helpers imported above; neither is called in the cells visible here).
nltk.download('punkt')
# The rest of the notebook is written against the TF1 graph API
# (tf.placeholder, tf.Session, tf.train.AdamOptimizer), so turn off TF2 behaviour.
tf.disable_v2_behavior()

In [None]:
# Hyper-parameters shared by the preprocessing and model cells below.
maxlen = 80  # tokens per review after pad_sequences; also the width of the input placeholder
max_sentences = 15 # max sentences per review (not referenced by any cell visible in this notebook)
max_words = 20000 # most common words (vocabulary) kept by the Tokenizer
embedding_dim = 100  # width of the embedding matrix; has to match the GloVe file that is loaded (glove.6B.100d)
validation_split = 0.2  # fraction of the examples held out as the validation set
hidden_size=150  # units of each GRU cell (one forward, one backward)
attention_size = 50  # passed as att_size to attention()

In [None]:
def clean_text(text):
  '''
  Normalise a raw review string.

  Applies three substitutions in order: drop every non-ASCII character,
  turn each newline into a space, then squeeze runs of spaces down to a
  single space. Returns the resulting string.
  '''
  # (pattern, replacement) pairs; order matters because the newline
  # replacement can create the space runs that the last rule collapses.
  substitutions = (
      (r'[^\x00-\x7f]', r''),
      (r'\n', ' '),
      (r" +", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return text

In [None]:
# Reload the feature/label arrays cached on Drive by the np.save cell further down.
# NOTE(review): in a top-to-bottom run the next cell rebinds X and y from train.csv,
# so these values are discarded; on a first run the .npy files do not exist yet.
# Using this cell *instead of* the CSV/tokenizer cells leaves `data` and `tokenizer`
# undefined, and later cells read both.
X = np.load("drive/My Drive/TDL/Data/X.npy")
y = np.load("drive/My Drive/TDL/Data/y.npy")

In [None]:
# Read the training CSV and split it into cleaned review texts (X) and labels (y).
data = pd.read_csv('drive/My Drive/TDL/Data/train.csv')
reviews = []  # kept from the original cell; nothing visible in this notebook reads it

# Columns are selected by position with .iloc: integer keys on a label-indexed
# Series (the old `row[1]` / `row[2::]`) rely on a positional fallback that
# pandas has deprecated. Selecting whole columns also avoids the per-row
# overhead of iterrows().
# Column 1 is the review text; every column from 2 onwards is treated as a label
# (presumably 6 of them, matching the [None, 6] placeholder - confirm against the CSV).
X = np.array([clean_text(text) for text in data.iloc[:, 1]])
y = data.iloc[:, 2:].to_numpy()

In [None]:
# Fit the vocabulary on the cleaned review texts; num_words limits the encoding
# step to the max_words most frequent tokens.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
# Encode each review as a list of word indices, then pad/truncate every list to
# maxlen so X becomes a 2-D integer array. X is rebound from text to integers
# here, so this cell cannot be re-run without first re-running the loading cell.
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)

In [None]:
# Cache the padded index sequences and the label matrix on Drive so they can be
# reloaded with np.load instead of being recomputed.
np.save("drive/My Drive/TDL/Data/X.npy", X)
np.save("drive/My Drive/TDL/Data/y.npy", y)

In [None]:
# Shuffle the examples and hold out the last `validation_split` fraction for validation.
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
# Apply the SAME permutation to features and labels so rows stay paired.
X = X[indices]
labels = y[indices]

# Size the split from X itself (not the CSV frame) so the cell also works when
# X and y were restored from the cached .npy files and `data` was never loaded.
nb_validation_samples = int(validation_split * X.shape[0])
# Slice with an explicit boundary: `X[:-0]` would be empty if the validation
# count ever rounded down to zero.
split_at = X.shape[0] - nb_validation_samples

x_train = X[:split_at]
y_train = labels[:split_at]  # the shuffled labels, not the original unshuffled `y`
x_val = X[split_at:]
y_val = labels[split_at:]

# Drop the full-size references once the splits exist, then ask the collector to run.
del X
del labels
gc.collect()

In [None]:
# Parse the GloVe text file into {word: vector}.
embeddings_index = {}
# `with` closes the file even if parsing fails; the encoding is set explicitly so
# the result does not depend on the platform default (the GloVe 6B files are
# distributed as UTF-8 text - confirm if a different embedding file is swapped in).
with open('drive/My Drive/TDL/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# One row per tokenizer index (index 0 is never assigned to a word, hence the +1).
word_index = tokenizer.word_index
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words that have no GloVe vector keep their random initial row
        # (they are NOT zeroed, because the matrix starts out random).
        embedding_matrix[i] = embedding_vector

# The lookup dict is large and no longer needed once the matrix is built.
del embeddings_index
gc.collect()

In [None]:
def attention(inputs, att_size):
    """
    Reduce a sequence of RNN/Bi-RNN outputs to one vector per example with a
    learned attention weighting.

    Args:
        inputs: tensors that are concatenated along axis 2. The caller in this
            notebook passes the (forward, backward) output pair returned by
            ``bidirectional_dynamic_rnn``, giving (n, 80, 300) after the concat.
        att_size: width of the intermediate attention projection.

    Returns:
        A pair ``(output, W)``: the attention-weighted sum of the concatenated
        inputs over axis 1, and the projection weight variable.
    """
    def _normal_variable(shape):
        # Every attention parameter uses the same normal init with stddev 0.1.
        return tf.Variable(tf.random_normal(shape, stddev=0.1), trainable=True)

    states = tf.concat(inputs, 2)
    feature_dim = states.shape[2].value  # size of the concatenated feature axis

    # Trainable parameters, created in the order: weights, bias, context vector.
    W = _normal_variable([feature_dim, att_size])
    bias = _normal_variable([att_size])
    context_vector = _normal_variable([att_size])

    # tanh(states . W + bias), then a dot product with the context vector gives
    # one unnormalised score per time step.
    projected = tf.tanh(tf.tensordot(states, W, axes=1) + bias)
    raw_scores = tf.tensordot(projected, context_vector, axes=1, name='vu')
    alphas = tf.nn.softmax(raw_scores, name='alphas')

    # Weight each time step by its attention score and sum over the time axis.
    weighted_states = states * tf.expand_dims(alphas, -1)
    return tf.reduce_sum(weighted_states, 1), W

In [None]:
# Build the TF1 graph: embedding lookup -> bidirectional GRU -> attention -> 6 sigmoid outputs.
tf.reset_default_graph()

num_labels = 6  # width of the label placeholder and of the output layer

input_x = tf.placeholder(tf.int32, [None, maxlen])
output_x = tf.placeholder(tf.float32, [None, num_labels])
# Embedding table initialised from embedding_matrix and fine-tuned during training.
embeddings_var = tf.Variable(embedding_matrix, trainable=True)
embeddings = tf.nn.embedding_lookup(embeddings_var, input_x, partition_strategy='div')
# embedding_matrix is a float64 numpy array; the RNN below is declared float32.
embed = tf.cast(embeddings,tf.float32)  # n*80*100

rnn_outputs, _ = bidirectional_dynamic_rnn(GRUCell(hidden_size, dtype=tf.float32), GRUCell(hidden_size, dtype=tf.float32), inputs=embed, dtype=tf.float32)
weighted_inputs, weights = attention(rnn_outputs,attention_size)

# Keep the raw (pre-sigmoid) scores separate from the probabilities:
# sigmoid_cross_entropy_with_logits applies the sigmoid itself, so feeding it
# already-squashed values (as the previous version did) applies it twice.
logits = tf.keras.layers.Dense(units=num_labels)(weighted_inputs)
fc = tf.sigmoid(logits)  # per-label probabilities; same meaning `fc` had before

# output_x is already a float32 placeholder, so no cast is needed before the loss.
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=output_x)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

# Accuracy is the fraction of individual label decisions (threshold 0.5) that match.
pred = tf.round(fc)
n_correct = tf.equal(pred, output_x)
accuracy = tf.reduce_mean(tf.cast(n_correct, tf.float32))


In [None]:
# Train the network for a fixed number of epochs and report mean per-batch accuracy.
# The session is closed when the `with` block exits, so the trained variable
# values are not available to later cells.
batch_size = 256
with tf.Session() as sess:
  # global_variables_initializer replaces the deprecated initialize_all_variables.
  sess.run(tf.global_variables_initializer())
  epochs = 10
  for epoch in range(epochs):
    acc_sum = 0
    n_batches = 0

    print("epoch: ",epoch+1)
    for i in range(0, x_train.shape[0], batch_size):
      # Slices past the end of the array are clipped, so the final short batch
      # needs no special handling. Batch-specific names are used so the global
      # label array `y` is not overwritten by the loop.
      x_batch = x_train[i:i+batch_size]
      y_batch = y_train[i:i+batch_size]
      _, acc = sess.run([optimizer,accuracy], feed_dict={input_x:x_batch, output_x:y_batch})
      acc_sum += acc
      n_batches += 1
    print(acc_sum, n_batches)
    if n_batches:
      # Unweighted mean over batches (a shorter final batch counts the same as a full one).
      print('Accuracy: ', acc_sum/n_batches)

In [None]:
# Look up the embedding vectors for the first two training reviews.
# This is a NEW session with freshly initialised variables, so the values come
# from the initial embedding_matrix, not from the training run in the previous cell.
with tf.Session() as sess:
  # global_variables_initializer replaces the deprecated initialize_all_variables.
  sess.run(tf.global_variables_initializer())
  # `embeddings` depends only on input_x, so the label placeholder is not fed.
  emb = sess.run(embeddings, feed_dict={input_x:x_train[0:2]})