<a href="https://colab.research.google.com/github/nafabrar/AES/blob/master/BERT_HAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Notebook for training a BERT-HAN model

This notebook trains a HAN model that takes BERT sentence embeddings as input. The embeddings are generated with bert-as-service (https://github.com/hanxiao/bert-as-service). Run TOEFL_dataParse.py first; this notebook then creates the BERT embeddings from the resulting TOEFL essay data.

In [2]:
# Mount Google Drive inside the Colab VM at /content/gdrive; the next cell
# changes into a project directory below that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Work from the project directory on the mounted drive: the relative data path
# 'data/TOEFL/' used later is resolved from here (presumably the local helper
# modules attention/ordloss/utils/dataUtils_mult live here too - confirm).
%cd /content/gdrive/My Drive/AES/AES

/content/gdrive/My Drive/AES/AES


In [0]:
from __future__ import print_function, division

import os
import os.path
import pandas as pd
from io import StringIO
import io
import unicodedata
import re
import random

import tensorflow as tf
import numpy as np
np.set_printoptions(threshold = 10000)
import collections
import random

from tensorflow.contrib.rnn import LSTMCell as Cell #for GRU: custom implementation with normalization
from tensorflow.python.ops.rnn import dynamic_rnn as rnn
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tensorflow.contrib.rnn import DropoutWrapper

from attention import attention as attention
from ordloss import *
from utils import *
from dataUtils_mult import *


from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from sklearn.metrics import accuracy_score

In [0]:
# --- data layout ---------------------------------------------------------
fpath = 'data/TOEFL/'        # directory that holds train.csv and test.csv
SEQUENCE_LENGTH_D = 25       # sentence rows kept per essay (truncate / zero-pad to this)
b_len = 768                  # width of one sentence-embedding row
train_split = 0.9            # share of train.csv used for training; the rest is validation

# --- batching and regularisation -----------------------------------------
BATCH_SIZE = 50              # essays per batch
KEEP_PROB = 0.7              # keep probability fed to the dropout placeholder while training
DELTA = 0.75                 # weight of the newest batch loss in the running training loss

# --- network sizes -------------------------------------------------------
HIDDEN_SIZE_D = 150          # units per direction of the document-level RNN cell
ATTENTION_SIZE_D = 50        # size argument passed to the attention layer

# Random integer in [1, 10000]; not referenced anywhere else in the visible notebook.
X_R = random.randint(1, 10000)

In [0]:
# Alias: the embedding and split code below refers to SEQUENCE_LEN_D.
SEQUENCE_LEN_D = SEQUENCE_LENGTH_D

# train.csv feeds training + validation; test.csv is the held-out test set.
df_train = pd.read_csv(os.path.join(fpath, 'train.csv'))
df_val = pd.read_csv(os.path.join(fpath, 'test.csv'))

# Essay text ('text1') and label ('label') columns of both frames.
text = df_train['text1']
text_train = df_train['text1']
text_val = df_val['text1']
rank_train = df_train['label']
rank_val = df_val['label']

target_train = np.array(rank_train)
target_val = np.array(rank_val)

onehot_encoder = OneHotEncoder(sparse=False)

# Fit the label -> column mapping on the training labels only.
integer_encoded = target_train.reshape(len(target_train), 1)
y_train = onehot_encoder.fit_transform(integer_encoded)

# Re-use the fitted mapping for the test labels. Re-fitting here (fit_transform)
# would derive the columns from the test labels alone, so a class missing from
# test.csv would change the number/meaning of columns relative to y_train.
integer_encoded_val = target_val.reshape(len(target_val), 1)
y_test = onehot_encoder.transform(integer_encoded_val)


### Get BERT sentence embeddings for each sentence in an essay
## Based on the BERT serving client https://pypi.org/project/bert-serving-client/
## base uncased model
## NOTE(review): this header used to say "Pooling = NONE for token level
## representation", but the code below stores one b_len-wide row per sentence
## and the start command passes no pooling option - confirm that the server
## runs with a sentence-level pooling strategy.
## bert-serving-start -model_dir /temp/uncased_L-12_H-768_A-12/ -num_worker=4 -max_seq_len=40
from bert_serving.client import BertClient
bc = BertClient()
#b_len = 768


def _embed_essays(essays, client, max_sentences, emb_dim):
    """Encode every essay as exactly ``max_sentences`` embedding rows.

    Each essay is split with ``sent_tokenize``; at most ``max_sentences``
    sentences are sent to ``client.encode`` and the essay is then padded with
    all-zero rows of width ``emb_dim`` up to ``max_sentences`` rows.

    Args:
        essays: iterable of essay strings.
        client: object exposing ``encode(list_of_sentences)``.
        max_sentences: number of rows emitted per essay.
        emb_dim: width of one padding row (should match the encoder output).

    Returns:
        ``np.array`` built from the stacked rows of all essays, in input order.
    """
    rows = []
    for essay in essays:
        sentences = sent_tokenize(essay)[:max_sentences]
        # An essay without any sentence becomes pure padding; the client is not
        # called with an empty list (bert-serving-client rejects empty input).
        if sentences:
            rows.extend(client.encode(sentences))
        # pad token maps to 0
        rows.extend([0]*emb_dim for _ in range(max_sentences - len(sentences)))
    return np.array(rows)


X_train = _embed_essays(df_train['text1'], bc, SEQUENCE_LEN_D, b_len)
X_test = _embed_essays(df_val['text1'], bc, SEQUENCE_LEN_D, b_len)


In [0]:
# Split train.csv essays into training and validation parts. X_* holds
# SEQUENCE_LEN_D rows per essay, so its cut point is scaled accordingly.
# Note: X_train / y_train are overwritten, so re-running this cell splits the
# already-reduced training set again.
tr_len = int(train_split*len(y_train))
n_train_rows = tr_len*SEQUENCE_LEN_D

# Take the validation tail first, while X_train / y_train are still complete.
X_val, y_val = X_train[n_train_rows:], y_train[tr_len:]
X_train, y_train = X_train[:n_train_rows], y_train[:tr_len]

In [0]:
# Pad the test set with zero_pad_test (imported helper): embeddings with a
# block size of one batch worth of rows, labels with a block size of one batch.
# Presumably this rounds the test set up to whole batches - confirm in utils.
rows_per_batch = BATCH_SIZE*SEQUENCE_LENGTH_D
X_test = zero_pad_test(X_test, rows_per_batch)
y_test = zero_pad_test(y_test, BATCH_SIZE)

In [0]:
# Start from a fresh default graph (so re-running the model cells does not add
# to a previously built graph), then set the graph-level random seed used by
# the ops created in the following cells.
tf.reset_default_graph()
tf.set_random_seed(111)

In [0]:
# Graph inputs. Creation order is kept as-is.
num_classes = y_train.shape[1]

# Flattened sentence rows of a batch: one b_len-wide row per sentence.
batch_ph = tf.placeholder(dtype=tf.float32, shape=[None, b_len])
# Not fed or consumed anywhere in the visible notebook.
ind_list_ph = tf.placeholder(dtype=tf.int32, shape=[None])
# One-hot labels, one row per essay.
target_ph = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])

# Number of real (non-padding) sentences per essay.
seq_len_ph_d = tf.placeholder(dtype=tf.int32, shape=[None])
# Dropout keep probability (KEEP_PROB while training, 1.0 for evaluation).
keep_prob_ph = tf.placeholder(dtype=tf.float32)
# Fed by the training/test loops but not consumed by the visible graph.
doc_size_ph = tf.placeholder(dtype=tf.int32, shape=[None])


In [0]:
# Regroup the flat sentence rows into (essay, sentence, embedding).
batch = tf.reshape(batch_ph, [BATCH_SIZE, SEQUENCE_LENGTH_D, b_len])

with tf.variable_scope('document'):
    # Settings shared by the forward and backward dropout wrappers.
    dropout_kwargs = dict(
        input_keep_prob=keep_prob_ph,
        output_keep_prob=keep_prob_ph,
        state_keep_prob=keep_prob_ph,
        variational_recurrent=True,
        input_size=batch.get_shape()[-1],
        dtype=tf.float32,
    )

    # Same construction order as before: both cells first, then both wrappers.
    fw_cell_d = Cell(HIDDEN_SIZE_D)
    bw_cell_d = Cell(HIDDEN_SIZE_D)
    fw_cell_d = DropoutWrapper(fw_cell_d, **dropout_kwargs)
    bw_cell_d = DropoutWrapper(bw_cell_d, **dropout_kwargs)

    # Bidirectional RNN over the sentences of each essay; the final states are discarded.
    rnn_outputs_d, _ = bi_rnn(
        fw_cell_d,
        bw_cell_d,
        inputs=batch,
        sequence_length=seq_len_ph_d,
        dtype=tf.float32,
    )
    # Attention layer (imported helper) over the RNN outputs; alphas_d are the
    # weights it returns alongside the pooled output.
    attention_output_d, alphas_d = attention(
        rnn_outputs_d, ATTENTION_SIZE_D, seq_len_ph_d, return_alphas=True
    )

# Dropout on the attention output before the output layer.
drop = tf.nn.dropout(attention_output_d, keep_prob_ph)

In [0]:
ordinal = True
if ordinal:
    feat_dim = drop.get_shape()[1].value
    n_thresholds = num_classes - 1

    # Ordinal regression: one weight vector W shared by every threshold.
    # W_ repeats W as each of its n_thresholds columns; b holds one trainable
    # offset per threshold, initialised to 0, 1, ..., n_thresholds - 1.
    W = tf.Variable(tf.truncated_normal([feat_dim], stddev=0.1))
    W_tiled = tf.reshape(tf.tile(W, [n_thresholds]), [n_thresholds, feat_dim])
    W_ = tf.transpose(W_tiled)
    b = tf.Variable(tf.cast(tf.range(n_thresholds), dtype=tf.float32))
    y_hat_ = tf.nn.xw_plus_b(drop, tf.negative(W_), b)

    # Predicted labels and logits (imported helper), plus the true class index.
    y_preds, logits = preds(y_hat_, BATCH_SIZE)
    y_true = tf.argmax(target_ph, axis=1)

    # Ordinal loss (imported helper) and the score reported during training.
    loss = ordloss_m(y_hat_, target_ph, BATCH_SIZE)
    c = stats.spearmanr
    str_score = "Spearman rank:"

# Adam step on gradients clipped to a global norm of max_gradient_norm.
max_gradient_norm = 5
lr = 1e-4
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
optimizer_ = tf.train.AdamOptimizer(learning_rate=lr)
grads_and_vars = list(zip(clipped_gradients, params))
optimizer = optimizer_.apply_gradients(grads_and_vars)

In [0]:
# Checkpoint saver for the variables defined so far. The only saver.save call
# in the training cell is commented out, so nothing is written to disk as-is.
saver = tf.train.Saver()

In [0]:
# Batch generator over the training split (batch_generator is an imported helper).
# NOTE(review): the main training cell builds this generator again with the
# same arguments, so this cell looks redundant - confirm before removing.
train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D)

In [0]:
# Bookkeeping (both are initialised again at the top of the training cell).
batch_counter, val_counter = 0, []

# Session options: 24 threads for inter- and intra-op parallelism, and GPU
# memory allocated on demand.
config = tf.ConfigProto()
config.inter_op_parallelism_threads = 24
config.intra_op_parallelism_threads = 24
config.gpu_options.allow_growth = True

# Open the session and initialise every variable of the graph.
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

In [0]:
#Main training task


def _doc_lengths(x_rows, doc_len):
    """Return the number of leading non-padding rows of every document.

    ``x_rows`` stacks ``doc_len`` rows per document. A row whose first value
    equals 0 is treated as padding; a document's length is the offset of its
    first such row, or ``doc_len`` when it has none. Exactly one length is
    returned per document, including the case where the last row contains a 0
    somewhere other than its first position.

    Args:
        x_rows: 2-D array-like with ``n_docs * doc_len`` rows.
        doc_len: number of rows that make up one document.

    Returns:
        1-D integer array with ``n_docs`` entries.
    """
    first_vals = np.asarray(x_rows)[:, 0].reshape(-1, doc_len)
    is_pad = first_vals == 0
    return np.where(is_pad.any(axis=1), is_pad.argmax(axis=1), doc_len)


train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D)
val_batch_generator = batch_generator(X_val, y_val, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D)
test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE, seq_len = SEQUENCE_LENGTH_D, shuffle = False)

train_accuracy = []   # per-batch training score
val_accuracy = []     # score of every validation check, across all epochs
val_counter = []      # batch_counter value at each validation check
val_count = 50        # run a validation batch every val_count training batches
NUM_EPOCHS = 50
doc_size_np = np.array([0]*SEQUENCE_LENGTH_D)  # fed to doc_size_ph (unused by the visible graph)
batch_counter = 0

loss_train = 0  # running training loss, weighted towards the newest batch by DELTA

print('Training on TOEFL data')
for epoch in range(NUM_EPOCHS):
    print("epoch: {}\t".format(epoch), end="")

    # Training
    num_batches = X_train.shape[0] // (BATCH_SIZE*SEQUENCE_LENGTH_D)
    true = []     # training labels seen in this epoch
    ypreds = []   # training predictions made in this epoch

    for bx in range(num_batches):
        batch_counter += 1
        x_batch, y_batch = next(train_batch_generator)
        seq_len_d = _doc_lengths(x_batch, SEQUENCE_LENGTH_D)

        y_preds_, loss_tr, _ = sess.run([y_preds, loss, optimizer],
                                        feed_dict={batch_ph: x_batch,
                                                   target_ph: y_batch,
                                                   seq_len_ph_d: seq_len_d,
                                                   doc_size_ph: doc_size_np,
                                                   keep_prob_ph: KEEP_PROB})
        loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
        ypreds.extend(y_preds_)
        t = np.argmax(y_batch, axis = 1)
        true.extend(t)

        sp = c(y_preds_, t)
        if ordinal:
            sp = sp[0]
        train_accuracy.append(sp)

        #testing on the validation set
        if batch_counter % val_count == 0:
            val_counter.append(batch_counter)
            x_val_batch, y_val_batch = next(val_batch_generator)
            val_seq_len_d = _doc_lengths(x_val_batch, SEQUENCE_LENGTH_D)

            # Dropout disabled for evaluation.
            val_preds = sess.run(y_preds,
                                 feed_dict={batch_ph: x_val_batch,
                                            target_ph: y_val_batch,
                                            seq_len_ph_d: val_seq_len_d,
                                            doc_size_ph: doc_size_np,
                                            keep_prob_ph: 1.0})

            # Validation results are scored on their own and deliberately NOT
            # appended to true / ypreds, which feed the training score below.
            val_sp = c(val_preds, np.argmax(y_val_batch, axis = 1))
            if ordinal:
                val_sp = val_sp[0]
            val_accuracy.append(val_sp)
            #saver.save(sess, MODEL_PATH, global_step = batch_counter)

    print('training loss: ' + str(loss_train))
    spr = c(true, ypreds)
    if ordinal:
        spr = spr[0]
    print('Training '+ str_score + str(spr))
    # Mean over every validation check so far; nan until the first check has run.
    print('Val ' + str(np.mean(val_accuracy) if val_accuracy else float('nan')))

In [0]:
#testing on the test set
num_batches = X_test.shape[0] // (BATCH_SIZE*SEQUENCE_LENGTH_D)
true = []     # true class index per test essay (padding essays included)
ypreds = []   # one array of predictions per batch; flattened by the next cell

for bx in range(num_batches):
    x_batch, y_batch = next(test_batch_generator)

    # Real sentence count per essay: a row whose first value is 0 is padding,
    # and the length is the offset of the first such row (or the full length
    # when there is none). Always yields exactly one length per essay.
    first_vals = np.asarray(x_batch)[:, 0].reshape(-1, SEQUENCE_LENGTH_D)
    is_pad = first_vals == 0
    seq_len_d = np.where(is_pad.any(axis=1), is_pad.argmax(axis=1), SEQUENCE_LENGTH_D)

    # Dropout disabled for evaluation. The fetch is a one-element list, so
    # y_preds_ is a list holding the batch's prediction array.
    y_preds_= sess.run([y_preds],
                  feed_dict={batch_ph: x_batch,
                        target_ph: y_batch,
                        seq_len_ph_d: seq_len_d,
                        doc_size_ph: doc_size_np,
                        keep_prob_ph: 1.0})
    ypreds.extend(y_preds_)
    t = np.argmax(y_batch, axis = 1)
    true.extend(t)



In [0]:
# The test loop stores one prediction array per batch; flatten them into a
# single list with one prediction per essay. The ndim check makes the cell
# safe to re-run: once the entries are scalars there is nothing left to flatten.
if len(ypreds) > 0 and np.ndim(ypreds[0]) > 0:
    ypreds = [j for sub in ypreds for j in sub]

In [0]:
from sklearn.metrics import cohen_kappa_score as kappa
from scipy.stats import pearsonr

# Keep only the entries that correspond to real test essays; anything beyond
# len(df_val) comes from the padding added to the test set.
y_test_len = len(df_val)
true, ypreds = true[:y_test_len], ypreds[:y_test_len]

# Headline score, using the same scoring function as during training.
spr = c(true, ypreds)
if ordinal:
    spr = spr[0]
print('Test set '+ str_score + str(spr))

rank = stats.spearmanr

# Remaining metrics, printed as a title line followed by the value. Each
# metric is computed only when its turn comes, after its title is printed.
metric_table = (
    ('sp rho', lambda: rank(true, ypreds)),
    ('qwk', lambda: kappa(true, ypreds, weights="quadratic")),
    ('pearson', lambda: pearsonr(true,ypreds)),
    ('kappa', lambda: kappa(true, ypreds, weights=None)),
)
for title, compute in metric_table:
    print(title)
    print(compute())