## Load Dependencies

In [1]:
import torch

In [2]:
# Sanity check that PyTorch can see a CUDA device: the trainer further down in
# this notebook is constructed with device=torch.device('cuda').
torch.cuda.is_available()

True

In [3]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn import LSTM, GRU
from torch.nn import Parameter
from torch.nn import MSELoss, L1Loss, SmoothL1Loss, CrossEntropyLoss
from torch.nn.utils.rnn import pad_packed_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from scipy.special import huber
from scipy.stats import spearmanr
#from torch.autograd import Variable

import pandas as pd
import numpy as np
import math
import json
import re
from nltk.tokenize import word_tokenize

# The abstract base classes live in collections.abc; the aliases that used to be
# re-exported from `collections` were removed in Python 3.10.
from collections import defaultdict
from collections.abc import Iterable
from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm_n

import matplotlib.pyplot as plt

# NOTE(review): absolute, user-specific path. The two project imports below only
# resolve on a machine where the factslab checkout lives at this exact location.
sys.path.insert(0, '/home/svashis3/factslab-python/factslab/')
from utility import load_glove_embedding
from datastructures import ConstituencyTree, DependencyTree

# Seed PyTorch's global random number generator (the call returns the generator,
# which is what the cell echoes as its output).
torch.manual_seed(1)

<torch._C.Generator at 0x2b666d9ce510>

In [4]:
# Tab-separated file read below as (sentence id, dependency tree string, sentence) rows.
ud_trees = '../../UD_data_trees/structures.tsv'
# Pickled DataFrame with the annotated examples (loaded with pd.read_pickle).
data_path = 'data_for_modelling_p1.pkl'
# NOTE(review): not referenced by any visible cell -- presumably a leftover
# directory for pre-trained embeddings; confirm before removing.
embed_path = '../../'
# Checkpoint file that EventTypeTrainer.fit overwrites whenever the dev-set
# Spearman correlation improves.
best_model_file = "best_model_elmo_lstm_2.pth"

## Instantiate the ELMo module

In [5]:
from allennlp.modules.elmo import Elmo, batch_to_ids

# Remote option/weight files for the pre-trained ELMo model. Constructing Elmo
# from URLs needs network access (presumably cached locally by allennlp after
# the first download -- TODO confirm).
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Frozen (requires_grad=False), dropout-free ELMo. This module-level object is
# used directly by EventTypeRNN.forward; it is never moved to another device in
# this notebook, and forward() moves the embeddings it produces to the model's
# device instead.
elmo = Elmo(options_file, weight_file, 1, dropout=0, requires_grad=False)  #using 1 layer of representation

## Load sentence mappings

In [6]:
# Map each sentence id to its parsed DependencyTree, with the sentence's
# space-separated tokens attached to the tree as `.sentence`.
struct_dict = {}

with open(ud_trees, 'r') as f:
    structs_sents = [line.strip().split('\t') for line in f]

for sent_id, tree_list, sent in structs_sents:
    parsed_tree = DependencyTree.fromstring(tree_list)
    parsed_tree.sentence = sent.split(" ")
    struct_dict[sent_id] = parsed_tree

# Spot-check one entry.
print(struct_dict['en-ud-dev.conllu sent_1'].sentence)

['From', 'the', 'AP', 'comes', 'this', 'story', ':']


## Load the data

In [7]:
def extract_sentence(sentence_id):
    '''
    Return the token list stored in ``struct_dict`` for a sentence identifier.

    ``sentence_id`` is split on single spaces; its first two pieces are
    rebuilt into the ``"<first> sent_<second>"`` key used by ``struct_dict``.
    '''
    parts = sentence_id.split(" ")
    lookup_key = "{} sent_{}".format(parts[0], parts[1])
    return struct_dict[lookup_key].sentence

# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only
# load files from a trusted source.
data_all = pd.read_pickle(data_path)
# Attach the tokenised sentence that each row's sentence_id refers to.
data_all["sentence"] = data_all["sentence_id"].map(extract_sentence)

### View X and Y

In [8]:
# Model inputs: the token list and the position of the predicate root.
feature_cols = ['sentence', 'pred_root_pos']
X_train = data_all.loc[data_all.split == "train", feature_cols]
X_dev = data_all.loc[data_all.split == "dev", feature_cols]

print("Train shape: {}".format(X_train.shape))
print("Dev shape: {}".format(X_dev.shape))

X_train.head()

Train shape: (23538, 2)
Dev shape: (2884, 2)


Unnamed: 0,sentence,pred_root_pos
0,"[Maybe, because, they, hint, at, a, larger, co...",3
1,"[I, read, of, a, case, not, long, ago, when, s...",1
2,"[I, read, of, a, case, not, long, ago, when, s...",12
3,"[I, read, of, a, case, not, long, ago, when, s...",24
4,"[Of, course, law, enforcement, has, dragged, i...",5


In [9]:
# Targets: the 'duration' column for the same train/dev rows.
Y_train = data_all.loc[data_all.split == "train", 'duration']
Y_dev = data_all.loc[data_all.split == "dev", 'duration']

print(Y_train.shape)
print(Y_dev.shape)

Y_train.head()

(23538,)
(2884,)


0    8
1    8
2    6
3    0
4    6
Name: duration, dtype: int64

## Model Class

In [10]:
class EventTypeRNN(torch.nn.Module):
    """RNN over ELMo embeddings followed by a small feed-forward head.

    ``forward`` embeds a batch of tokenised sentences with the module-level
    ``elmo`` object, runs the (optionally bidirectional, multi-layer) RNN over
    them, picks one time step per sentence and maps it through the linear
    layers in ``self.linear_maps``.

    Parameters
    ----------
    embeddings, vocab
        Accepted for interface compatibility; not used by this class.
    embedding_size : int
        Size of the input embeddings fed to the RNN.
    rnn_class : type
        RNN module class to instantiate (e.g. ``LSTM`` or ``GRU``).
    rnn_hidden_size, rnn_dropout, num_rnn_layers, bidirectional
        Passed through to ``rnn_class``.
    attention : bool
        When true, ``forward`` calls ``_run_attention``, which is not
        implemented in this class and raises ``NotImplementedError``.
    regression_hidden_sizes : list of int
        Sizes of the hidden linear layers placed before the output layer.
        The default list is only iterated, never mutated.
    output_size : int
        Size of the final linear layer.
    device : torch.device
        Device the embeddings and linear layers are moved to.
    batch_size : int
        Stored on the instance; a value <= 1 disables time-step selection
        in ``forward`` (``has_batch_dim`` is False).
    """

    def __init__(self, embeddings=None, embedding_size=1024, vocab=None,
                 rnn_class=LSTM, rnn_hidden_size=300, rnn_dropout=0,
                 num_rnn_layers=1, bidirectional=False, attention=False,
                 regression_hidden_sizes=[], output_size=1,
                 device=torch.device(type="cpu"), batch_size=16):
        super().__init__()

        self.device = device
        self.batch_size = batch_size
        self.bidirectional = bidirectional
        self.num_rnn_layers = num_rnn_layers
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn_dropout = rnn_dropout
        self.attention = attention
        self.embedding_size = embedding_size

        # initialize model
        self._initialize_rnn(rnn_class, rnn_hidden_size,
                             num_rnn_layers, bidirectional)
        self._initialize_regression(attention,
                                    regression_hidden_sizes,
                                    output_size)

    def _initialize_rnn(self, rnn_class, rnn_hidden_size,
                        num_rnn_layers, bidirectional):
        """Build the RNN and record the size of its per-step output."""
        self.rnn = rnn_class(input_size=self.embedding_size,
                             hidden_size=self.rnn_hidden_size,
                             num_layers=self.num_rnn_layers,
                             bidirectional=self.bidirectional,
                             batch_first=True,
                             dropout=self.rnn_dropout)

        # A bidirectional RNN concatenates both directions at every step.
        directions = 2 if self.bidirectional else 1
        self.rnn_output_size = self.rnn_hidden_size * directions

        self.has_batch_dim = self.batch_size > 1

    def _initialize_regression(self, attention, hidden_sizes, output_size):
        """Build the stack of linear layers applied to the chosen RNN state."""
        self.linear_maps = nn.ModuleList()

        last_size = self.rnn_output_size

        for h in hidden_sizes:
            linmap = torch.nn.Linear(last_size, h)
            linmap = linmap.to(self.device)
            self.linear_maps.append(linmap)
            last_size = h

        linmap = torch.nn.Linear(last_size, output_size)
        linmap = linmap.to(self.device)
        self.linear_maps.append(linmap)

    @staticmethod
    def _has_idxs(idxs):
        """True when explicit time-step indices were supplied.

        ``None`` and empty sequences both count as "not supplied". Using
        ``len`` instead of truthiness keeps this working for numpy arrays,
        whose truth value is ambiguous.
        """
        return idxs is not None and len(idxs) > 0

    def forward(self, structures, idxs=None):
        """Score a batch of tokenised sentences.

        Parameters
        ----------
        structures : list of list of str
            The sentences, each a list of tokens (passed to ``batch_to_ids``).
        idxs : sequence of int, optional
            For each sentence, the time step whose RNN output is used.
            When omitted, the last non-padded step of each sentence is used.

        Returns
        -------
        torch.Tensor
            Output of the final linear layer with a trailing singleton
            dimension removed.
        """
        # Create elmo dict (uses the module-level `elmo` object).
        character_ids = batch_to_ids(structures)
        embeddings_dict = elmo(character_ids)

        # ELMo is frozen, so the embeddings are detached from the graph.
        inputs = embeddings_dict['elmo_representations'][0].to(self.device)
        inputs = inputs.detach()

        if self._has_idxs(idxs):
            lengths = None
        else:
            # Sentence lengths = number of unmasked positions per row.
            lengths = torch.sum(embeddings_dict['mask'], dim=1).cpu().numpy()

        inputs = self._preprocess_inputs(inputs)

        output, hidden = self._run_rnn(inputs)

        if self.attention:
            output = self._run_attention(output)
        else:
            if self.has_batch_dim:
                output = self._choose_timestamp(output, idxs=idxs, lengths=lengths)

        output = self._run_regression(output)

        y_hat = self._postprocess_outputs(output)

        return y_hat

    def _preprocess_inputs(self, inputs):
        """Apply some function(s) to the input embeddings
        This is included to allow for an easy preprocessing hook for
        RNNRegression subclasses. For instance, we might want to
        apply a tanh to the inputs to make them look more like features
        """
        return inputs

    def _run_rnn(self, inputs):
        '''
        Run desired RNN
        '''
        output, hidden = self.rnn(inputs)
        output = output.to(self.device)

        return output, hidden

    def _run_attention(self, output):
        """Attention hook; not implemented in this class.

        Raises
        ------
        NotImplementedError
            Always. Override in a subclass or construct with attention=False.
        """
        raise NotImplementedError(
            "attention=True was requested but EventTypeRNN does not "
            "implement _run_attention")

    def _choose_timestamp(self, output, idxs=None, lengths=None):
        """Select one time step per sequence from ``output``.

        ``output`` is indexed as (batch, time, features). With ``idxs`` the
        given step is taken for each row; otherwise step ``lengths - 1``
        (the last real token) is taken. The result always keeps the batch
        dimension, i.e. it has shape (batch, features).
        """
        if self._has_idxs(idxs):
            positions = np.asarray(idxs)
        else:
            # Choose last time stamp
            positions = np.asarray(lengths) - 1

        # gather() requires int64 indices on the same device as `output`.
        positions = torch.from_numpy(positions).long().to(self.device)
        gather_idx = positions.view(-1, 1).expand(output.size(0), output.size(2)).unsqueeze(1)

        # Only drop the time axis; a bare squeeze() would also drop a batch
        # axis of size one.
        return output.gather(1, gather_idx).squeeze(1)

    def _run_regression(self, h_last):
        """Apply the linear layers, with a nonlinearity between consecutive layers."""
        for i, linear_map in enumerate(self.linear_maps):
            if i:
                h_last = self._regression_nonlinearity(h_last)
            h_last = linear_map(h_last)
        return h_last

    def _postprocess_outputs(self, outputs):
        """Apply some function(s) to the output value(s)

        Removes only a trailing singleton dimension (output_size == 1) so
        that a batch containing a single example keeps its batch axis.
        """
        return outputs.squeeze(-1)

    def _regression_nonlinearity(self, x):
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        return torch.tanh(x)

In [11]:
class EventTypeTrainer(object):
    """Train and evaluate an ``EventTypeRNN``.

    Parameters
    ----------
    regression_type : str
        Key into ``loss_function_map``; ``"multinomial"`` selects a
        class-weighted cross-entropy loss, every other key a regression loss.
    optimizer_class : type
        Optimizer class instantiated in ``fit``.
    device : torch.device
        Device for the model, the loss and the mini-batch targets.
    epochs : int
        Number of passes over the training data.
    rnn_class : type
        RNN module class forwarded to ``EventTypeRNN``.
    **kwargs
        Forwarded unchanged to the ``EventTypeRNN`` constructor.
    """

    loss_function_map = {"linear": MSELoss,
                         "robust": L1Loss,
                         "robust_smooth": SmoothL1Loss,
                         "multinomial": CrossEntropyLoss}

    def __init__(self, regression_type="linear",
                 optimizer_class=torch.optim.Adam,
                 device=torch.device(type="cpu"),
                 epochs=5,
                 rnn_class=LSTM, **kwargs):

        self.epochs = epochs
        self.rnn_class = rnn_class
        self.device = device

        self._regression_type = regression_type
        self._optimizer_class = optimizer_class
        self._init_kwargs = kwargs
        self._continuous = regression_type != "multinomial"

    def _initialize_trainer_regression(self):
        """Create a fresh model and the (unweighted) loss function."""
        self._regression = EventTypeRNN(device=self.device,
                                        rnn_class=self.rnn_class,
                                        **self._init_kwargs)

        self._regression = self._regression.to(self.device)
        self.batch_size = self._regression.batch_size

        self.lf_class = self.__class__.loss_function_map[self._regression_type]

        self._loss_function = self.lf_class()
        self._loss_function = self._loss_function.to(self.device)

    @staticmethod
    def _batch_slice(seq, start, stop):
        """Slice ``seq`` for one mini-batch, passing ``None`` through."""
        return None if seq is None else seq[start:stop]

    def _class_weights(self, labels):
        """Normalised inverse-frequency weight for every class index.

        The vector has one entry per output unit (``output_size``) so it
        stays aligned with the logits even when a class never occurs in
        ``labels``; such classes get weight 0.
        """
        labels = np.asarray(labels).astype(np.int64).ravel()
        num_classes = self._init_kwargs.get('output_size', 1)
        counts = np.bincount(labels, minlength=num_classes).astype(np.float64)

        inverse = np.zeros_like(counts)
        seen = counts > 0
        inverse[seen] = 1.0 / counts[seen]

        total = inverse.sum()
        if total > 0:
            inverse = inverse / total
        return torch.from_numpy(inverse).float().to(self.device)

    def _train_step(self, optimizer, words, indexes, targets):
        """Run one optimisation step on a mini-batch and return its loss."""
        optimizer.zero_grad()

        predicts = self._regression(words, idxs=indexes)
        actuals = torch.from_numpy(np.array(targets)).to(self.device)

        if self._continuous:
            # Regression losses need floating-point targets.
            actuals = actuals.float()
            if predicts.numel() == actuals.numel():
                predicts = predicts.reshape(actuals.shape)
        else:
            actuals = actuals.long()
            # Keep logits 2-D even when the batch holds a single example.
            predicts = predicts.reshape(actuals.size(0), -1)

        loss = self._loss_function(predicts, actuals)
        loss.backward()
        optimizer.step()
        return loss

    def fit(self, X, Y, dev, idxs=None, verbosity=1, **kwargs):
        """Fit the LSTM regression
        Parameters
        ----------
        X : iterable(iterable(object))
            a matrix of structures (independent variables) with rows
            corresponding to a particular kind of RNN
        Y : numpy.array(Number)
            a matrix of dependent variables
        dev : tuple
            ``(dev_X, dev_idxs, dev_Y)`` used for evaluation after each epoch
        idxs : sequence of int, optional
            per-example time step passed to the model; when omitted the
            model falls back to the last token of each sentence
        verbosity : int (default: 1)
            accepted for interface compatibility; currently unused
        **kwargs
            forwarded to the optimizer constructor

        Returns
        -------
        list of float
            Spearman correlation on the dev set after each epoch.
        """

        self._X, self._idxs, self._Y = X, idxs, Y
        dev_x, dev_idxs, dev_y = dev

        self._initialize_trainer_regression()

        if not self._continuous:
            class_weights = self._class_weights(self._Y)
            self._loss_function = self.lf_class(weight=class_weights)
            self._loss_function = self._loss_function.to(self.device)

        print("########## .   Model Parameters   ##############\n")
        for name, param in self._regression.named_parameters():
            if param.requires_grad:
                print(name, param.shape)
        print("\n")
        print("##############################################\n")
        parameters = [p for p in self._regression.parameters() if p.requires_grad]
        optimizer = self._optimizer_class(parameters, **kwargs)

        total_obs = len(self._X)

        dev_accs = []
        best_val_acc = -float('inf')
        curr_loss = None

        for epoch in range(self.epochs):
            # Turn on training mode which enables dropout.
            self._regression.train()

            tqdm.write("Running Epoch: {}".format(epoch+1))

            # One step per mini-batch, including a shorter final batch and
            # the case where the whole data set fits in a single batch.
            batch_starts = range(0, total_obs, self.batch_size)
            pbar = tqdm_n(total=len(batch_starts))

            for start in batch_starts:
                stop = start + self.batch_size
                curr_loss = self._train_step(
                    optimizer,
                    self._X[start:stop],
                    self._batch_slice(self._idxs, start, stop),
                    self._Y[start:stop])
                pbar.update(1)

            pbar.close()

            dev_predicts = self.predict(dev_x, idxs=dev_idxs)

            # spearmanr works on host arrays, so copy predictions off the device.
            dev_acc = spearmanr(dev_predicts.detach().cpu().numpy(), dev_y)

            # Save the model if the dev Spearman correlation is the best seen
            # so far. A NaN correlation (e.g. constant predictions) never
            # compares greater, so it does not trigger a save.
            if dev_acc[0] > best_val_acc:
                with open(best_model_file, 'wb') as f:
                    torch.save(self._regression.state_dict(), f)
                best_val_acc = dev_acc[0]

            # Loss of the last mini-batch of the epoch, as a plain float.
            loss_value = curr_loss.item() if curr_loss is not None else float('nan')
            tqdm.write("Epoch: {} Loss: {}".format(epoch+1, loss_value))
            tqdm.write("Dev spearman correlation: {0:.5f} P-value: {1:.5f}".format(dev_acc[0], dev_acc[1]))
            tqdm.write("\n")
            dev_accs.append(dev_acc[0])

        return dev_accs

    def predict(self, X, idxs=None, batch=1024):
        """Predict using the LSTM regression
        Parameters
        ----------
        X : iterable(iterable(object))
            a matrix of structures (independent variables) with rows
            corresponding to a particular kind of RNN
        idxs : sequence of int, optional
            per-example time step passed to the model
        batch : int (default: 1024)
            number of examples scored per forward pass

        Returns
        -------
        torch.Tensor
            For ``"multinomial"``: the arg-max class index per example.
            For the regression losses: the raw model outputs, with a
            trailing singleton dimension removed.
        """
        # Turn on evaluation mode which disables dropout.
        self._regression.eval()

        total_obs = len(X)
        output_size = self._init_kwargs.get('output_size', 1)

        with torch.no_grad():
            predictions = torch.zeros(total_obs, output_size).to(self.device)

            for start in range(0, total_obs, batch):
                stop = min(start + batch, total_obs)
                predicts = self._regression(
                    X[start:stop],
                    idxs=self._batch_slice(idxs, start, stop))
                # Normalise to (rows, output_size) whatever the model squeezed.
                predictions[start:stop] = predicts.reshape(stop - start, -1)

            if self._continuous:
                return predictions.squeeze(-1)

            # Softmax is monotonic, so the arg-max of the logits is the
            # predicted class.
            _, predictions = predictions.max(1)

        return predictions

## Arrange the data in the format the model expects

In [12]:
# Number of training rows to use (set lower, e.g. 600, for a quick run).
t_obs = len(X_train)
train_rows = X_train[:t_obs].values
X = [sentence for sentence, _ in train_rows]
indexes = [root_pos for _, root_pos in train_rows]
targets = Y_train[:t_obs].values

# Number of dev rows to use (set lower, e.g. 1080, for a quick run).
d_obs = len(X_dev)
dev_rows = X_dev[:d_obs].values
dev_X = [sentence for sentence, _ in dev_rows]
dev_indexes = [root_pos for _, root_pos in dev_rows]
dev_targets = Y_dev[:d_obs].values

In [13]:
# Bundle the dev split in the (inputs, indices, targets) order that
# EventTypeTrainer.fit unpacks its `dev` argument in.
dev = (dev_X, dev_indexes, dev_targets)

## Initialize the model and Fit

In [14]:
# Two-layer bidirectional LSTM over 1024-d ELMo embeddings with one hidden
# layer of 24 units and 11 output classes, trained with cross-entropy.
model = EventTypeTrainer(
    regression_type="multinomial",
    device=torch.device('cuda'),
    epochs=5,
    rnn_class=LSTM,
    embedding_size=1024,
    rnn_hidden_size=300,
    num_rnn_layers=2,
    bidirectional=True,
    rnn_dropout=0.5,
    attention=False,
    regression_hidden_sizes=[24],
    output_size=11,
    batch_size=128,
)

In [15]:
# Train the model; fit() returns the dev-set Spearman correlation after each epoch.
dev_accs = model.fit(X, targets, dev, idxs = indexes)

########## .   Model Parameters   ##############

rnn.weight_ih_l0 torch.Size([1200, 1024])
rnn.weight_hh_l0 torch.Size([1200, 300])
rnn.bias_ih_l0 torch.Size([1200])
rnn.bias_hh_l0 torch.Size([1200])
rnn.weight_ih_l0_reverse torch.Size([1200, 1024])
rnn.weight_hh_l0_reverse torch.Size([1200, 300])
rnn.bias_ih_l0_reverse torch.Size([1200])
rnn.bias_hh_l0_reverse torch.Size([1200])
rnn.weight_ih_l1 torch.Size([1200, 600])
rnn.weight_hh_l1 torch.Size([1200, 300])
rnn.bias_ih_l1 torch.Size([1200])
rnn.bias_hh_l1 torch.Size([1200])
rnn.weight_ih_l1_reverse torch.Size([1200, 600])
rnn.weight_hh_l1_reverse torch.Size([1200, 300])
rnn.bias_ih_l1_reverse torch.Size([1200])
rnn.bias_hh_l1_reverse torch.Size([1200])
linear_maps.0.weight torch.Size([24, 600])
linear_maps.0.bias torch.Size([24])
linear_maps.1.weight torch.Size([11, 24])
linear_maps.1.bias torch.Size([11])


##############################################

Running Epoch: 1


HBox(children=(IntProgress(value=0, max=183), HTML(value='')))


Epoch: 1 Loss: 1.8767635822296143
Dev spearman correlation: 0.08799 P-value: 0.00000


Running Epoch: 2


HBox(children=(IntProgress(value=0, max=183), HTML(value='')))


Epoch: 2 Loss: 1.7728654146194458
Dev spearman correlation: 0.08666 P-value: 0.00000


Running Epoch: 3


HBox(children=(IntProgress(value=0, max=183), HTML(value='')))


Epoch: 3 Loss: 1.655367374420166
Dev spearman correlation: 0.08562 P-value: 0.00000


Running Epoch: 4


HBox(children=(IntProgress(value=0, max=183), HTML(value='')))


Epoch: 4 Loss: 1.565590262413025
Dev spearman correlation: 0.06606 P-value: 0.00039


Running Epoch: 5


HBox(children=(IntProgress(value=0, max=183), HTML(value='')))


Epoch: 5 Loss: 1.454863429069519
Dev spearman correlation: 0.07820 P-value: 0.00003




## Plot Performance

In [16]:
# plt.plot(train_accs, 'b-', label="train")
# plt.plot(dev_accs, 'r-', label="dev")
# plt.xlabel("Epochs")
# plt.ylabel("Spearman Correlation")
# plt.title("Performance on Event Durations")
# plt.legend()