In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from re import sub

import torch
import csv
import itertools
import random
from random import shuffle

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split as split_data

In [3]:
class Data(object):
    """Read a tab-separated sentence-pair dataset and turn it into tensors.

    Constructing an instance runs the whole pipeline (see ``run``): the file is
    read, every sentence is tokenized and mapped to vocabulary indices, the rows
    are split into training/validation sets and converted to torch tensors.

    Attributes populated by the pipeline:
        x_train, x_val: lists of ``[LongTensor, LongTensor]`` index sequences.
        y_train, y_val: ``FloatTensor`` of scores.
        word2index, index2word, word2count, vocab, vocab_size: vocabulary state;
            index 0 is reserved for the ``'PAD'`` token.
    """

    # Ordered (pattern, replacement) rules applied by ``text_to_word_list``.
    # Order matters: e.g. "what's" must be expanded before the generic "'s" rule.
    _CLEANUP_RULES = (
        # NOTE(review): inside this character class ``+-=`` is a range, not three
        # literal characters — left untouched to keep tokenization stable.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): in Python's ``re`` syntax ``\0`` is the NUL character, so
        # this rule looks like a no-op (possibly meant "0s") — confirm intent.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )

    def __init__(self, data_name, data_file, train_ratio=0.8, max_len=None,
                 vocab_limit=None, sentence_cols=None, score_col=None):
        """Configure the dataset and immediately load/split/tensorize it.

        Args:
            data_name: ``'sick'`` or ``'quora'`` (case-insensitive) select built-in
                column names; any other value requires ``sentence_cols`` and
                ``score_col``.
            data_file: path of the tab-separated file to read.
            train_ratio: fraction of rows used for training.
            max_len: stored on the instance; not used by this class.
            vocab_limit: stored on the instance; not used by this class.
            sentence_cols: the two text column names for a custom dataset.
            score_col: the label column name for a custom dataset.

        Raises:
            ValueError: for a custom dataset without ``sentence_cols``/``score_col``.
        """
        self.data_file = data_file
        self.train_ratio = train_ratio
        self.max_len = max_len
        self.vocab_size = 1  # index 0 is taken by 'PAD'
        self.vocab_limit = vocab_limit

        dataset = data_name.lower()
        if dataset == 'sick':
            self.score_col = 'relatedness_score'
            self.sequence_cols = ['sentence_A', 'sentence_B']
        elif dataset == 'quora':
            self.score_col = 'is_duplicate'
            self.sequence_cols = ['question1', 'question2']
        else:
            if sentence_cols is None or score_col is None:
                raise ValueError(
                    "sentence_cols and score_col are required for dataset %r" % data_name)
            self.score_col = score_col
            # Was `question_cols`, an undefined name that raised NameError.
            self.sequence_cols = sentence_cols

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        # A set literal is required: set('PAD') would yield {'P', 'A', 'D'}.
        self.vocab = {'PAD'}
        self.word2index = {'PAD': 0}
        self.index2word = {0: 'PAD'}
        self.word2count = dict()

        self.use_cuda = torch.cuda.is_available()
        self.run()

    def text_to_word_list(self, text):
        """Lower-case ``text``, apply the cleanup rules in order and split on whitespace.

        Args:
            text: any object; it is converted with ``str`` first.

        Returns:
            list of str tokens.
        """
        text = str(text).lower()
        for pattern, replacement in self._CLEANUP_RULES:
            text = sub(pattern, replacement, text)
        return text.split()

    def load_data(self):
        """Read the dataset and replace each sentence by a list of word indices.

        English stop words are dropped. Unseen words are appended to the
        vocabulary (``vocab``, ``word2index``, ``index2word``) and every kept
        occurrence is counted in ``word2count``.

        Returns:
            The DataFrame with the sequence columns holding lists of int indices.
        """
        stops = set(stopwords.words('english'))

        data_df = pd.read_csv(self.data_file, sep='\t')

        for index, row in data_df.iterrows():
            for sequence in self.sequence_cols:
                s2n = []  # the sentence as vocabulary indices
                for word in self.text_to_word_list(row[sequence]):
                    if word in stops:
                        continue

                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word2index[word] = self.vocab_size
                        self.word2count[word] = 1
                        s2n.append(self.vocab_size)
                        self.index2word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word2count[word] += 1
                        s2n.append(self.word2index[word])

                # Overwrite the raw text with its index representation.
                data_df.at[index, sequence] = s2n
        return data_df

    def convert_to_tensors(self):
        """Convert the index lists and scores to torch tensors (on GPU when available)."""
        for data in [self.x_train, self.x_val]:
            for pair in data:
                pair[0] = torch.LongTensor(pair[0])
                pair[1] = torch.LongTensor(pair[1])

                if self.use_cuda:
                    pair[0] = pair[0].cuda()
                    pair[1] = pair[1].cuda()

        self.y_train = torch.FloatTensor(self.y_train)
        self.y_val = torch.FloatTensor(self.y_val)

        if self.use_cuda:
            self.y_train = self.y_train.cuda()
            self.y_val = self.y_val.cuda()

    def _collect_pairs(self, x_frame, y_values):
        """Return (pairs, scores) for the rows whose two sequences are both non-empty.

        ``y_values`` must be positionally aligned with the rows of ``x_frame``.
        """
        first_col, second_col = self.sequence_cols[0], self.sequence_cols[1]
        pairs, scores = [], []
        for (_, row), score in zip(x_frame.iterrows(), y_values):
            sequence_1 = row[first_col]
            sequence_2 = row[second_col]
            # A sentence made only of stop words becomes empty; skip such rows.
            if len(sequence_1) > 0 and len(sequence_2) > 0:
                pairs.append([sequence_1, sequence_2])
                scores.append(float(score))
        return pairs, scores

    def run(self):
        """Load the data, split it into train/validation sets and build tensors."""
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        x_train, x_val, y_train, y_val = split_data(X, Y, train_size=self.train_ratio)

        self.x_train, self.y_train = self._collect_pairs(x_train, y_train.values)

        # These counts are the sum of the scores, so they read as sample counts
        # only when the scores are 0/1 labels.
        print('Number of Training Positive Samples   :', sum(self.y_train))
        print('Number of Training Negative Samples   :', len(self.y_train) - sum(self.y_train))

        self.x_val, self.y_val = self._collect_pairs(x_val, y_val.values)

        print('Number of Validation Positive Samples   :', sum(self.y_val))
        print('Number of Validation Negative Samples   :', len(self.y_val) - sum(self.y_val))

        assert len(self.x_train) == len(self.y_train)
        assert len(self.x_val) == len(self.y_val)

        self.convert_to_tensors()

In [5]:
# Embeddings
from gensim.models import KeyedVectors

class Get_Embedding(object):
    """Build an embedding matrix for a vocabulary from a binary word2vec file.

    Attributes:
        embedding_size: vector dimensionality expected from the file (300).
        embedding_matrix: ``FloatTensor`` of shape
            ``(len(word_index) + 1, embedding_size)``; moved to GPU when CUDA is
            available.
    """

    def __init__(self, file_path, word_index):
        """
        Args:
            file_path: path of the binary word2vec file.
            word_index: mapping ``word -> row index`` in the matrix.
        """
        self.use_cuda = torch.cuda.is_available()
        self.embedding_size = 300 # Dimensionality of Google News' Word2Vec
        self.embedding_matrix = self.create_embed_matrix(file_path, word_index)

    def create_embed_matrix(self, file_path, word_index):
        """Load the vectors and copy the known words into a dense matrix.

        Words missing from the pretrained vectors keep an all-zero row.

        Returns:
            ``torch.FloatTensor`` embedding matrix (on GPU when available).
        """
        word2vec = KeyedVectors.load_word2vec_format(file_path, binary=True)

        # One row more than the vocabulary size, as in the original code, so the
        # matrix shape (and any saved checkpoint) stays the same.
        embedding_matrix = np.zeros((len(word_index)+1, self.embedding_size))

        for word, i in word_index.items():
            # Membership test and indexing are used instead of `.vocab` and
            # `.word_vec()`, which newer gensim releases no longer provide.
            if word not in word2vec:
                continue
            embedding_matrix[i] = word2vec[word]

        # Free the (large) pretrained model before allocating the tensor copy.
        del word2vec

        embedding_matrix = torch.FloatTensor(embedding_matrix)
        if self.use_cuda: embedding_matrix = embedding_matrix.cuda()

        return embedding_matrix

In [22]:
# Define the Manhattan LSTM model
import torch.nn as nn
from torch import Tensor
from torch import optim
import torch.nn.functional as F

class Manhattan_LSTM(nn.Module):
    """Siamese bidirectional LSTM scored with exp(-L1 distance).

    Both sentences share one embedding layer; each goes through its own
    single-layer bidirectional LSTM. The final hidden states are compared with
    ``exp(-sum(|h1 - h2|))``, which lies in (0, 1]; for ``data_name == 'sick'``
    the score is multiplied by 5.
    """

    def __init__(self, data_name, hidden_size, embedding, use_embedding=False, train_embedding=True):
        """
        Args:
            data_name: dataset name; only the exact value ``'sick'`` changes behavior.
            hidden_size: hidden units per LSTM direction.
            embedding: a pretrained weight tensor when ``use_embedding`` is True,
                otherwise a ``(num_embeddings, embedding_dim)`` pair.
            use_embedding: initialise the embedding layer from ``embedding``.
            train_embedding: whether the embedding weights receive gradients.
        """
        super(Manhattan_LSTM, self).__init__()
        self.data_name = data_name
        self.use_cuda = torch.cuda.is_available()
        self.hidden_size = hidden_size

        if use_embedding:
            self.embedding = nn.Embedding(embedding.shape[0], embedding.shape[1])
            self.embedding.weight = nn.Parameter(embedding)
            self.input_size = embedding.shape[1] # V - Size of embedding vector
        else:
            self.embedding = nn.Embedding(embedding[0], embedding[1])
            self.input_size = embedding[1]

        self.embedding.weight.requires_grad = train_embedding

        self.lstm_1 = nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bidirectional=True)
        self.lstm_2 = nn.LSTM(self.input_size, self.hidden_size, num_layers=1, bidirectional=True)

    def exponent_neg_manhattan_distance(self, x1, x2):
        ''' Similarity of two batches of vectors: exp(-L1 distance) along dim 1. '''
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=1))

    def forward(self, input_val, hidden):
        '''
        input_val       -> pair of index tensors, each (Sequence Length x Batch Size)
        hidden          -> (h_0, c_0), each (Num. Layers * Num. Directions x Batch Size x Hidden Size)

        Returns one similarity score per batch element.
        '''
        embedded_1 = self.embedding(input_val[0]) # L, B, V
        embedded_2 = self.embedding(input_val[1]) # L, B, V

        batch_size = embedded_1.size()[1]

        _, hidden_1 = self.lstm_1(embedded_1, hidden)
        _, hidden_2 = self.lstm_2(embedded_2, hidden)

        # h_n is (directions, B, H); move batch first and flatten both
        # directions into one feature vector per sample.
        encoded_1 = hidden_1[0].permute(1, 2, 0).contiguous().view(batch_size, -1)
        encoded_2 = hidden_2[0].permute(1, 2, 0).contiguous().view(batch_size, -1)

        similarity_scores = self.exponent_neg_manhattan_distance(encoded_1, encoded_2)

        if self.data_name == 'sick': return similarity_scores*5.0
        else: return similarity_scores

    def init_weights(self):
        ''' Initialise lstm_1 (zero biases, Xavier-normal weights) and copy it into lstm_2. '''
        for name_1, param_1 in self.lstm_1.named_parameters():
            if 'bias' in name_1:
                nn.init.constant_(param_1, 0.0)
            elif 'weight' in name_1:
                nn.init.xavier_normal_(param_1)

        # Both encoders start from identical values; they remain separate
        # parameter sets afterwards.
        self.lstm_2.load_state_dict(self.lstm_1.state_dict())

    def init_hidden(self, batch_size):
        ''' Return zero (h_0, c_0), each (2 directions x Batch Size x Hidden Size). '''
        # Allocate on the device the model lives on. The previous CPU branch
        # built c_0 with a leading dimension of 1, which a bidirectional LSTM
        # rejects.
        device = next(self.parameters()).device
        shape = (2, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=device), torch.zeros(shape, device=device))

In [7]:
import math
import time

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

class Helper(object):
    """Small utilities for progress reporting and loss plotting."""

    def __init__(self):
        pass

    def as_minutes(self, s):
        """Format a duration of ``s`` seconds as ``'<m>m <s>s'`` (values truncated)."""
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    def time_slice(self, since, percent):
        """Return ``'<elapsed> (- <estimated remaining>)'``.

        Args:
            since: start timestamp as returned by ``time.time()``.
            percent: fraction of the work already done, in (0, 1].

        When ``percent`` is not positive the remaining time cannot be
        extrapolated and is rendered as ``?`` instead of dividing by zero.
        """
        elapsed = time.time() - since
        if percent <= 0:
            return '%s (- ?)' % self.as_minutes(elapsed)
        estimated_total = elapsed / percent
        remaining = estimated_total - elapsed
        return '%s (- %s)' % (self.as_minutes(elapsed), self.as_minutes(remaining))

    def show_plot(self, points):
        """Plot ``points`` on a new figure with y-axis ticks every 0.2."""
        # plt.subplots() already creates the figure; the extra plt.figure() call
        # used to leave an additional, empty figure open.
        fig, ax = plt.subplots()
        # this locator puts ticks at regular intervals
        loc = ticker.MultipleLocator(base=0.2)
        ax.yaxis.set_major_locator(loc)
        ax.plot(points)

In [23]:
import time
import random

from nltk import bleu_score

class Run_Iterations(object):
    """Drive epoch-based training and validation of a ``Train_Network``-style model.

    ``model`` must expose ``train(inputs, scores, criterion, model_optimizer=None,
    evaluate=False)`` returning ``(loss, predicted_scores)`` and a
    ``manhattan_lstm`` attribute holding the underlying ``nn.Module``.
    """

    def __init__(self, data_name, model, x_train, y_train, index2word, batch_size, num_iters,
                 learning_rate, tracking_pair=False, x_val=[], y_val=[], print_every=1, plot_every=1):
        """
        Args:
            data_name: dataset name; ``'sick'`` enables 0-5 score scaling.
            model: training wrapper (see class docstring).
            x_train, y_train: training pairs and their scores.
            index2word: mapping from vocabulary index to word (for printing).
            batch_size: number of pairs per batch.
            num_iters: number of epochs.
            learning_rate: initial Adam learning rate.
            tracking_pair: stored on the instance; not used by this class.
            x_val, y_val: validation pairs and scores (the default lists are
                never mutated here).
            print_every, plot_every: reporting periods, in epochs.
        """
        self.use_cuda = torch.cuda.is_available()
        self.data_name = data_name
        self.model = model
        self.batch_size = batch_size
        self.num_iters = num_iters
        self.learning_rate = learning_rate
        self.criterion = nn.MSELoss()

        self.tracking_pair = tracking_pair
        self.print_every = print_every
        self.plot_every = plot_every

        self.index2word = index2word
        ''' Lists that will contain data in the form of tensors. '''
        # Training data.
        self.x_train = x_train
        self.y_train = y_train
        self.train_samples = len(self.x_train)

        # Development data.
        self.x_val = x_val
        self.y_val = y_val
        self.val_samples = len(self.x_val)

        self.help_fn = Helper()

    def train_iters(self):
        """Train for ``num_iters`` epochs, printing losses and validation metrics."""
        start = time.time()
        plot_losses = []
        print_loss_total = 0.0  # Reset every self.print_every
        plot_loss_total = 0.0  # Reset every self.plot_every

        model_trainable_parameters = list(filter(lambda p: p.requires_grad, self.model.manhattan_lstm.parameters()))
        model_optimizer = optim.Adam(model_trainable_parameters, lr=self.learning_rate)

        print('Beginning Model Training.\n')

        for epoch in range(1, self.num_iters + 1):
            for i in range(0, self.train_samples, self.batch_size):
                input_variables = self.x_train[i : i + self.batch_size] # Batch Size x Sequence Length
                similarity_scores = self.y_train[i : i + self.batch_size] # Batch Size

                loss, _ = self.model.train(input_variables, similarity_scores, self.criterion, model_optimizer)
                print_loss_total += loss
                plot_loss_total += loss

            # The reported value is the sum of batch losses divided by the
            # reporting period in epochs (not a per-batch mean).
            if epoch % self.print_every == 0:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (self.help_fn.time_slice(start, epoch / self.num_iters),
                                             epoch, epoch / self.num_iters * 100, print_loss_avg))

            if epoch % self.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

            print('Validation Accuracy: %f Validation Precision: %f Validation Recall: %f Validation Loss: %f' % self.get_accuracy())
            print('\n')

            # Decay the learning rate by 20% every 5 epochs. A fresh optimizer
            # is built, so Adam's running moment estimates restart here.
            if epoch % 5 == 0:
                self.learning_rate *= 0.80
                model_optimizer = optim.Adam(model_trainable_parameters, lr=self.learning_rate)

        self.help_fn.show_plot(plot_losses)

    def evaluate(self, seqs, scores):
        """Run the model in evaluation mode; returns ``(loss, similarity_scores)``."""
        loss, similarity_scores = self.model.train(seqs, scores, self.criterion, evaluate=True)
        return loss, similarity_scores

    def evaluate_specific(self, seqs, score, name='tracking_pair'):
        """Print one sentence pair, its true score and the predicted score."""
        sequence1 = [self.index2word[j.item()] for j in seqs[0].view(-1).data]
        sequence2 = [self.index2word[j.item()] for j in seqs[1].view(-1).data]
        print('>', sequence1)
        print('>', sequence2)
        print('=', score.item())

        # Give the target the same 1-element shape as the prediction so the
        # loss does not rely on broadcasting a 0-dim tensor.
        _, similarity_score = self.evaluate([seqs], score.view(-1))
        print('<', similarity_score.item())

    def evaluate_randomly(self, n=10):
        """Print predictions for ``n`` randomly chosen validation pairs."""
        if self.val_samples == 0:
            # random.randrange(0) would raise; nothing to show without data.
            return
        for i in range(n):
            ind = random.randrange(self.val_samples)
            self.evaluate_specific(self.x_val[ind], self.y_val[ind], name=str(i))

    def get_accuracy(self):
        """Compute validation metrics with a 0.5 decision threshold.

        Returns:
            ``(accuracy, precision, recall, total_loss)``; the first three are
            percentages, ``total_loss`` is the sum of the batch losses. All are
            0 when there is no validation data.
        """
        true_positive = 0
        true_negative = 0
        false_positive = 0
        false_negative = 0
        total_loss = 0.0

        accuracy = 0.0
        precision = 0.0
        recall = 0.0

        if len(self.x_val) == 0:
            return accuracy, precision, recall, total_loss

        # For 'sick' both the labels and the model outputs are on a 0-5 scale,
        # so both are brought back to [0, 1] before thresholding.
        scale = 1.0
        if self.data_name == 'sick': scale *= 5.0 #if sick dataset

        for i in range(0, self.val_samples, self.batch_size):
            input_variables = self.x_val[i : i + self.batch_size] # Batch Size x Sequence Length
            actual_scores = self.y_val[i : i + self.batch_size] # Batch Size

            loss, predicted_scores = self.model.train(input_variables, actual_scores, self.criterion, evaluate=True)
            total_loss += loss

            for actual, predict in zip(actual_scores, predicted_scores):
                actual_positive = actual.item()/scale >= 0.5
                predicted_positive = predict.item()/scale >= 0.5

                if actual_positive and predicted_positive:
                    true_positive += 1
                elif actual_positive:
                    false_negative += 1
                elif predicted_positive:
                    false_positive += 1
                else:
                    true_negative += 1

        accuracy = (true_positive + true_negative)*100/len(self.x_val)
        if true_positive + false_positive > 0: precision = true_positive*100/(true_positive + false_positive)
        if true_positive + false_negative > 0: recall = true_positive*100/(true_positive + false_negative)

        return accuracy, precision, recall, total_loss

In [24]:
import torch.nn.utils.rnn as rnn

class Train_Network(object):
    """Runs single training / evaluation steps on batches of sequence pairs."""

    def __init__(self, manhattan_lstm, index2word):
        """
        Args:
            manhattan_lstm: model exposing ``init_hidden(batch_size)`` and
                callable as ``model([seqs_1, seqs_2], hidden)``.
            index2word: mapping from vocabulary index to word (stored only).
        """
        self.manhattan_lstm = manhattan_lstm
        self.index2word = index2word
        self.use_cuda = torch.cuda.is_available()

    def train(self, input_sequences, similarity_scores, criterion, model_optimizer=None, evaluate=False):
        """Run one forward pass and, unless ``evaluate`` is set, one optimizer step.

        Args:
            input_sequences: list of ``[seq_1, seq_2]`` pairs of 1-D index tensors.
            similarity_scores: target scores, one per pair.
            criterion: loss function called as ``criterion(output, target)``.
            model_optimizer: optimizer to step; required when ``evaluate`` is False.
            evaluate: when True no gradients are computed and no step is taken.

        Returns:
            ``(loss as float, predicted scores as a detached 1-D tensor)``.

        Raises:
            ValueError: if ``evaluate`` is False and no optimizer was given.
        """
        if not evaluate and model_optimizer is None:
            # Fail before doing any work instead of crashing on None.step().
            raise ValueError('model_optimizer is required when evaluate is False')

        sequences_1 = [pair[0] for pair in input_sequences]
        sequences_2 = [pair[1] for pair in input_sequences]
        batch_size = len(sequences_1)

        '''
        Pad all tensors in this batch to same length.
        PyTorch pad_sequence method doesn't take pad length, making this step problematic.
        Therefore, lists concatenated, padded to common length, and then split.
        '''
        padded = rnn.pad_sequence(sequences_1 + sequences_2)
        sequences_1 = padded[:, :batch_size]
        sequences_2 = padded[:, batch_size:]

        ''' No need to send optimizer in case of evaluation. '''
        if model_optimizer is not None: model_optimizer.zero_grad()

        # Skip autograd bookkeeping during evaluation to save memory and time.
        with torch.set_grad_enabled(not evaluate):
            hidden = self.manhattan_lstm.init_hidden(batch_size)
            output_scores = self.manhattan_lstm([sequences_1, sequences_2], hidden).view(-1)
            loss = criterion(output_scores, similarity_scores)

        if not evaluate:
            loss.backward()
            model_optimizer.step()

        return loss.item(), output_scores.detach()

In [16]:
# Configuration and data loading: defines the run's hyper-parameters, reads the
# dataset through `Data`, and builds the pretrained embedding matrix.
use_cuda = torch.cuda.is_available()

# Dataset selection and input files.
data_name = "quora"
data_file = "./quora.tsv"
embd_file = "./GoogleNews-vectors-negative300.bin.gz"
# Hyper-parameters.
training_ratio = 0.8
# NOTE: `Data` stores max_len but never applies it, so sequences are not
# actually truncated to this length.
max_len = 20
tracking_pair = False
hidden_size = 50
batch_size = 32
num_iters = 7
learning_rate = 0.003

print('Model Parameters:')
print('Hidden Size                  :', hidden_size)
print('Batch Size                   :', batch_size)
print('Max. input length            :', max_len)
print('Learning rate                :', learning_rate)
print('Number of Epochs             :', num_iters)
print('--------------------------------------\n')

print('Reading Data.')
# Constructing Data runs the full load / split / tensor-conversion pipeline.
data = Data(data_name, data_file, training_ratio, max_len)

print('\n')
print('Number of training samples        :', len(data.x_train))
print('Number of validation samples      :', len(data.x_val))
print('Maximum sequence length           :', max_len)
print('\n')

print('Building Embedding Matrix')
# Rows of the matrix follow the indices in data.word2index.
embedding = Get_Embedding(embd_file, data.word2index)
embedding_size = embedding.embedding_matrix.shape[1]

Model Parameters:
Hidden Size                  : 50
Batch Size                   : 32
Max. input length            : 20
Learning rate                : 0.003
Number of Epochs             : 7
--------------------------------------

Reading Data.
Number of Training Positive Samples   : 119345.0
Number of Training Negative Samples   : 204022.0
Number of Validation Positive Samples   : 29957.0
Number of Validation Negative Samples   : 50894.0


Number of training samples        : 323367
Number of validation samples      : 80851
Maximum sequence length           : 20


Building Embedding Matrix


In [25]:
# Build the model from the pretrained embedding matrix, train it, print a few
# random validation predictions and save the weights.
print('Building model.')
model = Manhattan_LSTM(data_name, hidden_size, embedding.embedding_matrix, use_embedding=True, train_embedding=True)
if use_cuda: model = model.cuda()

# Initialise lstm_1 and copy its weights into lstm_2 (see Manhattan_LSTM.init_weights).
model.init_weights()

print("Training Network.")
train_network = Train_Network(model, data.index2word)

run_iterations = Run_Iterations(data_name, train_network, data.x_train, data.y_train, data.index2word,
                                batch_size, num_iters, learning_rate,
                                tracking_pair=tracking_pair, x_val=data.x_val, y_val=data.y_val)
run_iterations.train_iters()
run_iterations.evaluate_randomly()

# Only the state dict (weights) is saved, not the model object itself.
torch.save(model.state_dict(), './manhattan_lstm.pt')

Building model.
Training Network.
Beginning Model Training.

5m 13s (- 31m 21s) (1 14%) 3729.8734
Validation Accuracy: 62.947892 Validation Precision: 0.000000 Validation Recall: 0.000000 Validation Loss: 936.348641


11m 27s (- 28m 37s) (2 28%) 2624.2041
Validation Accuracy: 75.315086 Validation Precision: 70.852972 Validation Recall: 56.704610 Validation Loss: 426.905084


17m 58s (- 23m 57s) (3 42%) 1551.5964
Validation Accuracy: 76.083165 Validation Precision: 70.237823 Validation Recall: 61.518176 Validation Loss: 415.143936


24m 26s (- 18m 20s) (4 57%) 1359.9104
Validation Accuracy: 75.846928 Validation Precision: 68.331869 Validation Recall: 64.882999 Validation Loss: 421.806081


30m 50s (- 12m 20s) (5 71%) 1240.0588
Validation Accuracy: 75.673770 Validation Precision: 68.043913 Validation Recall: 64.759489 Validation Loss: 428.341028


37m 1s (- 6m 10s) (6 85%) 1121.8868
Validation Accuracy: 76.177165 Validation Precision: 69.309648 Validation Recall: 64.078513 Validation Los