In [2]:
import datetime
import gensim
import numpy as np
import os.path
import pandas as pd
import torch, stanza
from torch import nn
import torch.nn.functional as F
import threading


# Data Processing

This section can have stuff related to data prep.


Should the MSPC Dataset be a part of this section? - Adam

In [3]:
def read_file(file_name):
    # Note: Unable to use pd.read_csv... the function complained about an issue with the formatting of the tsv file
    # train = pd.read_csv('data/msr_paraphrase_train.txt', sep='\t', encoding='latin1')
    # train

    # opting to read file in and split columns manually to create a pandas dataframe
    list = []
    with open(file_name, encoding="utf8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            fields = line.split('\t')
            list.append(fields)

    df = pd.DataFrame(list[1:], columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])
    return df


# Model Definition

![Model Overview](./images/overview.png)


## Input Layer

In the input layer is made up of a Stanford Parser to provide a syntactic tree so that the model can extract significant words (mainly, subject, predicate, object) in the input corpus. Word2Vec is then used to map the words into vectors

In [4]:
# set 'download_method = None' to not download the resources over and over
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=False)

def trunk_construction(str, parent_label = None):
    doc = nlp(str)
    tree = doc.sentences[0].constituency

    words = construct_sentence(tree, parent_label)
    return ' '.join(words)

def construct_sentence(tree, parent_label = None, leave_pos=False):

    sentences = []
    if 'NN' in tree.label:
        if parent_label == 'NP':
            # sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    if 'VB' in tree.label:
        if parent_label == 'VP':
            #sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    for child in tree.children:
        sentences = sentences + construct_sentence(child, tree.label)

    return sentences

def find_branches(tree, label, not_in_label=None, ancestors=[]):
    branches = []
    # print("-------------")
    # print(ancestors)
    # print(f"{tree.label} == {label}")
    if tree.label == label and not_in_label not in ancestors:
        # print(f"adding {tree}")
        branches.append(tree)
    for child in tree.children:
        branches = branches + find_branches(child, label, not_in_label, ancestors + [tree.label])

    return branches

#
# # According to the paper the subject is the first NN child of NP
def find_subject(noun_phrase_for_subject):
    subject = []
    for child in noun_phrase_for_subject.children:
        if 'NN' in child.label:
            subject = subject + child.leaf_labels()

    #print(f"subject = {subject}")
    #if len(subject) > 0:
    #    return ' '.join(subject)
    return subject

    return None

def find_predicate(verb_phrase_for_predicate):
    predicate = []
    for child in verb_phrase_for_predicate.children:
        if child.label.startswith('VB'):
            predicate = predicate + child.leaf_labels()

    if len(predicate) > 0:
        return ' '.join(predicate)

    return None

def find_object(verb_phase_for_object, parent='VP'):
    objects = []
    for child in verb_phase_for_object.children:
        if child.label == 'VP':
            continue
        if 'NN' in child.label and parent in ['NP', 'PP', 'ADJP']:
            #objects = objects + child.leaf_labels()
            new_objects = child.leaf_labels()
            for new_object in new_objects:
                if new_object not in objects:
                    objects.append(new_object)
        else:
            new_objects = find_object(child, child.label)
            #if new_objects not in objects and new_objects is not None:
            for new_object in new_objects:
                if new_object not in objects:
                    objects.append(new_object)
                #objects = objects + new_objects

    return objects
    # if len(objects) > 0:
    #     #return ' '.join(objects)
    #     return objects
    # else:
    #     return None

def find_spo(tree):
    noun_phrases_for_subject = find_branches(tree, label='NP', not_in_label='VP', ancestors=[])
    subject_list = []
    for noun_phrase_for_subject in noun_phrases_for_subject:
        subject = find_subject(noun_phrase_for_subject)
        #if subject is not None:
        #   subject_list.append(subject)
        subject_list = subject_list + subject

    verb_phrases = find_branches(tree, label='VP')
    predicate_list = []
    object_list = []
    for verb_phrase in verb_phrases:
        predicate = find_predicate(verb_phrase)
        if predicate is not None:
            predicate_list.append(predicate)
        object = find_object(verb_phrase)
        object_list = object_list + object
        #if object is not None:
        #    object_list.append(object)

    # dedupe list
    subject_list = list(dict.fromkeys(subject_list))
    predicate_list = list(dict.fromkeys(predicate_list))
    object_list = list(dict.fromkeys(object_list))

    return subject_list, predicate_list, object_list

2023-04-12 12:04:43 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-12 12:04:43 INFO: Using device: cpu
2023-04-12 12:04:43 INFO: Loading: tokenize
2023-04-12 12:04:43 INFO: Loading: pos
2023-04-12 12:04:44 INFO: Loading: constituency
2023-04-12 12:04:44 INFO: Done loading processors!


In [5]:
 def test_parser(str, valid_subject, valid_predicate, valid_object):

    #new_sentence = trunk_construction(str)
    #print(new_sentence)
    #assert new_sentence == valid_sentence

    doc = nlp(str)
    tree = doc.sentences[0].constituency

    subject_list, predicate_list, object_list = find_spo(tree)
    print(f"Subject = {' '.join(subject_list)}")
    print(f"Predicate = {' '.join(predicate_list)}")
    print(f"Object = {' '.join(object_list)}")

    print(f"{subject_list} = {valid_subject}")
    print(f"{predicate_list} = {valid_predicate}")
    print(f"{object_list} = {valid_object}")
    assert subject_list == valid_subject
    assert predicate_list == valid_predicate
    assert object_list == valid_object



## Parser Test Cases
Test the parser using some of the training data sentences as input and asserting the output sentence matches the algorithm defined in the paper.

In [6]:
test_parser(""""We have an incredible amount of work to do, but it is not in [designing new] instruction set architectures.""", [], ['have', 'do', 'is', 'designing'], ['amount', 'work', 'set', 'architectures'])
test_parser("""We have got an incredible amount of work to do, but it ain't in the instruction set," he said.""",  ['instruction', 'set'], ['have', 'got', 'do', 'said'], ['amount', 'work'])
test_parser('Syrian forces launch new attacks', ['forces'], ['launch'], ['attacks'])
test_parser("""the flat tire was replaced by the driver""", ['tire'], ['was', 'replaced'], ['driver'])
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
            ['Amrozi'], ['accused', 'called', 'distorting'], ['brother',  'witness', 'evidence'])
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
         ['Amrozi'], ['Referring', 'accused', 'distorting'], ['witness', 'brother', 'evidence'])
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            ['Shares', 'Genentech', 'company', 'products', 'market'], ['rose'], ['percent'])

test_parser("""Shares of Xoma fell 16 percent in early trade, while shares of Genentech, a much larger company with several products on the market, were up 2 percent.""", ['Shares', 'Xoma'], ['fell', 'were'], ['percent', 'trade', 'shares', 'Genentech', 'company', 'products', 'market'])

test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
              ['Gyorgy', 'Heizler', 'head', 'disaster', 'unit'], ['said', 'was', 'carrying'], ['coach', 'passengers'])
test_parser("""The head of the local disaster unit, Gyorgy Heizler, said the coach driver had failed to heed red stop lights.""",
            ['head', 'disaster', 'unit', 'Gyorgy', 'Heizler'], ['said', 'had', 'failed', 'heed'], ['coach', 'driver', 'stop', 'lights'])
# test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
#             "wife said was percent George Bush looked using years training war")
test_parser("""Sheena Young of Child, the national infertility support network, hoped the guidelines would lead to a more "fair and equitable" service for infertility sufferers""", ['Sheena', 'Young', 'Child', 'network'], ['hoped', 'lead'], ['guidelines', 'service', 'infertility', 'sufferers'])
test_parser("""Sheena Young, for Child, the national infertility support network, said the proposed guidelines should lead to a more "fair and equitable" service for infertility sufferers.""", ['Sheena', 'Young', 'Child', 'network'], ['said', 'lead'], ['guidelines', 'service', 'infertility', 'sufferers'])
#
# test_parser("""Among CNN viewers, 29 percent said they were Republicans and 36 percent called themselves conservatives.""",
#             "CNN viewers percent said were Republicans percent called conservatives")
# test_parser("""Out of Fox viewers, 41 percent describe themselves as Republicans, 24 percent as Democrats and 30 percent as Independents""",
#             "Fox viewers percent describe Republicans percent Democrats percent Independents")

# Note: stanza parser has a problem with the below sentence.  It is unable to parse it correctly
# test_parser("""Sheena Young, for Child, the national infertility support network, said the proposed guidelines should lead to a more "fair and equitable" service for infertility sufferers.""", "")
# test_parser("""Among Fox viewers, 41 percent describe themselves as Republicans, 24 percent as Democrats and 30 percent as Independents""", "Fox viewers percent describe Republicans percent Democrats percent Independents")

Subject = 
Predicate = have do is designing
Object = amount work set architectures
[] = []
['have', 'do', 'is', 'designing'] = ['have', 'do', 'is', 'designing']
['amount', 'work', 'set', 'architectures'] = ['amount', 'work', 'set', 'architectures']
Subject = instruction set
Predicate = have got do said
Object = amount work
['instruction', 'set'] = ['instruction', 'set']
['have', 'got', 'do', 'said'] = ['have', 'got', 'do', 'said']
['amount', 'work'] = ['amount', 'work']
Subject = forces
Predicate = launch
Object = attacks
['forces'] = ['forces']
['launch'] = ['launch']
['attacks'] = ['attacks']
Subject = tire
Predicate = was replaced
Object = driver
['tire'] = ['tire']
['was', 'replaced'] = ['was', 'replaced']
['driver'] = ['driver']
Subject = Amrozi
Predicate = accused called distorting
Object = brother witness evidence
['Amrozi'] = ['Amrozi']
['accused', 'called', 'distorting'] = ['accused', 'called', 'distorting']
['brother', 'witness', 'evidence'] = ['brother', 'witness', 'evidence

## Concurrency Parsing
Added support for concurrent parsing.  This can help in the performance of the preprocessing

In [7]:
class SentenceProcessingThread(threading.Thread):
    def __init__(self, sentences, output_list, begin, end):
        super(SentenceProcessingThread, self).__init__()
        self.sentences = sentences
        self.nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=True)
        self.output_list = output_list
        self.begin = begin
        self.end = end

    def trunk_construction(self, str, parent_label = None):
        doc = self.nlp(str)
        tree = doc.sentences[0].constituency

        #words = construct_sentence(tree, parent_label)
        #return ' '.join(words)

        subjects, predicates, objects = find_spo(tree)

        return f"{' '.join(subjects)},{' '.join(predicates)},{' '.join(objects)}"

    def run(self):
        print(f"going to process {self.begin} to {self.end}")
        for i, sentence in enumerate(self.sentences):
            new_sentence = self.trunk_construction(sentence)
            self.output_list[self.begin + i] = new_sentence

def process_sentences_concurrently(sentences, output, p=2):
    total = len(sentences)
    interval = int(total / p)
    threads = []
    for i in range(p):
        s = i*interval
        if i == p-1:
            e = total
        else:
            e = (i+1) * interval
        sentences_slice = sentences[s:e]
        sentence_thread = SentenceProcessingThread(sentences_slice, output, s, e)
        sentence_thread.start()
        threads.append(sentence_thread)

    for thread in threads:
        thread.join()

def preprocess_corpus(input_file='data/msr_paraphrase_train.txt', output_file='data/msr_paraphrase_train_stanza.txt', N=None):
    print(output_file)
    if os.path.exists(output_file):
        print(f"{output_file} already exists")
        return

    starttime = datetime.datetime.now()
    df = read_file(input_file)

    if N is None:
        N = len(df.String1)

    output1 = [None] * N
    output2 = [None] * N

    # we can process with more threads if we only have CPU
    p = 8

    if torch.cuda.is_available():
        # if cuda is available we don't need that many threads
        # and if the number of threads is set too large using cuda
        # we can get out of memory exceptions
        p = 2
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String1[:N], output1, p)

    # try and be careful with gpu memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String2[:N], output2, p)

    endtime = datetime.datetime.now()

    print(f"time to process {N*2} sentences is {endtime - starttime}")

    stanza_df = df[:N]

    processed_string1 = pd.Series(output1)
    # processed_string1.apply(gensim.utils.simple_preprocess)
    processed_string2 = pd.Series(output2)
    #processed_string2.apply(gensim.utils.simple_preprocess)

    stanza_df.String1 = processed_string1
    stanza_df.String2 = processed_string2

    # write the file out.  This can help in the future
    print(f"about to write out {output_file}")
    stanza_df.to_csv(output_file, sep="\t")


## Sentence Preprocessing
pass the input sentences from the training dataset through the stanford/stanza parser, extracting the relevant parts of speech and then tokenize the processed sentences using the gensim.utils.simple_preprocess utility

In [8]:
# start_time = datetime.datetime.now()
#
# processed_string1 = df[:500].String1.apply(trunk_construction)
# processed_string2 = df[:500].String2.apply(trunk_construction)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with stanza library took {end_time - start_time}")
#
# start_time = datetime.datetime.now()
#
# processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
# processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
# print(f"Number of sentences processed in the String1 column: len(processed_string1")
# print(f"Number of sentences processed in the String2 column: len(processed_string2")

# Word2Vec Embeddings
Take the preprocessed and tokenized sentences and use Word2Vec to get the word embeddings.  Take each word embedding in a sentence and find the mean which will represent the embedding for the sentence.

In [9]:
# from gensim.models import Word2Vec
#
# corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)
#
# #model = Word2Vec(sentences=corpus, vector_size=(len(processed_string1) + len(processed_string2)), min_count=2)
# # set vector size to reduce computational complexity
# model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)
# #model.build_vocab(sentences=corpus)
# #model.train(corpus, total_examples=model.corpus_count, epochs=5)
#
# print(model)
# print(model.wv.key_to_index)
#
# model.wv.get_vector('president')
#
# # for index, word in enumerate(model.wv.index_to_key):
# #     if index == 120:
# #         break
# #     print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

In [10]:
from gensim.models import Word2Vec

# Function is broken out for testing purposes
def generate_word2vec_model(corpus):
    # Creating the Word2Vec model
    model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)

    return model


In [11]:
# Function is broken out for testing purposes
def sentence_embeddings(w2v_model, sentence, size):
    np_embedding = np.zeros(size)
    for i, word in enumerate(sentence):
        #print(word)
        np_embedding[i] = w2v_model.wv.get_vector(word)

    return np_embedding
    # list = []
    # for word in sentence:
    #     list.append(w2v_model.wv.get_vector(word))
    #
    # word_matrix = np.row_stack(list)
    # #return np.mean(word_matrix, axis=0)
    # return word_matrix

In [12]:
def test_word2vec():

    df = read_file('data/msr_paraphrase_train.txt')

    sentences1 = df.String1[:5].apply(gensim.utils.simple_preprocess)
    sentences2 = df.String2[:5].apply(gensim.utils.simple_preprocess)

    corpus = pd.concat([sentences1, sentences2], ignore_index=True)

    max_sentence_len = corpus.apply(len).max()

    model = generate_word2vec_model(corpus)

    embedding = sentence_embeddings(model, corpus[0], (max_sentence_len, 50))
    assert embedding.shape == (max_sentence_len, 50)

    # Not sure if this will always generate the same embedding
    # test_embedding = np.array([-0.00113049, -0.00124808,  0.00252251,  0.00058141,  0.00187964,  0.00379025,
    #           -0.00012356,  0.00347055, -0.00241507,  0.00545258, -0.00574078, -0.00489824,
    #           -0.00224492,  0.00744946,  0.00350835, -0.00139295, -0.00081134,  0.00655962,
    #           0.00244374, -0.00447209, -0.00124291, -0.00092616,  0.0021044,  -0.00092541,
    #           0.00284307,  0.00367638,  0.00364716,  0.00519976, -0.00088121,  0.00109841,
    #           -0.00219322, -0.00372483,  0.00078702, -0.00612309, -0.00312131,  0.00088071,
    #           0.00503909, -0.0009484,  -0.00068209, -0.0004782,   0.00367015,  0.00314679,
    #           -0.00302592,  0.00346377,  0.00151145, -0.00076442, -0.0012528,  -0.00087095,
    #           -0.00075365,  0.00468711])
    #
    #
    # # not sure if this will always be equal based on comment on test_embedding variable
    # assert np.allclose(embedding, test_embedding)

test_word2vec()

In [13]:
def init_word2vec(train_input_file, test_input_file):

    file_parts = os.path.splitext(train_input_file)
    train_output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
    print("About to preprocess data")
    preprocess_corpus(input_file=train_input_file, output_file=train_output_file)
    print("Done preprocessing data")

    file_parts = os.path.splitext(test_input_file)
    test_output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
    print("About to preprocess data")
    preprocess_corpus(input_file=test_input_file, output_file=test_output_file)
    print("Done preprocessing data")


    train_df = pd.read_csv(train_output_file, sep="\t")
    test_df = pd.read_csv(test_output_file, sep="\t")

    # train_df = read_file(train)
    # test_df = read_file(test)

    train_sentences1 = train_df.String1.apply(gensim.utils.simple_preprocess)
    train_sentences2 = train_df.String2.apply(gensim.utils.simple_preprocess)
    test_sentences1 = test_df.String1.apply(gensim.utils.simple_preprocess)
    test_sentences2 = test_df.String2.apply(gensim.utils.simple_preprocess)

    corpus = pd.concat([train_sentences1, train_sentences2, test_sentences1, test_sentences2], ignore_index=True)
    max_sentence_len = corpus.apply(len).max()

    word2vec = generate_word2vec_model(corpus)


    return word2vec, max_sentence_len


In [14]:
def corpus_embeddings(model, corpus, max_sentence_len):
    corpus_size = len(corpus)
    embeddings_list = []
    embedding_matrix = np.zeros((corpus_size, max_sentence_len, 50))
    for i, sentence in enumerate(corpus):
        embeddings = sentence_embeddings(model, sentence, size=(max_sentence_len, 50))
        embedding_matrix[i] = embeddings
        embeddings_list.append(embeddings)

    return embedding_matrix

In [82]:
from torch.utils.data import DataLoader, Dataset

# Dataset for the MSPC dataset
class MSPCDataset(Dataset):
    """
    Arguments:
        tsv_file (string): path to the tsv file with sentences to compare and associate quality score
        num_records (int): number of records to load.  Defaults to None which is all
    """
    def __init__(self, tsv_file, w2v_model, max_sentence_length, num_records=None):

        self.max_sentence_len = max_sentence_length
        self.w2v_model = w2v_model

        file_parts = os.path.splitext(tsv_file)
        output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
        print("About to preprocess data")
        preprocess_corpus(input_file=tsv_file, output_file=output_file)
        print("Done preprocessing data")


        #df = read_file('data/msr_paraphrase_train.txt')
        df = pd.read_csv(output_file, sep="\t")

        if num_records is not None:
            processed_string1 = df[:num_records].String1
            processed_string2 = df[:num_records].String2
            self.quality = df[:num_records].Quality
        else:
            processed_string1 = df.String1
            processed_string2 = df.String2
            self.quality = df.Quality

        processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
        processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)

        #corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)


        #self.max_sentence_len = corpus.apply(len).max()

        #w2v_model = generate_word2vec_model(corpus)

        sentence_embeddings1 = corpus_embeddings(self.w2v_model, processed_string1, max_sentence_len=self.max_sentence_len)
        sentence_embeddings2 = corpus_embeddings(self.w2v_model, processed_string2, max_sentence_len=self.max_sentence_len)

        #self.w2v_model = w2v_model
        self.sentences_embeddings1 = sentence_embeddings1
        self.sentences_embeddings2 = sentence_embeddings2

        # print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
        print(f"Number of sentences processed in the String1 column: {len(processed_string1)}")
        print(f"Number of sentences processed in the String2 column: {len(processed_string2)}")
        #print(self.sentences_embeddings1)

    def __len__(self):
        return len(self.sentences_embeddings1)

    def __getitem__(self, i):
        #return torch.FloatTensor(np.stack((self.sentences_embeddings1[i], self.sentences_embeddings2[i]))), self.quality[i]
        return torch.FloatTensor(self.sentences_embeddings1[i]), torch.FloatTensor(self.sentences_embeddings2[i]), self.quality[i]

    def get_max_sentence_length(self):
        return self.max_sentence_len

In [83]:
def test_dataset():
    word2vec, max_sentence_length = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt')
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', word2vec, max_sentence_length, 10)
    assert len(dataset) == 10

    x1, y1 = dataset[0]
    assert x1.shape[0] == 2
    assert x1.shape[1] == max_sentence_length
    assert x1.shape[2] == 50

    # for set in dataset:
    #     print(len(set[0]), len(set[1]))
    # print(len(dataset[0][0]))
    # print(dataset[0])
test_dataset()

About to preprocess data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing data
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
About to preprocess data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing data
Number of sentences processed in the String1 column: 10
Number of sentences processed in the String2 column: 10


ValueError: too many values to unpack (expected 2)

## Dataloaders
Create training and test dataloaders

In [84]:
word2vec, max_sentence_length = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt')
train_dataset = MSPCDataset('data/msr_paraphrase_train.txt', word2vec, max_sentence_length)
test_dataset = MSPCDataset('data/msr_paraphrase_test.txt', word2vec, max_sentence_length)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

About to preprocess data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing data
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
About to preprocess data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing data
Number of sentences processed in the String1 column: 4076
Number of sentences processed in the String2 column: 4076
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
Number of sentences processed in the String1 column: 1725
Number of sentences processed in the String2 column: 1725


In [19]:
def conv_output_volume(W, F, S, P):

    """
    TODO: Given the input volume size $W$, the kernel/filter size $F$,
    the stride $S$, and the amount of zero padding $P$ used on the border,
    calculate the output volume size.
    Note the output should a integer.
    """

    # your code here
    #https://cs231n.github.io/convolutional-networks/

    return int((W-F+2*P)/S+1)

In [20]:
print(conv_output_volume(50, 3, 1, 1))

50


## Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determine by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [21]:
# #https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc
# """
# TODO: Remove hardcoded values and max it dynamic.
# """
# class DynamicKMaxPooling(nn.Module):
#     def __init__(self, k_init, conv_layers, layer):
#         super().__init__()
#         # "L is the total  number  of  convolutional  layers
#         # in  the  network;
#         # ktop is the fixed pooling parameter for the
#         # topmost  convolutional  layer"
#         self.k_init = k_init
#         self.conv_layers = conv_layers
#         self.layer = layer
#
#     def forward(self, X):
#         s = 50
#         dyn_k = ((self.conv_layers - self.layer) / self.conv_layers) * 3
#         k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
#         print(k_max)
#         out = F.max_pool1d(X, kernel_size=k_max)
#         return out

## Sentence Similarity Convolution Network (SSCN)

### Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determine by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [22]:
#https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc
"""
TODO: Remove hardcoded values and max it dynamic.
"""
class DynamicKMaxPooling(nn.Module):
    def __init__(self, k_init, conv_layers):
        super().__init__()
        # "L is the total  number  of  convolutional  layers
        # in  the  network;
        # ktop is the fixed pooling parameter for the
        # topmost  convolutional  layer"
        self.k_init = k_init
        self.conv_layers = conv_layers

    def pool(self, X, l):
        # s is sequence length
        # l is current layer in network
        s = X.shape[2]
        dyn_k = ((self.conv_layers - l) / self.conv_layers) * s
        k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
        return F.max_pool1d(X, kernel_size=k_max)

    def forward(self, X):
        for layer_i in range(self.conv_layers,0,-1):
            X = self.pool(X, layer_i)

        return X

Testing Dynamic K-Max Pooling Layer

In [23]:
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
OUTPUT_SIZE = 15

test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE))
dyn_k_layer = DynamicKMaxPooling(3, SAMPLE_SIZE)

# Call forward with convolution layer index [2,1]
out = dyn_k_layer(test_embedding)

assert out.shape[2] == 1
assert out.shape[1] == SAMPLE_SIZE
assert out.shape[0] == NUM_OF_SAMPLES

### Sentence Similarity

 \begin{align*} Man(\vec V_{x}, \vec V_{y})=&\left |{ x_{1}-y_{1} }\right |\! +\! \left |{ x_{2}-y_{2} }\right | \!+ \!\ldots \!+ \!\left |{ x_{n}-y_{n} }\right |
 \\ score=&e^{-Man(\vec V_{x}, \vec V_{y})},\quad score\in [{0,1}] \end{align*}

In [24]:
"""
* X: Pooled output of SSCN model of shape (sample_size, -1)
* For the purpose of this experiment sample_size = 2
"""
def manhattan_similarity_score(X):
    sample_count, _, M = X.shape
    Vx = X[:,0].reshape((sample_count,M))
    Vy = X[:,1].reshape((sample_count,M))
    mdist = torch.sum(torch.abs(Vx-Vy),dim=1).view(sample_count,-1)
    score = torch.exp(-1*mdist)
    return score

In [59]:
class SentenceSimilarityCNN(nn.Module):
    def __init__(self, max_sentence_size, stride=1, kernel_size=3, padding=1):
        super(SentenceSimilarityCNN, self).__init__()
        self.max_sentence_size = max_sentence_size
        self.stride = stride
        self.kernel_size = kernel_size
        self.padding = padding

        self.conv1 = nn.Conv2d(in_channels=2, out_channels=2, kernel_size=self.kernel_size, padding=self.padding)
        self.activation = nn.ReLU()

        # [64, 2, 19, 50] -> [B, num_sent, max_sent_len, embedding_size]
        out1 = ((19 - self.kernel_size + 2*self.padding)/self.stride) + 1
        out2 = ((50 - self.kernel_size + 2*self.padding)/self.stride) + 1

        print(f"out1: {out1} & out2: {out2}")

        # TODO: change this to k-max pooling
        self.pooling = nn.MaxPool2d(kernel_size=3)

        pool_out1 = (out1 - self.kernel_size) + 1

        pool_out2 = (out2 - self.kernel_size) + 1

        print(f"pool_out1: {pool_out1} & pool_out2: {pool_out2}")


        output_volume = conv_output_volume(50, self.kernel_size, self.padding, self.stride)
        print(f"output_volume = {output_volume}")
        self.fc = nn.Linear(in_features=192, out_features=100)

    def forward(self, x):
        #print(f"shape before convolution {x.shape}")
        x = self.conv1(x)
        #print(f"shape after convolution {x.shape}")

        x = self.activation(x)
        x = self.pooling(x)
        #print(f"shape after pooling {x.shape}")

        # permute_x = torch.permute(x, (1, 0, 2, 3))
        # x1 = permute_x[0]
        # x2 = permute_x[1]

        x = x.view(x.size()[0], -1)

        x = self.fc(x)
        x = x.reshape(x.shape[0], 2, 50)
        x = torch.permute(x, (1, 0, 2))
        man_dist = torch.sum(torch.abs(x[0] - x[1]), axis=1)
        # sentence1_mean = torch.mean(x1, axis=1)
        # sentence2_mean = torch.mean(x2, axis=1)
        # man_dist = torch.sum(torch.abs(sentence1_mean - sentence2_mean), axis=1)
        # print(man_dist.shape)

        return torch.exp(-man_dist)

In [146]:

model = SentenceSimilarityCNN(train_dataset.get_max_sentence_length())
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
n_epochs=500
def train(train_loader, n_epochs=n_epochs):
    model.train()
    for epoch in range(n_epochs):
        curr_epoch_loss = []
        for sentence_pairs, y in train_loader:
            y_hat = model(sentence_pairs)
            # print("training")
            # print(y_hat)
            # print(y)
            # print("======")
            loss = criterion(y_hat, y.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            curr_epoch_loss.append(loss.cpu().data.numpy())

        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")

    return model

start_time = datetime.datetime.now()
model = train(train_dataloader)
end_time = datetime.datetime.now()
print("Number of epochs: ", n_epochs)
print ("Training dataset size", len(train_dataset))
print("Training time: ", (end_time - start_time))


out1: 19.0 & out2: 50.0
pool_out1: 17.0 & pool_out2: 48.0
output_volume = 50


ValueError: too many values to unpack (expected 2)

In [65]:
def eval_model(model, test_dataloader):
    model.eval()
    for sentence_pairs, y in test_dataloader:
        y_hat = model(sentence_pairs)
        print(y_hat)
        y_pred = torch.zeros(y_hat.shape)
        y_pred = (y_hat > 0.7).int()
        print(y_pred)
        print(y)
        break

eval_model(model, train_dataloader)

tensor([0.1844, 0.1844, 0.1844, 0.1844, 0.1844, 0.1769, 0.1844, 0.1844, 0.1844,
        0.1844, 0.1851, 0.1806, 0.2096, 0.1844, 0.1844, 0.1835, 0.1844, 0.1844,
        0.1575, 0.1667, 0.1869, 0.1844, 0.1844, 0.1868, 0.1844, 0.2586, 0.1844,
        0.1844, 0.1844, 0.1754, 0.1757, 0.1879, 0.1844, 0.2246, 0.1844, 0.1844,
        0.1844, 0.1844, 0.1844, 0.1844, 0.1844, 0.1844, 0.1948, 0.1844, 0.1844,
        0.1844, 0.1844, 0.1844, 0.2117, 0.1844, 0.1844, 0.1844, 0.1688, 0.1844,
        0.1948, 0.1958, 0.1741, 0.1844, 0.1844, 0.1971, 0.1867, 0.1844, 0.1844,
        0.1844], grad_fn=<ExpBackward0>)
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)
tensor([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
        0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
   

In [155]:
class KMaxPool1d(nn.Module):
    def __init__(self, k):
        super(KMaxPool1d, self).__init__()
        self.k = k

    def forward(self, x):
        # input shape (batch_size, num_channels, sequence_length)
        # output shape (batch_size num_channels, k)
        k_max_values, k_max_indices  = torch.topk(x, self.k, dim=2)
        return k_max_values

In [None]:
class DynamicKMaxPoolId(nn.Module):
    def __init__(self, k, l, L):
        super(DynamicKMaxPoolId, self).__init__()
        self.k = k
        self.l = l
        self.L = L

    def forward(self, x, sentence_length, l, L):
        ktop = max((self.L - self.l)/self.L * sentence_length)
        k_max_values, k_max_indices = torch.topk(x, ktop, dim=2)
        return k_max_values


In [173]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentenceSimilarityCNN2(nn.Module):
    def __init__(self, embedding_dim, num_filters, filter_size, hidden_dim, dropout_prob=0.5):
        super(SentenceSimilarityCNN2, self).__init__()
        #self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # self.conv_layers = nn.ModuleList([
        #     nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
        #     for fs in filter_sizes
        # ])

        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_size)
        #self.conv2 = nn.Conv1d(in_channels= num_filters, out_channels=, kernel_size=filter_size)

        self.k = 3
        self.kmaxPool1d = KMaxPool1d(k=3)

        self.fc1 = nn.Linear(self.k * num_filters , hidden_dim)


        #self.fc1 = nn.Linear(num_filters * len(filter_size), hidden_dim)
        #self.dropout = nn.Dropout(dropout_prob)
        #self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, input1_embedded, input2_embedded):

        # input: input1_embedded is sentence1 and input2_embedding is sentence2
        # input shape: (batch_size, max_sentence_length, embedding_size)
        #output shap: (50) dimension vector that represents the sentence
        # output: similarity score
        sent_length1 = torch.nonzero(input1_embedded).max(dim=0).values[1] + 1
        sent_length2 = torch.nonzero(input2_embedded).max(dim=0).values[1] + 1
        print(f"sent length1: {sent_length1} AND sent length2: {sent_length2}")
        # Embedding
        #input1_embedded = self.embedding(input1)
        #input2_embedded = self.embedding(input2)

        # Convolution
        #input1_conv = [F.relu(conv(input1_embedded.permute(0, 2, 1))) for conv in self.conv_layers]
        #input2_conv = [F.relu(conv(input2_embedded.permute(0, 2, 1))) for conv in self.conv_layers]

        input1_embedded = self.conv1(input1_embedded.permute(0, 2, 1))
        input2_embedded = self.conv1(input2_embedded.permute(0, 2, 1))
        #print(f"output of convolution: {input1_embedded.shape}")

        input1_embedded = torch.relu(input1_embedded)
        input2_embedded = torch.relu(input2_embedded)

        input1_embedded = self.kmaxPool1d(input1_embedded)
        input2_embedded = self.kmaxPool1d(input2_embedded)
        input1_embedded = input1_embedded.view(input1_embedded.shape[0], input1_embedded.shape[1] * input1_embedded.shape[2])
        input2_embedded = input2_embedded.view(input2_embedded.shape[0], input2_embedded.shape[1] * input2_embedded.shape[2])

        #kmax_input1_embedded = self.kmaxPool1d(input1_embedded)
        #input1_embedded = F.max_pool1d(input1_embedded, input1_embedded.shape[2]).squeeze(2)
        #nput2_embedded = F.max_pool1d(input2_embedded, input2_embedded.shape[2]).squeeze(2)
        #print(f"output of max pool: {input1_embedded.shape}")
        #print(f"output of kmax pool: {kmax_input1_embedded.shape}")

        input1_embedded = self.fc1(input1_embedded)
        input2_embedded = self.fc1(input2_embedded)
        #print(input1_embedded.shape)

        man_dist = torch.sum(torch.abs(input1_embedded - input2_embedded), axis=1)
        # sentence1_mean = torch.mean(x1, axis=1)
        # sentence2_mean = torch.mean(x2, axis=1)
        # man_dist = torch.sum(torch.abs(sentence1_mean - sentence2_mean), axis=1)
        # print(man_dist.shape)

        return torch.exp(-man_dist)


        #input2_embedded = self.conv1(input2_embedded.permute(0, 2, 1))


        # Max pooling
        # input1_pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in input1_conv]
        # input2_pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in input2_conv]
        #
        # # Concatenate and flatten
        # input1_concat = torch.cat(input1_pooled, dim=1)
        # input2_concat = torch.cat(input2_pooled, dim=1)
        #
        # # Concatenate the two sentence representations
        # sentence_similarity = torch.cat([input1_concat, input2_concat], dim=1)
        #
        # # Dense layers
        # #sentence_similarity = self.dropout(F.relu(self.fc1(sentence_similarity)))
        # #sentence_similarity = self.fc2(sentence_similarity)
        #
        # return torch.sigmoid(sentence_similarity)

In [174]:
vocab_size = len(word2vec.wv)
embedding_dim = 50
num_filters = 64
filter_size = 3
hidden_dim = 50
dropout = 0.5

model2 = SentenceSimilarityCNN2(embedding_dim, num_filters, filter_size, hidden_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model2.parameters(), lr=1e-1)
n_epochs=100
def train2(train_loader, n_epochs=n_epochs):
    model2.train()
    for epoch in range(n_epochs):
        curr_epoch_loss = []
        for x1, x2, y in train_loader:
            #print(x1.shape)
            y_hat = model2(x1, x2)
            loss = criterion(y_hat, y.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            curr_epoch_loss.append(loss.cpu().data.numpy())

        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")

    return model2
start_time = datetime.datetime.now()

model2 = train2(train_dataloader)

end_time = datetime.datetime.now()
print("Number of epochs: ", n_epochs)
print ("Training dataset size", len(train_dataset))
print("Training time: ", (end_time - start_time))

sent length1: 14 AND sent length2: 14
sent length1: 16 AND sent length2: 18
sent length1: 15 AND sent length2: 15
sent length1: 14 AND sent length2: 16
sent length1: 15 AND sent length2: 15
sent length1: 16 AND sent length2: 16
sent length1: 15 AND sent length2: 16
sent length1: 16 AND sent length2: 15
sent length1: 19 AND sent length2: 16
sent length1: 14 AND sent length2: 16
sent length1: 16 AND sent length2: 16
sent length1: 16 AND sent length2: 15
sent length1: 17 AND sent length2: 17
sent length1: 16 AND sent length2: 17
sent length1: 16 AND sent length2: 17
sent length1: 14 AND sent length2: 18
sent length1: 15 AND sent length2: 17
sent length1: 18 AND sent length2: 16
sent length1: 15 AND sent length2: 17
sent length1: 16 AND sent length2: 17
sent length1: 16 AND sent length2: 16
sent length1: 17 AND sent length2: 16
sent length1: 18 AND sent length2: 16
sent length1: 16 AND sent length2: 17
sent length1: 17 AND sent length2: 15
sent length1: 16 AND sent length2: 16
sent length1

KeyboardInterrupt: 

In [170]:
def eval_model2(model, test_dataloader):
    model.eval()
    Y_pred = []
    Y = []
    for x1, x2, y in test_dataloader:
        y_hat = model(x1, x2)
        #print(y_hat)
        y_pred = torch.zeros(y_hat.shape)
        y_pred = (y_hat > 0.5).int()

        Y_pred = np.concatenate((Y_pred, y_pred), axis=0)
        Y = np.concatenate((Y, y), axis=0)

        #print(y_pred)
        #print(y)
    return Y_pred, Y

y_pred, y = eval_model2(model2, test_dataloader)

from sklearn.metrics import accuracy_score
print("size of test corpus = ", len(test_dataset))
print("accuracy = ", accuracy_score(y, y_pred))

size of test corpus =  1725
accuracy =  0.6863768115942029


 bn_Testing Similarity Scoring Function_

In [170]:
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
OUTPUT_SIZE = 6

test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE))
scores = manhattan_similarity_score(test_embedding)

assert scores.shape == (NUM_OF_SAMPLES, 1)


In [171]:
class SSCN(nn.Module):
    def __init__(self, sample_size, stride=1, kernel_size=3, padding=1):
        super().__init__()
        self.stride = stride
        self.kernel_size = kernel_size
        self.padding = padding
        self.conv_layers =sample_size

        #NN layers
        self.conv1 = nn.Conv1d(in_channels=self.conv_layers, out_channels=self.conv_layers, \
                               kernel_size=self.kernel_size, padding=self.padding)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=self.conv_layers, out_channels=self.conv_layers, \
                               kernel_size=self.kernel_size, padding=self.padding)
        self.relu2 = nn.ReLU()
        self.pool1 = DynamicKMaxPooling(self.kernel_size, self.conv_layers)

        self.sscn = nn.Sequential(self.conv1, self.relu1, self.conv2, self.relu2, self.pool1)

    """
    * X: Pooled output of SSCN model of shape (sample_size, -1)
    * For the purpose of this experiment sample_size = 2
    """
    def manhattan_similarity_score(self, X):
        score = manhattan_similarity_score(X)
        return score

    def forward(self, X):
        # X = self.conv1(X)
        # print(X.shape)
        # X = self.relu1(X)
        # print(X.shape)
        # X = self.conv1(X)
        # print(X.shape)
        # X = self.relu2(X)
        # print(X.shape)
        # X = self.pool1(X)
        # print(X.shape)
        # X = self.manhattan_similarity_score(X)
        # print(X.shape)
        X = self.manhattan_similarity_score(self.sscn(X))
        return X

__Testing:__

In [172]:
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
UNIQUE_FEATURES = 18

test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, UNIQUE_FEATURES))

model = SSCN(SAMPLE_SIZE)
# shape (batch,sample,sentence,word)?
print(model)

out = model(test_embedding)

assert out.shape[0] == NUM_OF_SAMPLES
assert out.shape[1] == 1
print(out.shape)


SSCN(
  (conv1): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu1): ReLU()
  (conv2): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool1): DynamicKMaxPooling()
  (sscn): Sequential(
    (0): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): DynamicKMaxPooling()
  )
)
torch.Size([20, 1])
