In [242]:
import datetime
import gensim
import numpy as np
import os.path
import pandas as pd
import torch, stanza
from torch import nn
import torch.nn.functional as F
import threading


# Data Processing

This section can have stuff related to data prep.


Should the MSPC Dataset be a part of this section? - Adam

In [243]:
def read_file(file_name):
    # Note: Unable to use pd.read_csv... the function complained about an issue with the formatting of the tsv file
    # train = pd.read_csv('data/msr_paraphrase_train.txt', sep='\t', encoding='latin1')
    # train

    # opting to read file in and split columns manually to create a pandas dataframe
    list = []
    with open(file_name, encoding="utf8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            fields = line.split('\t')
            list.append(fields)

    df = pd.DataFrame(list[1:], columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])
    return df


# Model Definition

![Model Overview](./images/overview.png)


## Input Layer

In the input layer is made up of a Stanford Parser to provide a syntactic tree so that the model can extract significant words (mainly, subject, predicate, object) in the input corpus. Word2Vec is then used to map the words into vectors

In [358]:
# set 'download_method = None' to not download the resources over and over
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=False)

def trunk_construction(str, parent_label = None):
    doc = nlp(str)
    tree = doc.sentences[0].constituency

    words = construct_sentence(tree, parent_label)
    return ' '.join(words)

def construct_sentence(tree, parent_label = None, leave_pos=False):

    sentences = []
    if 'NN' in tree.label:
        if parent_label == 'NP':
            # sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    if 'VB' in tree.label:
        if parent_label == 'VP':
            #sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    for child in tree.children:
        sentences = sentences + construct_sentence(child, tree.label)

    return sentences

def find_branches(tree, label, not_in_label=None, ancestors=[]):
    branches = []
    # print("-------------")
    # print(ancestors)
    # print(f"{tree.label} == {label}")
    if tree.label == label and not_in_label not in ancestors:
        # print(f"adding {tree}")
        branches.append(tree)
    for child in tree.children:
        branches = branches + find_branches(child, label, not_in_label, ancestors + [tree.label])

    return branches

#
# # According to the paper the subject is the first NN child of NP
def find_subject(noun_phrase_for_subject):
    subject = []
    for child in noun_phrase_for_subject.children:
        if 'NN' in child.label:
            subject = subject + child.leaf_labels()

    #print(f"subject = {subject}")
    #if len(subject) > 0:
    #    return ' '.join(subject)
    return subject

    return None

def find_predicate(verb_phrase_for_predicate):
    predicate = []
    for child in verb_phrase_for_predicate.children:
        if child.label.startswith('VB'):
            predicate = predicate + child.leaf_labels()

    if len(predicate) > 0:
        return ' '.join(predicate)

    return None

def find_object(verb_phase_for_object, parent='VP'):
    objects = []
    for child in verb_phase_for_object.children:
        if child.label == 'VP':
            continue
        if 'NN' in child.label and parent in ['NP', 'PP', 'ADJP']:
            #objects = objects + child.leaf_labels()
            new_objects = child.leaf_labels()
            for new_object in new_objects:
                if new_object not in objects:
                    objects.append(new_object)
        else:
            new_objects = find_object(child, child.label)
            #if new_objects not in objects and new_objects is not None:
            for new_object in new_objects:
                if new_object not in objects:
                    objects.append(new_object)
                #objects = objects + new_objects

    return objects
    # if len(objects) > 0:
    #     #return ' '.join(objects)
    #     return objects
    # else:
    #     return None

def find_spo(tree):
    noun_phrases_for_subject = find_branches(tree, label='NP', not_in_label='VP', ancestors=[])
    subject_list = []
    for noun_phrase_for_subject in noun_phrases_for_subject:
        subject = find_subject(noun_phrase_for_subject)
        #if subject is not None:
        #   subject_list.append(subject)
        subject_list = subject_list + subject

    verb_phrases = find_branches(tree, label='VP')
    predicate_list = []
    object_list = []
    for verb_phrase in verb_phrases:
        predicate = find_predicate(verb_phrase)
        if predicate is not None:
            predicate_list.append(predicate)
        object = find_object(verb_phrase)
        object_list = object_list + object
        #if object is not None:
        #    object_list.append(object)

    # dedupe list
    subject_list = list(dict.fromkeys(subject_list))
    predicate_list = list(dict.fromkeys(predicate_list))
    object_list = list(dict.fromkeys(object_list))

    return subject_list, predicate_list, object_list

2023-04-18 19:54:42 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 19:54:42 INFO: Using device: cpu
2023-04-18 19:54:42 INFO: Loading: tokenize
2023-04-18 19:54:42 INFO: Loading: pos
2023-04-18 19:54:43 INFO: Loading: constituency
2023-04-18 19:54:43 INFO: Done loading processors!


In [359]:
 def test_parser(str, valid_subject, valid_predicate, valid_object):

    #new_sentence = trunk_construction(str)
    #print(new_sentence)
    #assert new_sentence == valid_sentence

    doc = nlp(str)
    tree = doc.sentences[0].constituency

    subject_list, predicate_list, object_list = find_spo(tree)
    print(f"Subject = {' '.join(subject_list)}")
    print(f"Predicate = {' '.join(predicate_list)}")
    print(f"Object = {' '.join(object_list)}")

    print(f"{subject_list} = {valid_subject}")
    print(f"{predicate_list} = {valid_predicate}")
    print(f"{object_list} = {valid_object}")
    assert subject_list == valid_subject
    assert predicate_list == valid_predicate
    assert object_list == valid_object



## Parser Test Cases
Test the parser using some of the training data sentences as input and asserting the output sentence matches the algorithm defined in the paper.

In [360]:
test_parser(""""We have an incredible amount of work to do, but it is not in [designing new] instruction set architectures.""", [], ['have', 'do', 'is', 'designing'], ['amount', 'work', 'set', 'architectures'])
test_parser("""We have got an incredible amount of work to do, but it ain't in the instruction set," he said.""",  ['instruction', 'set'], ['have', 'got', 'do', 'said'], ['amount', 'work'])
test_parser('Syrian forces launch new attacks', ['forces'], ['launch'], ['attacks'])
test_parser("""the flat tire was replaced by the driver""", ['tire'], ['was', 'replaced'], ['driver'])
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
            ['Amrozi'], ['accused', 'called', 'distorting'], ['brother',  'witness', 'evidence'])
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
         ['Amrozi'], ['Referring', 'accused', 'distorting'], ['witness', 'brother', 'evidence'])
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            ['Shares', 'Genentech', 'company', 'products', 'market'], ['rose'], ['percent'])

test_parser("""Shares of Xoma fell 16 percent in early trade, while shares of Genentech, a much larger company with several products on the market, were up 2 percent.""", ['Shares', 'Xoma'], ['fell', 'were'], ['percent', 'trade', 'shares', 'Genentech', 'company', 'products', 'market'])

test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
              ['Gyorgy', 'Heizler', 'head', 'disaster', 'unit'], ['said', 'was', 'carrying'], ['coach', 'passengers'])
test_parser("""The head of the local disaster unit, Gyorgy Heizler, said the coach driver had failed to heed red stop lights.""",
            ['head', 'disaster', 'unit', 'Gyorgy', 'Heizler'], ['said', 'had', 'failed', 'heed'], ['coach', 'driver', 'stop', 'lights'])
# test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
#             "wife said was percent George Bush looked using years training war")
test_parser("""Sheena Young of Child, the national infertility support network, hoped the guidelines would lead to a more "fair and equitable" service for infertility sufferers""", ['Sheena', 'Young', 'Child', 'network'], ['hoped', 'lead'], ['guidelines', 'service', 'infertility', 'sufferers'])
test_parser("""Sheena Young, for Child, the national infertility support network, said the proposed guidelines should lead to a more "fair and equitable" service for infertility sufferers.""", ['Sheena', 'Young', 'Child', 'network'], ['said', 'lead'], ['guidelines', 'service', 'infertility', 'sufferers'])
#
# test_parser("""Among CNN viewers, 29 percent said they were Republicans and 36 percent called themselves conservatives.""",
#             "CNN viewers percent said were Republicans percent called conservatives")
# test_parser("""Out of Fox viewers, 41 percent describe themselves as Republicans, 24 percent as Democrats and 30 percent as Independents""",
#             "Fox viewers percent describe Republicans percent Democrats percent Independents")

# Note: stanza parser has a problem with the below sentence.  It is unable to parse it correctly
# test_parser("""Sheena Young, for Child, the national infertility support network, said the proposed guidelines should lead to a more "fair and equitable" service for infertility sufferers.""", "")
# test_parser("""Among Fox viewers, 41 percent describe themselves as Republicans, 24 percent as Democrats and 30 percent as Independents""", "Fox viewers percent describe Republicans percent Democrats percent Independents")

Subject = 
Predicate = have do is designing
Object = amount work set architectures
[] = []
['have', 'do', 'is', 'designing'] = ['have', 'do', 'is', 'designing']
['amount', 'work', 'set', 'architectures'] = ['amount', 'work', 'set', 'architectures']
Subject = instruction set
Predicate = have got do said
Object = amount work
['instruction', 'set'] = ['instruction', 'set']
['have', 'got', 'do', 'said'] = ['have', 'got', 'do', 'said']
['amount', 'work'] = ['amount', 'work']
Subject = forces
Predicate = launch
Object = attacks
['forces'] = ['forces']
['launch'] = ['launch']
['attacks'] = ['attacks']
Subject = tire
Predicate = was replaced
Object = driver
['tire'] = ['tire']
['was', 'replaced'] = ['was', 'replaced']
['driver'] = ['driver']
Subject = Amrozi
Predicate = accused called distorting
Object = brother witness evidence
['Amrozi'] = ['Amrozi']
['accused', 'called', 'distorting'] = ['accused', 'called', 'distorting']
['brother', 'witness', 'evidence'] = ['brother', 'witness', 'evidence

In [361]:
import spacy

spacy_nlp = spacy.load('en_core_web_sm')

def find_spacy_spo(doc):
    # Extract the subject, predicate, and object
    subject = []
    predicate = []
    obj = []

    for token in doc:
        #print(f"{token.dep_} : {token.text}")
        if "subj" in token.dep_:
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            subject .append(doc[start:end])
        elif "obj" in token.dep_ or "pcomp" in token.dep_:
            obj.append(token.text)
        elif "ROOT" in token.dep_ or "pred" in token.dep_:
            predicate.append(token.text)

    # Print the results
    # print("Subject: ", subject)
    # print("Predicate: ", predicate)
    # print("Object: ", obj)
    return subject, predicate, object

In [362]:
from enum import Enum
class Parsing(Enum):
    STANZA=1
    SPACY=2
    RAW=3

## Concurrency Parsing
Added support for concurrent parsing.  This can help in the performance of the preprocessing

In [363]:
class SentenceProcessingThread(threading.Thread):
    def __init__(self, sentences, output_list, begin, end, parsing_enum=Parsing.STANZA):
        super(SentenceProcessingThread, self).__init__()
        self.sentences = sentences
        self.parsing_enum = parsing_enum

        if parsing_enum == Parsing.STANZA:
            self.nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=True)
        else:
            self.nlp = spacy.load('en_core_web_sm')
        self.output_list = output_list
        self.begin = begin
        self.end = end

    def trunk_construction(self, str, parent_label = None):
        doc = self.nlp(str)
        tree = doc.sentences[0].constituency

        #words = construct_sentence(tree, parent_label)
        #return ' '.join(words)

        if self.parsing_enum == Parsing.SPACY:
            subjects, predicates, objects = find_spacy_spo(tree)
        else:
            subjects, predicates, objects = find_spo(tree)

        return f"{' '.join(subjects)},{' '.join(predicates)},{' '.join(objects)}"

    def run(self):
        print(f"going to process {self.begin} to {self.end}")
        for i, sentence in enumerate(self.sentences):
            new_sentence = self.trunk_construction(sentence)
            self.output_list[self.begin + i] = new_sentence

def process_sentences_concurrently(sentences, output, p=2):
    total = len(sentences)
    interval = int(total / p)
    threads = []
    for i in range(p):
        s = i*interval
        if i == p-1:
            e = total
        else:
            e = (i+1) * interval
        sentences_slice = sentences[s:e]
        sentence_thread = SentenceProcessingThread(sentences_slice, output, s, e)
        sentence_thread.start()
        threads.append(sentence_thread)

    for thread in threads:
        thread.join()

def preprocess_corpus(input_file='data/msr_paraphrase_train.txt', output_file='data/msr_paraphrase_train_stanza.txt', N=None, parsing_enum=Parsing.STANZA):
    print(output_file)
    if os.path.exists(output_file):
        print(f"{output_file} already exists")
        return

    starttime = datetime.datetime.now()
    df = read_file(input_file)

    if N is None:
        N = len(df.String1)

    output1 = [None] * N
    output2 = [None] * N

    # we can process with more threads if we only have CPU
    p = 8

    if torch.cuda.is_available():
        # if cuda is available we don't need that many threads
        # and if the number of threads is set too large using cuda
        # we can get out of memory exceptions
        p = 2
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String1[:N], output1, p)

    # try and be careful with gpu memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String2[:N], output2, p)

    endtime = datetime.datetime.now()

    print(f"time to process {N*2} sentences is {endtime - starttime}")

    stanza_df = df[:N]

    processed_string1 = pd.Series(output1)
    # processed_string1.apply(gensim.utils.simple_preprocess)
    processed_string2 = pd.Series(output2)
    #processed_string2.apply(gensim.utils.simple_preprocess)

    stanza_df.String1 = processed_string1
    stanza_df.String2 = processed_string2

    # write the file out.  This can help in the future
    print(f"about to write out {output_file}")
    stanza_df.to_csv(output_file, sep="\t")


## Sentence Preprocessing
pass the input sentences from the training dataset through the stanford/stanza parser, extracting the relevant parts of speech and then tokenize the processed sentences using the gensim.utils.simple_preprocess utility

# Word2Vec Embeddings
Take the preprocessed and tokenized sentences and use Word2Vec to get the word embeddings.  Take each word embedding in a sentence and find the mean which will represent the embedding for the sentence.

In [250]:
from gensim.models import Word2Vec

# Function is broken out for testing purposes
def generate_word2vec_model(corpus):
    # Creating the Word2Vec model
    model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)

    return model


In [364]:
# Function is broken out for testing purposes
def sentence_embeddings(w2v_model, sentence, size):
    np_embedding = np.zeros(size)
    for i, word in enumerate(sentence):
        #print(word)
        np_embedding[i] = w2v_model.wv.get_vector(word)

    return np_embedding
    # list = []
    # for word in sentence:
    #     list.append(w2v_model.wv.get_vector(word))
    #
    # word_matrix = np.row_stack(list)
    # #return np.mean(word_matrix, axis=0)
    # return word_matrix

In [365]:
def test_word2vec():

    df = read_file('data/msr_paraphrase_train.txt')

    sentences1 = df.String1[:5].apply(gensim.utils.simple_preprocess)
    sentences2 = df.String2[:5].apply(gensim.utils.simple_preprocess)

    corpus = pd.concat([sentences1, sentences2], ignore_index=True)

    max_sentence_len = corpus.apply(len).max()

    model = generate_word2vec_model(corpus)

    embedding = sentence_embeddings(model, corpus[0], (max_sentence_len, 50))
    assert embedding.shape == (max_sentence_len, 50)

test_word2vec()

In [366]:
def init_word2vec(train_input_file, test_input_file, parsing_enum=Parsing.STANZA):

    if parsing_enum == Parsing.RAW:
        train_output_file = train_input_file
        test_output_file = test_input_file
        train_df = read_file(train_output_file)
        test_df = read_file(test_output_file)

    elif parsing_enum == Parsing.STANZA:
        file_parts = os.path.splitext(train_input_file)
        train_output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
        print("About to preprocess spacy data")
        preprocess_corpus(input_file=train_input_file, output_file=train_output_file, parsing_enum=parsing_enum)
        print("Done preprocessing spacy data")

        file_parts = os.path.splitext(test_input_file)
        test_output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
        print("About to preprocess data")
        preprocess_corpus(input_file=test_input_file, output_file=test_output_file, parsing_enum=parsing_enum)
        print("Done preprocessing data")
        train_df = pd.read_csv(train_output_file, sep="\t")
        test_df = pd.read_csv(test_output_file, sep="\t")
    else:
        file_parts = os.path.splitext(train_input_file)
        train_output_file = f"{file_parts[0]}_spacy{file_parts[1]}"
        print("About to preprocess spacy data")
        preprocess_corpus(input_file=train_input_file, output_file=train_output_file, parsing_enum=parsing_enum)
        print("Done preprocessing spacy data")

        file_parts = os.path.splitext(test_input_file)
        test_output_file = f"{file_parts[0]}_spacy{file_parts[1]}"
        print("About to preprocess data")
        preprocess_corpus(input_file=test_input_file, output_file=test_output_file, parsing_enum=parsing_enum)
        print("Done preprocessing data")
        train_df = pd.read_csv(train_output_file, sep="\t")
        test_df = pd.read_csv(test_output_file, sep="\t")

    # train_df = read_file(train)
    # test_df = read_file(test)

    train_sentences1 = train_df.String1.apply(gensim.utils.simple_preprocess)
    train_sentences2 = train_df.String2.apply(gensim.utils.simple_preprocess)
    test_sentences1 = test_df.String1.apply(gensim.utils.simple_preprocess)
    test_sentences2 = test_df.String2.apply(gensim.utils.simple_preprocess)

    corpus = pd.concat([train_sentences1, train_sentences2, test_sentences1, test_sentences2], ignore_index=True)
    max_sentence_len = corpus.apply(len).max()

    word2vec = generate_word2vec_model(corpus)


    return word2vec, max_sentence_len


In [367]:
def corpus_embeddings(model, corpus, max_sentence_len):
    corpus_size = len(corpus)
    embeddings_list = []
    embedding_matrix = np.zeros((corpus_size, max_sentence_len, 50))
    for i, sentence in enumerate(corpus):
        embeddings = sentence_embeddings(model, sentence, size=(max_sentence_len, 50))
        embedding_matrix[i] = embeddings
        embeddings_list.append(embeddings)

    return embedding_matrix

In [369]:
from torch.utils.data import DataLoader, Dataset

# Dataset for the MSPC dataset
class MSPCDataset(Dataset):
    """
    Arguments:
        tsv_file (string): path to the tsv file with sentences to compare and associate quality score
        num_records (int): number of records to load.  Defaults to None which is all
    """
    def __init__(self, tsv_file, w2v_model, max_sentence_length, num_records=None, parsing_enum=Parsing.STANZA):

        self.max_sentence_len = max_sentence_length
        self.w2v_model = w2v_model

        if parsing_enum == Parsing.STANZA:
            file_parts = os.path.splitext(tsv_file)
            output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
            print("About to preprocess stanza data")
            preprocess_corpus(input_file=tsv_file, output_file=output_file, parsing_enum=parsing_enum)
            print("Done preprocessing stanza data")
            #df = read_file('data/msr_paraphrase_train.txt')
            df = pd.read_csv(output_file, sep="\t")
        elif parsing_enum == Parsing.SPACY:
            file_parts = os.path.splitext(tsv_file)
            output_file = f"{file_parts[0]}_spacy{file_parts[1]}"
            print("About to preprocess spacy data")
            preprocess_corpus(input_file=tsv_file, output_file=output_file, parsing_enum=parsing_enum)
            print("Done preprocessing spacy data")
            #df = read_file('data/msr_paraphrase_train.txt')
            df = pd.read_csv(output_file, sep="\t")
        else:
            df = read_file(tsv_file)

        if num_records is not None:
            processed_string1 = df[:num_records].String1
            processed_string2 = df[:num_records].String2
            self.quality = df[:num_records].Quality
        else:
            processed_string1 = df.String1
            processed_string2 = df.String2
            self.quality = df.Quality

        if parsing_enum == Parsing.RAW:
            processed_string1 = processed_string1.apply(gensim.parsing.preprocessing.remove_stopwords)
            processed_string2 = processed_string2.apply(gensim.parsing.preprocessing.remove_stopwords)
            processed_string1 = processed_string1.apply(lambda x: gensim.parsing.preprocessing.strip_short(x, minsize=3))
            processed_string2 = processed_string2.apply(lambda x: gensim.parsing.preprocessing.strip_short(x, minsize=3))

        processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
        processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)


        print(processed_string1)


        #corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)
        #self.max_sentence_len = corpus.apply(len).max()
        #w2v_model = generate_word2vec_model(corpus)

        sentence_embeddings1 = corpus_embeddings(self.w2v_model, processed_string1, max_sentence_len=self.max_sentence_len)
        sentence_embeddings2 = corpus_embeddings(self.w2v_model, processed_string2, max_sentence_len=self.max_sentence_len)

        #self.w2v_model = w2v_model
        self.sentences_embeddings1 = sentence_embeddings1
        self.sentences_embeddings2 = sentence_embeddings2

        # print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
        print(f"Number of sentences processed in the String1 column: {len(processed_string1)}")
        print(f"Number of sentences processed in the String2 column: {len(processed_string2)}")
        #print(self.sentences_embeddings1)

    def __len__(self):
        return len(self.sentences_embeddings1)

    def __getitem__(self, i):
        #return torch.FloatTensor(np.stack((self.sentences_embeddings1[i], self.sentences_embeddings2[i]))), self.quality[i]
        return torch.FloatTensor(self.sentences_embeddings1[i]), torch.FloatTensor(self.sentences_embeddings2[i]), self.quality[i]

    def get_max_sentence_length(self):
        return self.max_sentence_len

In [370]:
def test_dataset():
    word2vec, max_sentence_length = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt', parsing_enum=Parsing.STANZA)
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', word2vec, max_sentence_length, 10)
    assert len(dataset) == 10

    x1, y1, quality = dataset[0]
    assert x1.shape[0] == max_sentence_length
    assert x1.shape[1] == 50

test_dataset()

About to preprocess spacy data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing spacy data
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
About to preprocess stanza data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing stanza data
0    [amrozi, accused, called, distorting, brother,...
1    [yucaipa, owned, selling, dominick, chain, saf...
2    [had, published, offering, added, advertisemen...
3    [gmt, tab, shares, were, having, set, cents, h...
4    [stock, rose, close, percent, friday, stock, e...
5    [revenue, quarter, year, dropped, percent, per...
6        [nasdaq, had, closing, gain, percent, friday]
7          [dvd, cca, appealed, state, supreme, court]
8               [compared, cents, share, year, period]
9    [said, does, fit, business, company, growth, s...
Name: String

In [372]:
def test_dataset_raw_sentences():
    word2vec_raw, max_sentence_length_raw = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt', Parsing.RAW)
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', word2vec_raw, max_sentence_length_raw, 10, parsing_enum=Parsing.RAW)
    assert len(dataset) == 10

    x1, y1, quality = dataset[0]
    assert x1.shape[0] == max_sentence_length_raw
    assert x1.shape[1] == 50

test_dataset_raw_sentences()

0    [amrozi, accused, brother, called, the, witnes...
1    [yucaipa, owned, dominick, selling, chain, saf...
2    [they, published, advertisement, internet, jun...
3    [around, gmt, tab, shares, cents, having, earl...
4    [the, stock, rose, percent, close, friday, new...
5    [revenue, quarter, year, dropped, percent, per...
6    [the, nasdaq, weekly, gain, percent, closing, ...
7     [the, dvd, cca, appealed, state, supreme, court]
8    [that, compared, million, cents, share, year, ...
9    [said, foodservice, pie, business, doesn, fit,...
Name: String1, dtype: object
Number of sentences processed in the String1 column: 10
Number of sentences processed in the String2 column: 10


In [373]:
def test_dataset_spacy_sentences():
    word2vec_spacy, max_sentence_length_spacy = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt', parsing_enum=Parsing.SPACY)
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', word2vec_spacy, max_sentence_length_spacy, 10, parsing_enum=Parsing.SPACY)
    assert len(dataset) == 10

    x1, y1, quality = dataset[0]
    assert x1.shape[0] == max_sentence_length_spacy
    assert x1.shape[1] == 50

test_dataset_spacy_sentences()

2023-04-18 19:59:31 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 19:59:31 INFO: Using device: cuda
2023-04-18 19:59:31 INFO: Loading: tokenize


About to preprocess spacy data
data/msr_paraphrase_train_spacy.txt


2023-04-18 19:59:35 INFO: Loading: pos
2023-04-18 19:59:35 INFO: Loading: constituency
2023-04-18 19:59:36 INFO: Done loading processors!
2023-04-18 19:59:36 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 19:59:36 INFO: Using device: cuda
2023-04-18 19:59:36 INFO: Loading: tokenize
2023-04-18 19:59:36 INFO: Loading: pos


going to process 0 to 2038


2023-04-18 19:59:36 INFO: Loading: constituency
2023-04-18 19:59:37 INFO: Done loading processors!


going to process 2038 to 4076


2023-04-18 20:07:56 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 20:07:56 INFO: Using device: cuda
2023-04-18 20:07:56 INFO: Loading: tokenize
2023-04-18 20:07:56 INFO: Loading: pos
2023-04-18 20:07:57 INFO: Loading: constituency
2023-04-18 20:07:57 INFO: Done loading processors!
2023-04-18 20:07:57 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 20:07:57 INFO: Using device: cuda
2023-04-18 20:07:57 INFO: Loading: tokenize
2023-04-18 20:07:57 INFO: Loading: pos


going to process 0 to 2038


2023-04-18 20:07:58 INFO: Loading: constituency
2023-04-18 20:07:58 INFO: Done loading processors!


going to process 2038 to 4076


2023-04-18 20:16:23 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 20:16:23 INFO: Using device: cuda
2023-04-18 20:16:23 INFO: Loading: tokenize
2023-04-18 20:16:23 INFO: Loading: pos


time to process 8152 sentences is 0:16:51.955237
about to write out data/msr_paraphrase_train_spacy.txt
Done preprocessing spacy data
About to preprocess data
data/msr_paraphrase_test_spacy.txt


2023-04-18 20:16:23 INFO: Loading: constituency
2023-04-18 20:16:23 INFO: Done loading processors!
2023-04-18 20:16:23 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 20:16:23 INFO: Using device: cuda
2023-04-18 20:16:23 INFO: Loading: tokenize
2023-04-18 20:16:24 INFO: Loading: pos


going to process 0 to 862


2023-04-18 20:16:24 INFO: Loading: constituency
2023-04-18 20:16:25 INFO: Done loading processors!


going to process 862 to 1725


2023-04-18 20:20:16 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 20:20:16 INFO: Using device: cuda
2023-04-18 20:20:16 INFO: Loading: tokenize
2023-04-18 20:20:16 INFO: Loading: pos
2023-04-18 20:20:17 INFO: Loading: constituency
2023-04-18 20:20:17 INFO: Done loading processors!
2023-04-18 20:20:17 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-18 20:20:17 INFO: Using device: cuda
2023-04-18 20:20:17 INFO: Loading: tokenize
2023-04-18 20:20:17 INFO: Loading: pos


going to process 0 to 862


2023-04-18 20:20:18 INFO: Loading: constituency
2023-04-18 20:20:18 INFO: Done loading processors!


going to process 862 to 1725
time to process 3450 sentences is 0:07:27.730531
about to write out data/msr_paraphrase_test_spacy.txt
Done preprocessing data
About to preprocess spacy data
data/msr_paraphrase_train_spacy.txt
data/msr_paraphrase_train_spacy.txt already exists
Done preprocessing spacy data
0    [amrozi, accused, called, distorting, brother,...
1    [yucaipa, owned, selling, dominick, chain, saf...
2    [had, published, offering, added, advertisemen...
3    [gmt, tab, shares, were, having, set, cents, h...
4    [stock, rose, close, percent, friday, stock, e...
5    [revenue, quarter, year, dropped, percent, per...
6        [nasdaq, had, closing, gain, percent, friday]
7          [dvd, cca, appealed, state, supreme, court]
8               [compared, cents, share, year, period]
9    [said, does, fit, business, company, growth, s...
Name: String1, dtype: object
Number of sentences processed in the String1 column: 10
Number of sentences processed in the String2 column: 10


## Dataloaders
Create training and test dataloaders for sentences parsed with parts-of-speech parser

In [374]:
word2vec, max_sentence_length = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt')
train_dataset = MSPCDataset('data/msr_paraphrase_train.txt', word2vec, max_sentence_length)
test_dataset = MSPCDataset('data/msr_paraphrase_test.txt', word2vec, max_sentence_length)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

About to preprocess spacy data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing spacy data
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
About to preprocess stanza data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing stanza data
0       [amrozi, accused, called, distorting, brother,...
1       [yucaipa, owned, selling, dominick, chain, saf...
2       [had, published, offering, added, advertisemen...
3       [gmt, tab, shares, were, having, set, cents, h...
4       [stock, rose, close, percent, friday, stock, e...
                              ...                        
4071    [point, mr, brando, announced, put, continued,...
4072    [martin, be, freed, serving, today, thirds, se...
4073    [duisenberg, have, concluded, has, improved, l...
4074         [notification, was, reporte

In [375]:
word2vec_raw_sentences, max_sentence_length_raw_sentences = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt')
train_dataset_raw_sentences = MSPCDataset('data/msr_paraphrase_train.txt', word2vec_raw_sentences, max_sentence_length_raw_sentences)
test_dataset_raw_sentences = MSPCDataset('data/msr_paraphrase_test.txt', word2vec_raw_sentences, max_sentence_length_raw_sentences)

train_dataloader_raw_sentences = DataLoader(train_dataset_raw_sentences, batch_size=64, shuffle=False)
test_dataloader_raw_sentences = DataLoader(test_dataset_raw_sentences, batch_size=64, shuffle=False)

About to preprocess spacy data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing spacy data
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
About to preprocess stanza data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing stanza data
0       [amrozi, accused, called, distorting, brother,...
1       [yucaipa, owned, selling, dominick, chain, saf...
2       [had, published, offering, added, advertisemen...
3       [gmt, tab, shares, were, having, set, cents, h...
4       [stock, rose, close, percent, friday, stock, e...
                              ...                        
4071    [point, mr, brando, announced, put, continued,...
4072    [martin, be, freed, serving, today, thirds, se...
4073    [duisenberg, have, concluded, has, improved, l...
4074         [notification, was, reporte

In [376]:
word2vec_spacy_sentences, max_sentence_length_spacy_sentences = init_word2vec('data/msr_paraphrase_train.txt', 'data/msr_paraphrase_test.txt')
train_dataset_spacy_sentences = MSPCDataset('data/msr_paraphrase_train.txt', word2vec_spacy_sentences, max_sentence_length_spacy_sentences)
test_dataset_spacy_sentences = MSPCDataset('data/msr_paraphrase_test.txt', word2vec_spacy_sentences, max_sentence_length_spacy_sentences)

train_dataloader_spacy_sentences = DataLoader(train_dataset_spacy_sentences, batch_size=64, shuffle=False)
test_dataloader_spacy_sentences = DataLoader(test_dataset_spacy_sentences, batch_size=64, shuffle=False)

About to preprocess spacy data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing spacy data
About to preprocess data
data/msr_paraphrase_test_stanza.txt
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
About to preprocess stanza data
data/msr_paraphrase_train_stanza.txt
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing stanza data
0       [amrozi, accused, called, distorting, brother,...
1       [yucaipa, owned, selling, dominick, chain, saf...
2       [had, published, offering, added, advertisemen...
3       [gmt, tab, shares, were, having, set, cents, h...
4       [stock, rose, close, percent, friday, stock, e...
                              ...                        
4071    [point, mr, brando, announced, put, continued,...
4072    [martin, be, freed, serving, today, thirds, se...
4073    [duisenberg, have, concluded, has, improved, l...
4074         [notification, was, reporte

In [377]:
def conv_output_volume(W, F, S, P):

    """
    TODO: Given the input volume size $W$, the kernel/filter size $F$,
    the stride $S$, and the amount of zero padding $P$ used on the border,
    calculate the output volume size.
    Note the output should a integer.
    """

    # your code here
    #https://cs231n.github.io/convolutional-networks/

    return int((W-F+2*P)/S+1)

In [378]:
print(conv_output_volume(50, 3, 1, 1))

50


## Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determine by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [379]:
# #https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc
# """
# TODO: Remove hardcoded values and max it dynamic.
# """
# class DynamicKMaxPooling(nn.Module):
#     def __init__(self, k_init, conv_layers, layer):
#         super().__init__()
#         # "L is the total  number  of  convolutional  layers
#         # in  the  network;
#         # ktop is the fixed pooling parameter for the
#         # topmost  convolutional  layer"
#         self.k_init = k_init
#         self.conv_layers = conv_layers
#         self.layer = layer
#
#     def forward(self, X):
#         s = 50
#         dyn_k = ((self.conv_layers - self.layer) / self.conv_layers) * 3
#         k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
#         print(k_max)
#         out = F.max_pool1d(X, kernel_size=k_max)
#         return out

## Sentence Similarity Convolution Network (SSCN)

### Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determine by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [380]:
#https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc
"""
TODO: Remove hardcoded values and max it dynamic.
"""
class DynamicKMaxPooling(nn.Module):
    def __init__(self, k_init, conv_layers):
        super().__init__()
        # "L is the total  number  of  convolutional  layers
        # in  the  network;
        # ktop is the fixed pooling parameter for the
        # topmost  convolutional  layer"
        self.k_init = k_init
        self.conv_layers = conv_layers

    def pool(self, X, l):
        # s is sequence length
        # l is current layer in network
        s = X.shape[2]
        dyn_k = ((self.conv_layers - l) / self.conv_layers) * s
        k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
        return F.max_pool1d(X, kernel_size=k_max)

    def forward(self, X):
        for layer_i in range(self.conv_layers,0,-1):
            X = self.pool(X, layer_i)

        return X

Testing Dynamic K-Max Pooling Layer

In [381]:
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
OUTPUT_SIZE = 15

test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE))
dyn_k_layer = DynamicKMaxPooling(3, SAMPLE_SIZE)

# Call forward with convolution layer index [2,1]
out = dyn_k_layer(test_embedding)

assert out.shape[2] == 1
assert out.shape[1] == SAMPLE_SIZE
assert out.shape[0] == NUM_OF_SAMPLES

### Sentence Similarity

 \begin{align*} Man(\vec V_{x}, \vec V_{y})=&\left |{ x_{1}-y_{1} }\right |\! +\! \left |{ x_{2}-y_{2} }\right | \!+ \!\ldots \!+ \!\left |{ x_{n}-y_{n} }\right |
 \\ score=&e^{-Man(\vec V_{x}, \vec V_{y})},\quad score\in [{0,1}] \end{align*}

In [382]:
"""
* X: Pooled output of SSCN model of shape (sample_size, -1)
* For the purpose of this experiment sample_size = 2
"""
def manhattan_similarity_score(X):
    sample_count, _, M = X.shape
    Vx = X[:,0].reshape((sample_count,M))
    Vy = X[:,1].reshape((sample_count,M))
    mdist = torch.sum(torch.abs(Vx-Vy),dim=1).view(sample_count,-1)
    score = torch.exp(-1*mdist)
    return score

In [383]:
class KMaxPool1d(nn.Module):
    def __init__(self, k):
        super(KMaxPool1d, self).__init__()
        self.k = k

    def forward(self, x):
        # input shape (batch_size, num_channels, sequence_length)
        # output shape (batch_size num_channels, k)
        k_max_values, k_max_indices  = torch.topk(x, self.k, dim=2)
        return k_max_values

In [384]:
class DynamicKMaxPoolId(nn.Module):
    def __init__(self, k, l, L):
        super(DynamicKMaxPoolId, self).__init__()
        self.k = k
        self.l = l
        self.L = L

    def forward(self, x, sentence_length):
        ktop = max(self.k, int((self.L - self.l)/self.L * sentence_length))
        #print(f"ktop: {ktop}")
        k_max_values, k_max_indices = torch.topk(x, ktop, dim=2)
        return k_max_values


In [480]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentenceSimilarityCNN2(nn.Module):
    def __init__(self, embedding_dim, num_filters, filter_size, hidden_dim, dropout_prob=0.5):
        super(SentenceSimilarityCNN2, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_size, padding=1)
        self.conv2 = nn.Conv1d(in_channels= num_filters, out_channels=num_filters * 2, kernel_size=filter_size, padding=1)
        self.conv3 = nn.Conv1d(in_channels= num_filters * 2, out_channels=num_filters * 3, kernel_size=filter_size, padding=1)

        self.pool1 = DynamicKMaxPoolId(k=3, l=1, L=3)

        self.pool2 = DynamicKMaxPoolId(k=3, l=2, L=3)

        self.k = 3
        self.kmaxPool1d = KMaxPool1d(k=self.k)

        #self.fc1 = nn.Linear(self.k * num_filters , hidden_dim)
        self.fc1 = nn.Linear(self.k * num_filters * 3, hidden_dim)

    def forward(self, input1_embedded, input2_embedded):

        # input: input1_embedded is sentence1 and input2_embedding is sentence2
        # input shape: (batch_size, max_sentence_length, embedding_size)
        # output shap: (50) dimension vector that represents the sentence
        # output: similarity score

        # Find the sentence lengths
        sent_length1 = (torch.nonzero(input1_embedded).max(dim=0).values[1] + 1).item()
        sent_length2 = (torch.nonzero(input2_embedded).max(dim=0).values[1] + 1).item()
        #print(f"sent length1: {sent_length1} AND sent length2: {sent_length2}")

        # Convolution
        #print(input1_embedded.shape)
        # input shape (batch_size, max_sentence_length, embedding_size)
        # permuted shape (batch_size, embedding_size, max_sentence_length)
        # output shape (batch_size, out_channels, (max_sentence_length - kernel_size + 2*padding)/stride + 1
        # i.e. if max_sentence_length=19 and kernel_size=3 and padding=1 and stride=1 and out_channels=64
        #      output_shape (19-3+2*1)/1 + 1 = 19 => (64, 64, 19)
        input1_embedded = self.conv1(input1_embedded.permute(0, 2, 1))
        input2_embedded = self.conv1(input2_embedded.permute(0, 2, 1))
        #print(f"output of conv1: {input1_embedded.shape}")

        input1_embedded = torch.relu(input1_embedded)
        input2_embedded = torch.relu(input2_embedded)

        # pool1 is dynamic k-max pooling and is a function of the following formula
        # max(k-top, (L-l)/l * |s|) where L is total number of convolutional layers, l is the current convolution layer and |s| is sentence length k-top is the
        # k-top important features and serves as a lower bound... we will always try and find at least k-top features
        input1_embedded = self.pool1(input1_embedded, sent_length1)
        input2_embedded = self.pool1(input2_embedded, sent_length2)
        #print(f"output of pool1: {input1_embedded.shape}")

        input1_embedded = self.conv2(input1_embedded)
        input2_embedded = self.conv2(input2_embedded)
        #print(f"output of conv2: {input1_embedded.shape}")

        input1_embedded = torch.relu(input1_embedded)
        input2_embedded = torch.relu(input2_embedded)

        input1_embedded = self.pool2(input1_embedded, sent_length1)
        input2_embedded = self.pool2(input2_embedded, sent_length2)
        #print(f"output of pool2: {input1_embedded.shape}")

        input1_embedded = self.conv3(input1_embedded)
        input2_embedded = self.conv3(input2_embedded)
        #print(f"output of conv3: {input1_embedded.shape}")

        input1_embedded = torch.relu(input1_embedded)
        input2_embedded = torch.relu(input2_embedded)

        input1_embedded = self.kmaxPool1d(input1_embedded)
        input2_embedded = self.kmaxPool1d(input2_embedded)
        #print(f"output of k-max pool: {input1_embedded.shape}")

        input1_embedded = input1_embedded.view(input1_embedded.shape[0], input1_embedded.shape[1] * input1_embedded.shape[2])
        input2_embedded = input2_embedded.view(input2_embedded.shape[0], input2_embedded.shape[1] * input2_embedded.shape[2])

        #kmax_input1_embedded = self.kmaxPool1d(input1_embedded)
        #input1_embedded = F.max_pool1d(input1_embedded, input1_embedded.shape[2]).squeeze(2)
        #nput2_embedded = F.max_pool1d(input2_embedded, input2_embedded.shape[2]).squeeze(2)
        #print(f"output of max pool: {input1_embedded.shape}")
        #print(f"output of kmax pool: {kmax_input1_embedded.shape}")

        input1_embedded = self.fc1(input1_embedded)
        input2_embedded = self.fc1(input2_embedded)
        #print(input1_embedded.shape)

        man_dist = torch.sum(torch.abs(input1_embedded - input2_embedded), axis=1)
        # sentence1_mean = torch.mean(x1, axis=1)
        # sentence2_mean = torch.mean(x2, axis=1)
        # man_dist = torch.sum(torch.abs(sentence1_mean - sentence2_mean), axis=1)
        # print(man_dist.shape)

        return torch.exp(-man_dist)


        #input2_embedded = self.conv1(input2_embedded.permute(0, 2, 1))


        # Max pooling
        # input1_pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in input1_conv]
        # input2_pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in input2_conv]
        #
        # # Concatenate and flatten
        # input1_concat = torch.cat(input1_pooled, dim=1)
        # input2_concat = torch.cat(input2_pooled, dim=1)
        #
        # # Concatenate the two sentence representations
        # sentence_similarity = torch.cat([input1_concat, input2_concat], dim=1)
        #
        # # Dense layers
        # #sentence_similarity = self.dropout(F.relu(self.fc1(sentence_similarity)))
        # #sentence_similarity = self.fc2(sentence_similarity)
        #
        # return torch.sigmoid(sentence_similarity)

In [488]:
vocab_size = len(word2vec.wv)
embedding_dim = 50
num_filters = 64
filter_size = 3
hidden_dim = 50
dropout = 0.5
n_epochs=150

def train(train_loader, n_epochs=n_epochs):
    model = SentenceSimilarityCNN2(embedding_dim, num_filters, filter_size, hidden_dim)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
    model.train()

    for epoch in range(n_epochs):
        curr_epoch_loss = []
        for x1, x2, y in train_loader:
            #print(x1.shape)
            y_hat = model(x1, x2)
            loss = criterion(y_hat, y.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            curr_epoch_loss.append(loss.cpu().data.numpy())

        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")

    return model

# Train stanza sentences
start_time = datetime.datetime.now()
model = train(train_dataloader)
end_time = datetime.datetime.now()
print("Number of epochs: ", n_epochs)
print ("Training dataset size", len(train_dataset))
print("Training time: ", (end_time - start_time))

# Train the raw sentences
start_time = datetime.datetime.now()
model_raw_sentences = train(train_dataloader_raw_sentences)
end_time = datetime.datetime.now()
print("Number of epochs: ", n_epochs)
print ("Training dataset size", len(train_dataset))
print("Training time: ", (end_time - start_time))

# Train spacy sentences
start_time = datetime.datetime.now()
model_spacy_sentences = train(train_dataloader_spacy_sentences)
end_time = datetime.datetime.now()
print("Number of epochs: ", n_epochs)
print ("Training dataset size", len(train_dataset))
print("Training time: ", (end_time - start_time))

Epoch 0: curr_epoch_loss=0.22881031036376953
Epoch 1: curr_epoch_loss=0.2196347564458847
Epoch 2: curr_epoch_loss=0.21538543701171875
Epoch 3: curr_epoch_loss=0.21288608014583588
Epoch 4: curr_epoch_loss=0.21000337600708008
Epoch 5: curr_epoch_loss=0.2080463171005249
Epoch 6: curr_epoch_loss=0.20674332976341248
Epoch 7: curr_epoch_loss=0.2055101841688156
Epoch 8: curr_epoch_loss=0.20370227098464966
Epoch 9: curr_epoch_loss=0.20260974764823914
Epoch 10: curr_epoch_loss=0.20119041204452515
Epoch 11: curr_epoch_loss=0.19992920756340027
Epoch 12: curr_epoch_loss=0.19951796531677246
Epoch 13: curr_epoch_loss=0.19870170950889587
Epoch 14: curr_epoch_loss=0.19799281656742096
Epoch 15: curr_epoch_loss=0.19702181220054626
Epoch 16: curr_epoch_loss=0.1961609125137329
Epoch 17: curr_epoch_loss=0.1960534155368805
Epoch 18: curr_epoch_loss=0.19516965746879578
Epoch 19: curr_epoch_loss=0.1937691867351532
Epoch 20: curr_epoch_loss=0.19254228472709656
Epoch 21: curr_epoch_loss=0.191432923078537
Epoch 

In [487]:
from sklearn.metrics import accuracy_score

def eval_model(model, test_dataloader):
    model.eval()
    Y_pred = []
    Y = []
    for x1, x2, y in test_dataloader:
        y_hat = model(x1, x2)
        #print(y_hat)
        y_pred = torch.zeros(y_hat.shape)
        y_pred = (y_hat > 0.3).int()

        Y_pred = np.concatenate((Y_pred, y_pred), axis=0)
        Y = np.concatenate((Y, y), axis=0)

        #print(y_pred)
        #print(y)
    return Y_pred, Y

y_pred, y = eval_model(model, test_dataloader)
y_pred_raw, y_raw = eval_model(model_raw_sentences, test_dataloader_raw_sentences)
y_pred_spacy, y_spacy = eval_model(model_spacy_sentences, test_dataloader_spacy_sentences)
print("size of pos test corpus = ", len(test_dataset))
print("accuracy for pos test corpus = ", accuracy_score(y, y_pred))

print("size of raw test corpus = ", len(test_dataset_raw_sentences))
print("accuracy for raw test corpus = ", accuracy_score(y_raw, y_pred_raw))

print("size of spacy test corpus = ", len(test_dataset_spacy_sentences))
print("accuracy for spacy test corpus = ", accuracy_score(y_spacy, y_pred_spacy))

size of pos test corpus =  1725
accuracy for pos test corpus =  0.6811594202898551
size of raw test corpus =  1725
accuracy for raw test corpus =  0.6776811594202898
size of spacy test corpus =  1725
accuracy for spacy test corpus =  0.6730434782608695


 bn_Testing Similarity Scoring Function_

In [170]:
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
OUTPUT_SIZE = 6

test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE))
scores = manhattan_similarity_score(test_embedding)

assert scores.shape == (NUM_OF_SAMPLES, 1)


In [171]:
class SSCN(nn.Module):
    def __init__(self, sample_size, stride=1, kernel_size=3, padding=1):
        super().__init__()
        self.stride = stride
        self.kernel_size = kernel_size
        self.padding = padding
        self.conv_layers =sample_size

        #NN layers
        self.conv1 = nn.Conv1d(in_channels=self.conv_layers, out_channels=self.conv_layers, \
                               kernel_size=self.kernel_size, padding=self.padding)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=self.conv_layers, out_channels=self.conv_layers, \
                               kernel_size=self.kernel_size, padding=self.padding)
        self.relu2 = nn.ReLU()
        self.pool1 = DynamicKMaxPooling(self.kernel_size, self.conv_layers)

        self.sscn = nn.Sequential(self.conv1, self.relu1, self.conv2, self.relu2, self.pool1)

    """
    * X: Pooled output of SSCN model of shape (sample_size, -1)
    * For the purpose of this experiment sample_size = 2
    """
    def manhattan_similarity_score(self, X):
        score = manhattan_similarity_score(X)
        return score

    def forward(self, X):
        # X = self.conv1(X)
        # print(X.shape)
        # X = self.relu1(X)
        # print(X.shape)
        # X = self.conv1(X)
        # print(X.shape)
        # X = self.relu2(X)
        # print(X.shape)
        # X = self.pool1(X)
        # print(X.shape)
        # X = self.manhattan_similarity_score(X)
        # print(X.shape)
        X = self.manhattan_similarity_score(self.sscn(X))
        return X

__Testing:__

In [172]:
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
UNIQUE_FEATURES = 18

test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, UNIQUE_FEATURES))

model = SSCN(SAMPLE_SIZE)
# shape (batch,sample,sentence,word)?
print(model)

out = model(test_embedding)

assert out.shape[0] == NUM_OF_SAMPLES
assert out.shape[1] == 1
print(out.shape)


SSCN(
  (conv1): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu1): ReLU()
  (conv2): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool1): DynamicKMaxPooling()
  (sscn): Sequential(
    (0): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): DynamicKMaxPooling()
  )
)
torch.Size([20, 1])
