In [1]:
import datetime
import torch, stanza
from torch import nn
import numpy as np
import torch.nn.functional as F
import pandas as pd
import gensim


# Data Processing

This section can have stuff related to data prep.


Should the MSPC Dataset be a part of this section? - Adam

In [2]:
def read_file(file_name):
    # Note: Unable to use pd.read_csv... the function complained about an issue with the formatting of the tsv file
    # train = pd.read_csv('data/msr_paraphrase_train.txt', sep='\t', encoding='latin1')
    # train

    # opting to read file in and split columns manually to create a pandas dataframe
    list = []
    with open(file_name, encoding="utf8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            fields = line.split('\t')
            list.append(fields)

    df = pd.DataFrame(list[1:], columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])
    return df


# Model Definition

![Model Overview](./images/overview.png)


## Input Layer

In the input layer is made up of a Stanford Parser to provide a syntactic tree so that the model can extract significant words (mainly, subject, predicate, object) in the input corpus. Word2Vec is then used to map the words into vectors

In [3]:
# set 'download_method = None' to not download the resources over and over
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)

def trunk_construction(str, parent_label = None):
    doc = nlp(str)
    tree = doc.sentences[0].constituency

    words = construct_sentence(tree, parent_label)
    return ' '.join(words)

def construct_sentence(tree, parent_label = None, leave_pos=False):

    sentences = []
    if 'NN' in tree.label:
        if parent_label == 'NP':
            # sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    if 'VB' in tree.label:
        if parent_label == 'VP':
            #sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    for child in tree.children:
        sentences = sentences + construct_sentence(child, tree.label)

    return sentences

2023-04-04 12:20:32 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-04 12:20:32 INFO: Using device: cpu
2023-04-04 12:20:32 INFO: Loading: tokenize
2023-04-04 12:20:32 INFO: Loading: pos
2023-04-04 12:20:32 INFO: Loading: constituency
2023-04-04 12:20:33 INFO: Done loading processors!


In [4]:
def test_parser(str, valid_sentence):

    new_sentence = trunk_construction(str)
    #new_sentence = ' '.join(words)
    assert new_sentence == valid_sentence

## Parser Test Cases
Test the parser using some of the training data sentences as input and asserting the output sentence matches the algorithm defined in the paper.

In [77]:
test_parser('Syrian forces launch new attacks', "forces launch attacks")
test_parser("""the flat tire was replaced by the driver""","tire was replaced driver")
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
           "Amrozi accused brother called witness distorting evidence")
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            "Shares Genentech company products market rose percent")
test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
             "Gyorgy Heizler head disaster unit said coach was carrying passengers")
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
           "Referring witness Amrozi accused brother distorting evidence")
test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
            "wife said was percent George Bush looked using years training war")

## Sentence Preprocessing
pass the input sentences from the training dataset through the stanford/stanza parser, extracting the relevant parts of speech and then tokenize the processed sentences using the gensim.utils.simple_preprocess utility

In [6]:
# start_time = datetime.datetime.now()
#
# processed_string1 = df[:500].String1.apply(trunk_construction)
# processed_string2 = df[:500].String2.apply(trunk_construction)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with stanza library took {end_time - start_time}")
#
# start_time = datetime.datetime.now()
#
# processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
# processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
# print(f"Number of sentences processed in the String1 column: len(processed_string1")
# print(f"Number of sentences processed in the String2 column: len(processed_string2")

# Word2Vec Embeddings
Take the preprocessed and tokenized sentences and use Word2Vec to get the word embeddings.  Take each word embedding in a sentence and find the mean which will represent the embedding for the sentence.

In [7]:
# from gensim.models import Word2Vec
#
# corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)
#
# #model = Word2Vec(sentences=corpus, vector_size=(len(processed_string1) + len(processed_string2)), min_count=2)
# # set vector size to reduce computational complexity
# model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)
# #model.build_vocab(sentences=corpus)
# #model.train(corpus, total_examples=model.corpus_count, epochs=5)
#
# print(model)
# print(model.wv.key_to_index)
#
# model.wv.get_vector('president')
#
# # for index, word in enumerate(model.wv.index_to_key):
# #     if index == 120:
# #         break
# #     print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

In [14]:
from gensim.models import Word2Vec

# Function is broken out for testing purposes
def generate_word2vec_model(corpus):
    # Creating the Word2Vec model
    model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)

    return model


In [29]:
# Function is broken out for testing purposes
def sentence_embeddings(w2v_model, sentence):
    list = []
    for word in sentence:
        list.append(w2v_model.wv.get_vector(word))

    word_matrix = np.row_stack(list)
    return np.mean(word_matrix, axis=0)

In [61]:
def test_word2vec():

    df = read_file('data/msr_paraphrase_train.txt')

    sentences1 = df.String1[:5].apply(gensim.utils.simple_preprocess)
    sentences2 = df.String2[:5].apply(gensim.utils.simple_preprocess)

    corpus = pd.concat([sentences1, sentences2], ignore_index=True)

    model = generate_word2vec_model(corpus)

    # Not sure if this will always generate the same embedding
    test_embedding = np.array([-0.00113049, -0.00124808,  0.00252251,  0.00058141,  0.00187964,  0.00379025,
              -0.00012356,  0.00347055, -0.00241507,  0.00545258, -0.00574078, -0.00489824,
              -0.00224492,  0.00744946,  0.00350835, -0.00139295, -0.00081134,  0.00655962,
              0.00244374, -0.00447209, -0.00124291, -0.00092616,  0.0021044,  -0.00092541,
              0.00284307,  0.00367638,  0.00364716,  0.00519976, -0.00088121,  0.00109841,
              -0.00219322, -0.00372483,  0.00078702, -0.00612309, -0.00312131,  0.00088071,
              0.00503909, -0.0009484,  -0.00068209, -0.0004782,   0.00367015,  0.00314679,
              -0.00302592,  0.00346377,  0.00151145, -0.00076442, -0.0012528,  -0.00087095,
              -0.00075365,  0.00468711])

    embedding = sentence_embeddings(model, corpus[0])

    assert embedding.shape == (50,)

    # not sure if this will always be equal based on comment on test_embedding variable
    assert np.allclose(embedding, test_embedding)

test_word2vec()

In [66]:
def corpus_embeddings(model, corpus):
    embeddings_list = []
    for sentence in corpus:
        embeddings_list.append(sentence_embeddings(model, sentence))

    return np.row_stack(embeddings_list)

In [74]:
from torch.utils.data import DataLoader, Dataset

# Dataset for the MSPC dataset
class MSPCDataset(Dataset):
    """
    Arguments:
        tsv_file (string): path to the tsv file with sentences to compare and associate quality score
        num_records (int): number of records to load.  Defaults to None which is all
    """
    def __init__(self, tsv_file, num_records=None):
        df = read_file('data/msr_paraphrase_train.txt')

        start_time = datetime.datetime.now()

        if num_records is not None:
            processed_string1 = df[:num_records].String1.apply(trunk_construction)
            processed_string2 = df[:num_records].String2.apply(trunk_construction)
            self.quality = df[:num_records].Quality
        else:
            processed_string1 = df.String1.apply(trunk_construction)
            processed_string2 = df.String2.apply(trunk_construction)
            self.quality = df.Quality

        end_time = datetime.datetime.now()

        print (f"Processing 200 sentences with stanza library took {end_time - start_time}")

        start_time = datetime.datetime.now()

        processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
        processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)

        end_time = datetime.datetime.now()

        corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)

        w2v_model = generate_word2vec_model(corpus)

        sentence_embeddings1 = corpus_embeddings(w2v_model, processed_string1)
        sentence_embeddings2 = corpus_embeddings(w2v_model, processed_string2)



        self.sentences_embeddings1 = sentence_embeddings1
        self.sentences_embeddings2 = sentence_embeddings2

        print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
        print(f"Number of sentences processed in the String1 column: {len(processed_string1)}")
        print(f"Number of sentences processed in the String2 column: {len(processed_string2)}")
        #print(self.sentences_embeddings1)



    def __len__(self):
        return len(self.sentences_embeddings1)

    def __getitem__(self, i):
        return self.sentences_embeddings1[i], self.sentences_embeddings2[i], self.quality[i]

In [76]:
def test_dataset():
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', 10)
    assert len(dataset) == 10
    # print(dataset[0])
test_dataset()

Processing 200 sentences with stanza library took 0:00:06.489651
Processing 200 sentences with gensim.utils.simple_preprocess took 0:00:00.000997
Number of sentences processed in the String1 column: 10
Number of sentences processed in the String2 column: 10
(array([-0.00198056,  0.00452067, -0.00300886, -0.00332817,  0.0005654 ,
       -0.00571383, -0.00130734,  0.00222749, -0.00271983,  0.00259559,
       -0.00042913, -0.00666595, -0.00520164,  0.00599408,  0.00242181,
        0.00821459,  0.00473668, -0.0004835 ,  0.00094157, -0.00775254,
        0.00124578,  0.00033029, -0.00147425,  0.00130682,  0.00093901,
        0.00575981, -0.00139975,  0.00036692,  0.00320067, -0.00132761,
       -0.00065307, -0.00116841,  0.00637745, -0.00275588,  0.00029436,
        0.0025276 ,  0.00983984,  0.00815548, -0.00666484, -0.0018606 ,
        0.00654633, -0.00289515,  0.00369345,  0.00664935,  0.00236211,
       -0.00282075, -0.00704209, -0.0010638 , -0.00042502,  0.00569323],
      dtype=float32)

## Input Layer

In [55]:
class inputLayer(nn.Module):
    def __init__(self, nlp_pipeline):
        super().__init__()
        self.nlp = nlp_pipeline

    def construct_sentence(self, tree, parent_label = None, leave_pos=False):
        sentences = []
        if 'NN' in tree.label:
            if parent_label == 'NP':
                # sentences.append(tree)
                sentences = sentences + tree.leaf_labels()
        if 'VB' in tree.label:
            if parent_label == 'VP':
                #sentences.append(tree)
                sentences = sentences + tree.leaf_labels()
        for child in tree.children:
            sentences = sentences + construct_sentence(child, tree.label)
        return sentences

    """
    Here we take the output of the Stanford Parser and pass it to
    Word2Vec in order to create a vector.
    """
    def embed_vector(self, words):
        pass

    def forward(self, input_sentence, parent_label = None):
        doc = self.nlp(input_sentence)
        tree = doc.sentences[0].constituency
        words = construct_sentence(tree, parent_label)
        out = ' '.join(words)
        return out

__Testing:__

In [56]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)

def test_input_layer(str, valid_sentence):

    in_layer = inputLayer(nlp)
    new_sentence = in_layer(str)
    #new_sentence = ' '.join(words)
    assert new_sentence == valid_sentence

test_parser('Syrian forces launch new attacks', "forces launch attacks")
test_parser("""the flat tire was replaced by the driver""","tire was replaced driver")
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
           "Amrozi accused brother called witness distorting evidence")
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            "Shares Genentech company products market rose percent")
test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
             "Gyorgy Heizler head disaster unit said coach was carrying passengers")
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
           "Referring witness Amrozi accused brother distorting evidence")
test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
            "wife said was percent George Bush looked using years training war")

2023-04-03 13:51:34 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-03 13:51:34 INFO: Using device: cpu
2023-04-03 13:51:34 INFO: Loading: tokenize
2023-04-03 13:51:34 INFO: Loading: pos
2023-04-03 13:51:35 INFO: Loading: constituency
2023-04-03 13:51:36 INFO: Done loading processors!


## Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determine by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [57]:
#https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc

class DynamicKMaxPool(nn.Module):
    def __init__(self, k_top, L):
        super().__init__()
        # "L is the total  number  of  convolutional  layers
        # in  the  network;
        # ktop is the fixed pooling parameter for the
        # topmost  convolutional  layer"
        self.k_top = k_top
        self.L = L

    def forward(self, X, l):
        s = X.size(dim=2)
        dyn_k = ((self.L - l) / self.L) * s
        k_max = max(self.k_top, np.ceil(dyn_k))
        print(self.k_top, np.ceil(dyn_k))
        out = F.adaptive_avg_pool1d(X, k_max)
        return out

__Testing:__

In [58]:
X = torch.rand((3,3,3))
print(X)
dynMaxPool = DynamicKMaxPool(3,1)
print(dynMaxPool(X,1))

tensor([[[0.4922, 0.1095, 0.7784],
         [0.6618, 0.8248, 0.3079],
         [0.7897, 0.6921, 0.0027]],

        [[0.2509, 0.6090, 0.0118],
         [0.8836, 0.5662, 0.1784],
         [0.7990, 0.5158, 0.5854]],

        [[0.4951, 0.0020, 0.5179],
         [0.4915, 0.8771, 0.0695],
         [0.7113, 0.9584, 0.1733]]])
3 0.0
tensor([[[0.4922, 0.1095, 0.7784],
         [0.6618, 0.8248, 0.3079],
         [0.7897, 0.6921, 0.0027]],

        [[0.2509, 0.6090, 0.0118],
         [0.8836, 0.5662, 0.1784],
         [0.7990, 0.5158, 0.5854]],

        [[0.4951, 0.0020, 0.5179],
         [0.4915, 0.8771, 0.0695],
         [0.7113, 0.9584, 0.1733]]])


### Sentence Similarity

 \begin{align*} Man(\vec V_{x}, \vec V_{y})=&\left |{ x_{1}-y_{1} }\right |\! +\! \left |{ x_{2}-y_{2} }\right | \!+ \!\ldots \!+ \!\left |{ x_{n}-y_{n} }\right |
 \\ score=&e^{-Man(\vec V_{x}, \vec V_{y})},\quad score\in [{0,1}] \end{align*}