In [4]:
import datetime
import gensim
import numpy as np
import os.path
import pandas as pd
import torch, stanza
from torch import nn
import torch.nn.functional as F
import threading


# Data Processing

This section can have stuff related to data prep.


Should the MSPC Dataset be a part of this section? - Adam

In [5]:
def read_file(file_name):
    """Read a tab-separated paraphrase corpus file into a DataFrame.

    The file is split manually on tab characters rather than with
    ``pd.read_csv``; the original author noted that ``read_csv`` complained
    about the formatting of the TSV file.

    Args:
        file_name: Path of the UTF-8 encoded, tab-separated file. Its first
            non-blank line is treated as a header and discarded.

    Returns:
        A DataFrame of strings with the columns
        'Quality', 'ID1', 'ID2', 'String1', 'String2'.
    """
    rows = []
    with open(file_name, encoding="utf8") as f:
        for line in f:
            # Strip the line terminator first; otherwise it ends up glued to
            # the last field (String2) of every record.
            line = line.rstrip('\r\n')
            if not line:
                # A blank line (e.g. at the end of the file) is not a record.
                continue
            rows.append(line.split('\t'))

    # rows[0] is the header line of the corpus file.
    return pd.DataFrame(rows[1:], columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])


# Model Definition

![Model Overview](./images/overview.png)


## Input Layer

The input layer is made up of a Stanford Parser that provides a syntactic tree, so that the model can extract the significant words (mainly the subject, predicate, and object) from the input corpus. Word2Vec is then used to map the words into vectors.

In [6]:
# Module-level Stanza pipeline (tokenizer, POS tagger, constituency parser)
# used by trunk_construction() below.
# set 'download_method = None' to not download the resources over and over
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)

def trunk_construction(str, parent_label = None):
    """Reduce a sentence to its "trunk": the nouns and verbs kept by the parser rules.

    The text is parsed with the module-level ``nlp`` pipeline and the
    constituency tree of the first parsed sentence is filtered by
    ``construct_sentence``.

    Args:
        str: Raw sentence text. Only the first sentence found by the pipeline
            is used.
        parent_label: Label to treat as the parent of the tree root; passed
            straight through to ``construct_sentence``.

    Returns:
        The retained words joined by single spaces, or an empty string when
        the pipeline yields no sentence or no word is retained.
    """
    doc = nlp(str)
    # Guard against input for which the pipeline produces no sentences
    # (e.g. empty text); indexing [0] would raise IndexError.
    if not doc.sentences:
        return ''

    tree = doc.sentences[0].constituency
    return ' '.join(construct_sentence(tree, parent_label))

def construct_sentence(tree, parent_label = None, leave_pos=False):
    """Collect the noun and verb words of a constituency (sub)tree, in tree order.

    A node contributes its leaf labels when its label contains 'NN' and its
    parent is an 'NP', or when its label contains 'VB' and its parent is a
    'VP'. All children are then visited recursively with this node's label
    as their parent label.

    Args:
        tree: Node exposing ``label``, ``children`` and ``leaf_labels()``.
        parent_label: Label of the node's parent (``None`` for the root).
        leave_pos: Accepted for compatibility; not used by this function.

    Returns:
        List of the retained words.
    """
    label = tree.label
    keep_node = ('NN' in label and parent_label == 'NP') or \
                ('VB' in label and parent_label == 'VP')

    collected = []
    if keep_node:
        collected.extend(tree.leaf_labels())

    for subtree in tree.children:
        collected.extend(construct_sentence(subtree, label))

    return collected

2023-04-07 10:27:31 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-07 10:27:31 INFO: Using device: cpu
2023-04-07 10:27:31 INFO: Loading: tokenize
2023-04-07 10:27:31 INFO: Loading: pos
2023-04-07 10:27:32 INFO: Loading: constituency
2023-04-07 10:27:32 INFO: Done loading processors!


In [7]:
def test_parser(str, valid_sentence):
    """Assert that the trunk extracted from ``str`` equals ``valid_sentence``.

    Args:
        str: Raw input sentence handed to ``trunk_construction``.
        valid_sentence: Expected space-separated trunk of the sentence.
    """
    assert trunk_construction(str) == valid_sentence

## Parser Test Cases
Test the parser using some of the training data sentences as input and asserting the output sentence matches the algorithm defined in the paper.

In [8]:
# Each call parses the first argument and asserts that the extracted trunk
# equals the second argument; a mismatch raises AssertionError.
test_parser('Syrian forces launch new attacks', "forces launch attacks")
test_parser("""the flat tire was replaced by the driver""","tire was replaced driver")
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
           "Amrozi accused brother called witness distorting evidence")
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            "Shares Genentech company products market rose percent")
test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
             "Gyorgy Heizler head disaster unit said coach was carrying passengers")
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
           "Referring witness Amrozi accused brother distorting evidence")
test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
            "wife said was percent George Bush looked using years training war")

## Concurrency Parsing
Adds support for parsing sentences concurrently in several threads, which can improve the performance of the preprocessing step.

In [9]:
class SentenceProcessingThread(threading.Thread):
    """Worker thread that extracts sentence trunks for one slice of a corpus.

    Each thread builds its own Stanza pipeline and writes its results into a
    shared, pre-allocated output list at positions ``begin + i``.

    Args:
        sentences: Iterable of raw sentences handled by this thread.
        output_list: Shared list that receives the processed sentences.
        begin: Index in ``output_list`` of this thread's first result.
        end: End index of this thread's range (used for the log message only).
    """
    def __init__(self, sentences, output_list, begin, end):
        super().__init__()
        self.sentences = sentences
        self.nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=True)
        self.output_list = output_list
        self.begin = begin
        self.end = end

    def trunk_construction(self, str, parent_label = None):
        """Parse ``str`` with this thread's pipeline and return its trunk.

        Returns an empty string when the pipeline yields no sentence.
        """
        doc = self.nlp(str)
        # Guard against input for which the pipeline produces no sentences;
        # indexing [0] would raise IndexError and kill the thread.
        if not doc.sentences:
            return ''

        tree = doc.sentences[0].constituency
        return ' '.join(construct_sentence(tree, parent_label))

    def run(self):
        """Process every sentence of the slice and store the results."""
        print(f"going to process {self.begin} to {self.end}")
        for i, sentence in enumerate(self.sentences):
            # Use this thread's own pipeline (self.nlp) via the method, not
            # the module-level trunk_construction()/nlp shared by everyone.
            self.output_list[self.begin + i] = self.trunk_construction(sentence)

def process_sentences_concurrently(sentences, output, p=2):
    """Extract sentence trunks using up to ``p`` worker threads.

    The input is cut into ``p`` contiguous slices of ``len(sentences) // p``
    items (the last slice also takes the remainder). One
    ``SentenceProcessingThread`` is started per slice, and the call blocks
    until all threads have finished.

    Args:
        sentences: Sliceable sequence of raw sentences.
        output: Pre-allocated list with at least ``len(sentences)`` slots; the
            result for ``sentences[i]`` is written to ``output[i]``.
        p: Requested number of threads. Clamped to ``1..len(sentences)`` so
            that no worker (and no parser pipeline) is created without work
            and a non-positive value cannot cause a division by zero.
    """
    total = len(sentences)
    if total == 0:
        # Nothing to do; avoid building pipelines for empty slices.
        return

    p = max(1, min(p, total))
    interval = total // p

    threads = []
    for i in range(p):
        s = i * interval
        # The last worker also takes the remainder of the integer division.
        e = total if i == p - 1 else (i + 1) * interval
        sentence_thread = SentenceProcessingThread(sentences[s:e], output, s, e)
        sentence_thread.start()
        threads.append(sentence_thread)

    for thread in threads:
        thread.join()

def _normalize_sentences(sentences, index):
    """Run gensim's simple_preprocess over each sentence and re-join the tokens with spaces."""
    return pd.Series(sentences, index=index).apply(
        lambda sentence: ' '.join(gensim.utils.simple_preprocess(sentence))
    )


def preprocess_corpus(input_file='data/msr_paraphrase_train.txt', output_file='data/msr_paraphrase_train_stanza.txt', N=None):
    """Parse the corpus with Stanza and cache the reduced sentences as a TSV file.

    Does nothing (apart from printing a message) when ``output_file`` already
    exists. Otherwise both sentence columns of ``input_file`` are reduced to
    their trunks concurrently, normalised with
    ``gensim.utils.simple_preprocess`` and written to ``output_file``.

    Args:
        input_file: Path of the raw tab-separated corpus.
        output_file: Path of the cache file to create.
        N: Number of leading records to process. ``None`` (or a value larger
            than the corpus) processes every record.
    """
    if os.path.exists(output_file):
        print(f"{output_file} already exists")
        return

    starttime = datetime.datetime.now()
    df = read_file(input_file)

    # Clamp N to the corpus size; a larger N would leave None entries in the
    # output lists below.
    total = len(df.String1)
    if N is None or N > total:
        N = total

    output1 = [None] * N
    output2 = [None] * N

    # we can process with more threads if we only have CPU
    p = 8

    if torch.cuda.is_available():
        # if cuda is available we don't need that many threads
        # and if the number of threads is set too large using cuda
        # we can get out of memory exceptions
        p = 2
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String1[:N], output1, p)

    # try and be careful with gpu memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String2[:N], output2, p)

    endtime = datetime.datetime.now()

    print(f"time to process {N*2} sentences is {endtime - starttime}")

    # Work on a copy so the column assignments below do not write into a
    # slice of df (pandas SettingWithCopy).
    stanza_df = df[:N].copy()

    # Series.apply returns a new Series, so its result has to be assigned;
    # the tokens are re-joined with spaces so each cell stays a plain string
    # that survives the TSV round trip.
    stanza_df['String1'] = _normalize_sentences(output1, stanza_df.index)
    stanza_df['String2'] = _normalize_sentences(output2, stanza_df.index)

    # write the file out.  This can help in the future
    stanza_df.to_csv(output_file, sep="\t")


## Sentence Preprocessing
Pass the input sentences from the training dataset through the Stanford/Stanza parser to extract the relevant parts of speech, and then tokenize the processed sentences using the `gensim.utils.simple_preprocess` utility.

In [10]:
# start_time = datetime.datetime.now()
#
# processed_string1 = df[:500].String1.apply(trunk_construction)
# processed_string2 = df[:500].String2.apply(trunk_construction)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with stanza library took {end_time - start_time}")
#
# start_time = datetime.datetime.now()
#
# processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
# processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
# print(f"Number of sentences processed in the String1 column: len(processed_string1")
# print(f"Number of sentences processed in the String2 column: len(processed_string2")

# Word2Vec Embeddings
Take the preprocessed and tokenized sentences and use Word2Vec to get the word embeddings.  Take each word embedding in a sentence and find the mean which will represent the embedding for the sentence.

In [11]:
# from gensim.models import Word2Vec
#
# corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)
#
# #model = Word2Vec(sentences=corpus, vector_size=(len(processed_string1) + len(processed_string2)), min_count=2)
# # set vector size to reduce computational complexity
# model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)
# #model.build_vocab(sentences=corpus)
# #model.train(corpus, total_examples=model.corpus_count, epochs=5)
#
# print(model)
# print(model.wv.key_to_index)
#
# model.wv.get_vector('president')
#
# # for index, word in enumerate(model.wv.index_to_key):
# #     if index == 120:
# #         break
# #     print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

In [12]:
from gensim.models import Word2Vec

# Function is broken out for testing purposes
def generate_word2vec_model(corpus, vector_size=50, window=2, min_count=1, seed=1, workers=3):
    """Train a Word2Vec model on a tokenized corpus.

    Args:
        corpus: Iterable of sentences, each a list of word tokens.
        vector_size: Dimensionality of the word vectors (kept small to reduce
            computational complexity).
        window: Maximum distance between the current and the predicted word.
        min_count: Words with a lower total frequency are ignored.
        seed: Seed of the random number generator (gensim's default is 1).
        workers: Number of training threads (gensim's default is 3). Per the
            gensim documentation, a fully reproducible run needs
            ``workers=1``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    return Word2Vec(
        sentences=corpus,
        min_count=min_count,
        window=window,
        vector_size=vector_size,
        seed=seed,
        workers=workers,
    )


In [13]:
# Function is broken out for testing purposes
def sentence_embeddings(w2v_model, sentence):
    """Embed a tokenized sentence as the mean of its word vectors.

    Args:
        w2v_model: Trained model exposing ``wv.get_vector(word)`` and
            ``wv.vector_size``.
        sentence: Iterable of word tokens. Every token must be known to the
            model; ``get_vector`` raises for unknown words.

    Returns:
        1-D array of length ``wv.vector_size``. A sentence without tokens
        yields the zero vector instead of raising.
    """
    vectors = [w2v_model.wv.get_vector(word) for word in sentence]

    if not vectors:
        # No tokens to average: stacking an empty list would raise ValueError.
        # float32 is assumed to match the model's vector dtype - TODO confirm.
        return np.zeros(w2v_model.wv.vector_size, dtype=np.float32)

    # np.vstack replaces np.row_stack, a plain alias deprecated in NumPy 2.0.
    return np.mean(np.vstack(vectors), axis=0)

In [14]:
def test_word2vec():
    """Train Word2Vec on the first five sentence pairs and check one sentence embedding.

    The first sentence's embedding must have 50 dimensions and match a
    recorded reference vector.
    """
    frame = read_file('data/msr_paraphrase_train.txt')

    tokenized_columns = [
        frame[column][:5].apply(gensim.utils.simple_preprocess)
        for column in ('String1', 'String2')
    ]
    corpus = pd.concat(tokenized_columns, ignore_index=True)

    w2v = generate_word2vec_model(corpus)

    # Reference embedding recorded from an earlier run.
    # NOTE(review): the original author was unsure whether training always
    # reproduces these values, so the comparison below may be brittle.
    expected = np.array([-0.00113049, -0.00124808,  0.00252251,  0.00058141,  0.00187964,  0.00379025,
              -0.00012356,  0.00347055, -0.00241507,  0.00545258, -0.00574078, -0.00489824,
              -0.00224492,  0.00744946,  0.00350835, -0.00139295, -0.00081134,  0.00655962,
              0.00244374, -0.00447209, -0.00124291, -0.00092616,  0.0021044,  -0.00092541,
              0.00284307,  0.00367638,  0.00364716,  0.00519976, -0.00088121,  0.00109841,
              -0.00219322, -0.00372483,  0.00078702, -0.00612309, -0.00312131,  0.00088071,
              0.00503909, -0.0009484,  -0.00068209, -0.0004782,   0.00367015,  0.00314679,
              -0.00302592,  0.00346377,  0.00151145, -0.00076442, -0.0012528,  -0.00087095,
              -0.00075365,  0.00468711])

    actual = sentence_embeddings(w2v, corpus[0])

    assert actual.shape == (50,)
    assert np.allclose(actual, expected)

test_word2vec()

In [15]:
def corpus_embeddings(model, corpus):
    """Embed every sentence of a corpus and stack the results row-wise.

    Args:
        model: Trained model accepted by ``sentence_embeddings``; its
            ``wv.vector_size`` gives the width of an empty result.
        corpus: Iterable of tokenized sentences.

    Returns:
        2-D array with one row per sentence. An empty corpus yields an array
        of shape ``(0, vector_size)`` instead of raising.
    """
    embeddings_list = [sentence_embeddings(model, sentence) for sentence in corpus]

    if not embeddings_list:
        # Stacking an empty list would raise ValueError.
        # float32 is assumed to match the model's vector dtype - TODO confirm.
        return np.empty((0, model.wv.vector_size), dtype=np.float32)

    # np.vstack replaces np.row_stack, a plain alias deprecated in NumPy 2.0.
    return np.vstack(embeddings_list)

In [16]:
from torch.utils.data import DataLoader, Dataset

# Dataset for the MSPC dataset
class MSPCDataset(Dataset):
    """
    Dataset of sentence-pair embeddings built from a paraphrase corpus.

    The raw file is first reduced with ``preprocess_corpus`` (cached next to
    the input as ``<name>_stanza<ext>``). The cached sentences are then
    tokenized, a Word2Vec model is trained on them, and every sentence is
    embedded as the mean of its word vectors.

    Arguments:
        tsv_file (string): path to the tsv file with sentences to compare and associate quality score
        num_records (int): number of records to load.  Defaults to None which is all
    """
    def __init__(self, tsv_file, num_records=None):

        file_parts = os.path.splitext(tsv_file)
        output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
        preprocess_corpus(input_file=tsv_file, output_file=output_file)

        df = pd.read_csv(output_file, sep="\t")
        if num_records is not None:
            df = df[:num_records]

        # The cache stores each sentence as one string. Word2Vec and the
        # embedding helpers expect lists of word tokens; passing raw strings
        # would make them operate on single characters.
        processed_string1 = self._tokenize(df.String1)
        processed_string2 = self._tokenize(df.String2)
        self.quality = df.Quality

        corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)

        w2v_model = generate_word2vec_model(corpus)

        self.sentences_embeddings1 = self._embed(w2v_model, processed_string1)
        self.sentences_embeddings2 = self._embed(w2v_model, processed_string2)

        print(f"Number of sentences processed in the String1 column: {len(processed_string1)}")
        print(f"Number of sentences processed in the String2 column: {len(processed_string2)}")

    @staticmethod
    def _tokenize(column):
        """Turn a column of sentence strings into a Series of token lists."""
        # read_csv loads empty cells as NaN; treat them as empty sentences.
        return column.fillna('').astype(str).apply(gensim.utils.simple_preprocess)

    @staticmethod
    def _embed(w2v_model, token_lists):
        """Embed each token list; sentences without tokens get a zero vector."""
        # float32 is assumed to match the model's vector dtype - TODO confirm.
        matrix = np.zeros((len(token_lists), w2v_model.wv.vector_size), dtype=np.float32)
        has_tokens = token_lists.apply(len).to_numpy() > 0
        if has_tokens.any():
            matrix[has_tokens] = corpus_embeddings(w2v_model, token_lists[has_tokens])
        return matrix

    def __len__(self):
        return len(self.sentences_embeddings1)

    def __getitem__(self, i):
        # Positional lookup, so the label does not depend on the Series index.
        return self.sentences_embeddings1[i], self.sentences_embeddings2[i], self.quality.iloc[i]

In [27]:
def test_dataset():
    """Build the dataset from the first ten training records and check its length."""
    expected_records = 10
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', expected_records)
    print(len(dataset))
    assert len(dataset) == expected_records
test_dataset()

data/msr_paraphrase_train_stanza.txt already exists
Number of sentences processed in the String1 column: 10
Number of sentences processed in the String2 column: 10
10


In [28]:
def conv_output_volume(W, F, S, P):
    """
    Compute the output size of a convolution along one dimension.

    Implements ``floor((W - F + 2P) / S) + 1``
    (see https://cs231n.github.io/convolutional-networks/).

    Args:
        W: Input volume size.
        F: Kernel/filter size.
        S: Stride.
        P: Amount of zero padding used on each border.

    Returns:
        The output volume size as an integer.
    """
    # Floor division keeps integer inputs exact; going through float
    # division can lose precision for very large values.
    return int((W - F + 2 * P) // S + 1)

In [29]:
print(conv_output_volume(50, 3, 1, 1))

50


## Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determined by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [92]:
#https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8d
"""
TODO: Remove hardcoded values and max it dynamic.
"""
class DynamicKMaxPooling(nn.Module):
    """Pooling layer whose window size depends on the layer's depth.

    Args:
        k_init: Fixed pooling parameter of the topmost convolutional layer
            (``k_top``); a lower bound for the computed ``k``.
        conv_layers: Total number of convolutional layers in the network (``L``).
        layer: Index of the convolutional layer this pooling follows (``l``).
    """
    def __init__(self, k_init, conv_layers, layer):
        super().__init__()
        # "L is the total  number  of  convolutional  layers
        # in  the  network;
        # ktop is the fixed pooling parameter for the
        # topmost  convolutional  layer"
        self.k_init = k_init
        self.conv_layers = conv_layers
        self.layer = layer

    def forward(self, X):
        """Max-pool ``X`` along its last dimension with a window of ``k``.

        ``k = max(k_init, ceil((L - l) / L * 3))``.
        """
        # NOTE(review): the factor 3 presumably stands in for the sentence
        # length |s| of the dynamic k formula - confirm before making it
        # depend on X.
        dyn_k = ((self.conv_layers - self.layer) / self.conv_layers) * 3
        k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
        # NOTE(review): this is ordinary max pooling with window k, not a
        # selection of the k largest activations - confirm this is intended.
        return F.max_pool1d(X, kernel_size=k_max)

## Sentence Similarity Convolution Network (SSCN)

In [101]:
class SSCN(nn.Module):
    """Sentence Similarity Convolution Network.

    Two single-channel 1-D convolutions, each followed by a ReLU, and then
    two ``DynamicKMaxPooling`` layers, applied in sequence.

    Args:
        max_words: Length of the input along the convolved dimension (stored
            for reference).
        stride: Stride of both convolutions.
        kernel_size: Kernel size of both convolutions; also used as the
            ``k_init`` of the pooling layers.
        padding: Zero padding of both convolutions.
    """
    def __init__(self, max_words, stride, kernel_size=3, padding=1):
        super().__init__()
        self.stride = stride
        self.kernel_size = kernel_size
        self.padding = padding
        self.max_words = max_words

        # The stride argument is handed to the convolutions; previously it
        # was stored but every convolution silently used the default stride.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=self.kernel_size,
                               stride=self.stride, padding=self.padding)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=self.kernel_size,
                               stride=self.stride, padding=self.padding)
        self.relu2 = nn.ReLU()
        self.pool1 = DynamicKMaxPooling(self.kernel_size, 2, 1)
        self.pool2 = DynamicKMaxPooling(self.kernel_size, 2, 2)

        self.sscn = nn.Sequential(self.conv1, self.relu1, self.conv2, self.relu2, self.pool1, self.pool2)

    def forward(self, X):
        """Run ``X`` through conv1 -> relu -> conv2 -> relu -> pool1 -> pool2."""
        return self.sscn(X)

__Testing:__

In [103]:
# Smoke test: push one 50-dimensional embedding, reshaped to (1, 50),
# through a freshly initialised SSCN and print the shape of the output.
model = SSCN(50,1)

test_embedding = torch.FloatTensor([-0.00113049, -0.00124808,  0.00252251,  0.00058141,  0.00187964,  0.00379025,
              -0.00012356,  0.00347055, -0.00241507,  0.00545258, -0.00574078, -0.00489824,
              -0.00224492,  0.00744946,  0.00350835, -0.00139295, -0.00081134,  0.00655962,
              0.00244374, -0.00447209, -0.00124291, -0.00092616,  0.0021044,  -0.00092541,
              0.00284307,  0.00367638,  0.00364716,  0.00519976, -0.00088121,  0.00109841,
              -0.00219322, -0.00372483,  0.00078702, -0.00612309, -0.00312131,  0.00088071,
              0.00503909, -0.0009484,  -0.00068209, -0.0004782,   0.00367015,  0.00314679,
              -0.00302592,  0.00346377,  0.00151145, -0.00076442, -0.0012528,  -0.00087095,
              -0.00075365,  0.00468711]).reshape(1,50)

out = model(test_embedding)

print(out.shape)

********** X **********
torch.Size([1, 50])
tensor([[-0.0011, -0.0012,  0.0025,  0.0006,  0.0019,  0.0038, -0.0001,  0.0035,
         -0.0024,  0.0055, -0.0057, -0.0049, -0.0022,  0.0074,  0.0035, -0.0014,
         -0.0008,  0.0066,  0.0024, -0.0045, -0.0012, -0.0009,  0.0021, -0.0009,
          0.0028,  0.0037,  0.0036,  0.0052, -0.0009,  0.0011, -0.0022, -0.0037,
          0.0008, -0.0061, -0.0031,  0.0009,  0.0050, -0.0009, -0.0007, -0.0005,
          0.0037,  0.0031, -0.0030,  0.0035,  0.0015, -0.0008, -0.0013, -0.0009,
         -0.0008,  0.0047]])
********** conv1 **********
torch.Size([1, 50])
tensor([[0.3850, 0.3860, 0.3867, 0.3871, 0.3879, 0.3875, 0.3876, 0.3862, 0.3873,
         0.3855, 0.3829, 0.3823, 0.3867, 0.3893, 0.3879, 0.3856, 0.3875, 0.3889,
         0.3863, 0.3841, 0.3843, 0.3860, 0.3861, 0.3868, 0.3879, 0.3889, 0.3895,
         0.3880, 0.3867, 0.3853, 0.3839, 0.3843, 0.3834, 0.3826, 0.3839, 0.3873,
         0.3875, 0.3861, 0.3852, 0.3868, 0.3881, 0.3865, 0.3864, 0.38

### Sentence Similarity

 \begin{align*} Man(\vec V_{x}, \vec V_{y})=&\left |{ x_{1}-y_{1} }\right |\! +\! \left |{ x_{2}-y_{2} }\right | \!+ \!\ldots \!+ \!\left |{ x_{n}-y_{n} }\right |
 \\ score=&e^{-Man(\vec V_{x}, \vec V_{y})},\quad score\in [{0,1}] \end{align*}

In [None]:
from sklearn.metrics.pairwise import manhattan_distances

def ManhattanSimilarity(S,T):
    
