In [26]:
import torch, stanza
from torch import nn
import numpy as np
import torch.nn.functional as F

# Data Processing

This section can have stuff related to data prep.


Should the MSPC Dataset be a part of this section? - Adam

In [33]:
#Code here
pass

# Model Definition

![Model Overview](images/overview.png)


## Input Layer

In the input layer is made up of a Stanford Parser to provide a syntactic tree so that the model can extract significant words (mainly, subject, predicate, object) in the input corpus. Word2Vec is then used to map the words into vectors

In [32]:
    # set 'download_method = None' to not download the resources over and over
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)

def trunk_construction(str, parent_label = None):
    doc = nlp(str)
    tree = doc.sentences[0].constituency

    words = construct_sentence(tree, parent_label)
    return ' '.join(words)

def construct_sentence(tree, parent_label = None, leave_pos=False):

    sentences = []
    if 'NN' in tree.label:
        if parent_label == 'NP':
            # sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    if 'VB' in tree.label:
        if parent_label == 'VP':
            #sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    for child in tree.children:
        sentences = sentences + construct_sentence(child, tree.label)

    return sentences

IndentationError: expected an indented block (1778526601.py, line 5)

In [81]:
def test_parser(str, valid_sentence):

    new_sentence = trunk_construction(str)
    #new_sentence = ' '.join(words)
    assert new_sentence == valid_sentence

In [82]:
test_parser('Syrian forces launch new attacks', "forces launch attacks")
test_parser("""the flat tire was replaced by the driver""","tire was replaced driver")
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
           "Amrozi accused brother called witness distorting evidence")
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            "Shares Genentech company products market rose percent")
test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
             "Gyorgy Heizler head disaster unit said coach was carrying passengers")
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
           "Referring witness Amrozi accused brother distorting evidence")
test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
            "wife said was percent George Bush looked using years training war")


In [None]:
from torch.utils.data import DataLoader, Dataset

class MRPCDataset(Dataset):
    def __init__(self, string1, string2, quality):
        self.string1 = string1
        self.string2 = string2
        self.quality = quality

    def __len__(self):
        return len(self.string1)

    def __getitem__(self, index):
        return self.string1, self.string2, self.quality

In [83]:
import pandas as pd

def read_file(file_name):
    # Note: Unable to use pd.read_csv... the function complained about an issue with the formatting of the tsv file
    # train = pd.read_csv('data/msr_paraphrase_train.txt', sep='\t', encoding='latin1')
    # train

    # opting to read file in and split columns manually to create a pandas dataframe
    list = []
    with open(file_name, encoding="utf8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            fields = line.split('\t')
            list.append(fields)

    df = pd.DataFrame(list[1:], columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])
    return df

In [None]:
#def collate_fn()

In [158]:
import gensim
import datetime

df = read_file('data/msr_paraphrase_train.txt')
start_time = datetime.datetime.now()

processed_string1 = df[:500].String1.apply(trunk_construction)
processed_string2 = df[:500].String2.apply(trunk_construction)

end_time = datetime.datetime.now()

print (f"Processing 200 sentences with stanza library took {end_time - start_time}")

start_time = datetime.datetime.now()

processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)

end_time = datetime.datetime.now()

print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")


Processing 200 sentences with stanza library took 0:06:09.133081
Processing 200 sentences with gensim.utils.simple_preprocess took 0:00:00.010970


In [159]:
len(processed_string1)

500

In [160]:
processed_string2

0      [referring, witness, amrozi, accused, brother,...
1             [yucaipa, bought, dominick, sold, safeway]
2      [june, ship, owners, had, published, advertise...
3       [tab, shares, jumped, cents, set, closing, high]
4      [shares, jumped, percent, stock, exchange, fri...
                             ...                        
495    [was, bitten, back, scratched, leg, mother, said]
496    [says, release, tarot, card, left, shooting, s...
497    [bryant, has, said, hike, had, effect, demand,...
498                        [yankees, are, slump, season]
499                     [solomon, is, canada, resellers]
Name: String2, Length: 500, dtype: object

In [161]:
from gensim.models import Word2Vec

corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)

#model = Word2Vec(sentences=corpus, vector_size=(len(processed_string1) + len(processed_string2)), min_count=2)
# set vector size to reduce computational complexity
model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)
#model.build_vocab(sentences=corpus)
#model.train(corpus, total_examples=model.corpus_count, epochs=5)

In [162]:
print(model)
print(model.wv.key_to_index)

model.wv.most_similar('president')

# for index, word in enumerate(model.wv.index_to_key):
#     if index == 120:
#         break
#     print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

Word2Vec<vocab=2899, vector_size=50, alpha=0.025>


[('altria', 0.4458131492137909),
 ('masters', 0.42778995633125305),
 ('groups', 0.42183011770248413),
 ('says', 0.4047095477581024),
 ('platinum', 0.4028638005256653),
 ('sentence', 0.40011686086654663),
 ('helicopter', 0.38941821455955505),
 ('hockey', 0.3876991271972656),
 ('deputies', 0.38709747791290283),
 ('suv', 0.3853173851966858)]

In [462]:
# Testing the spacy library to extract spo
# This is only test code and should not be uncommented.

# import spacy
#
# def get_spacy_subject_phrase(doc):
#     for token in doc:
#         if ("subj" in token.dep_):
#             return token
#             # subtree = list(token.subtree)
#             # start = subtree[0].i
#             # end = subtree[-1].i + 1
#             # return doc[start:end]
#
# def get_spacy_predicate_phrase(doc):
#     for token in doc:
#         if ("ROOT" in token.dep_):
#             subtree = list(token.subtree)
#             start = subtree[0].i
#             end = subtree[-1].i + 1
#             return token
#             # return doc[start:end]
#
# def get_spacy_object_phrase(doc):
#     for token in doc:
#         if ("dobj" in token.dep_):
#             return token
#             # subtree = list(token.subtree)
#             # start = subtree[0].i
#             # end = subtree[-1].i + 1
#             # return doc[start:end]
#
# def spacy_find_spo(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     sentence = next(doc.sents)
#     previous = None
#     new_sentence = []
#     for word in sentence:
#         print(f"{word} : {word.dep_}")
#         add_word = None
#         if "subj" in word.dep_:
#             add_word = word
#         if "ROOT" in word.dep_:
#             add_word = word
#         if "pobj" in word.dep_:
#             add_word = word
#         if "dobj" in word.dep_:
#             add_word = word
#         # if "prep" in word.dep_:
#         #     add_word = word
#         if "ccomp" in word.dep_:
#             add_word = word
#         if "pcomp" in word.dep_:
#             add_word = word
#         if add_word is not None:
#             if previous is not None and "compound" in previous.dep_:
#                 new_sentence.append(previous)
#             new_sentence.append(add_word)
#         previous = word
#     return new_sentence
#
# def find_spacy_subject(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     for token in doc:
#         # Check if the token is a verb and has a subject
#         if token.dep_ == "ROOT":
#             for child in token.children:
#                 if child.dep_ == "nsubj":
#                     #subject = child.text
#                     predicate = ' '.join(child.text for child in token.children)
#                 break
#     return predicate
#
# def find_spacy_object(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     for token in doc:
#         # Check if the token is a verb and has a subject
#         if token.dep_ == "ROOT":
#             for child in token.children:
#                 if child.dep_ == "nsubj":
#                     obj = child.text
#                 break
#     return obj
#
# #print(spacy_find_spo("Syrian forces launch new attack"))
# #print(spacy_find_spo("Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent"))
# #print(spacy_find_spo("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers."""))
# #print(spacy_find_spo("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence."""))
# #print(spacy_find_spo("""Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war."""))
# print(spacy_find_spo("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war."""))

His : poss
wife : nsubj
said : ROOT
he : nsubj
was : ccomp
" : punct
100 : nummod
percent : npadvmod
behind : prep
George : compound
Bush : pobj
" : punct
and : cc
looked : conj
forward : advmod
to : prep
using : pcomp
his : poss
years : dobj
of : prep
training : pobj
in : prep
the : det
war : pobj
. : punct
[wife, said, he, was, George, Bush, using, years, training, war]


## Convolution Layers

The model used 2 pooling layers I think

In [None]:
#Code here
pass

## Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determine by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Implementation:__

In [31]:
class DynamicKMaxPool(nn.Module):
    def __init__(self, k_top, L):
        super().__init__()
        # "L is the total  number  of  convolutional  layers
        # in  the  network;
        # ktop is the fixed pooling parameter for the
        # topmost  convolutional  layer"
        self.k_top = k_top
        self.L = L

    def forward(self, X, l):
        s = X.size(dim=2)
        dyn_k = ((self.L - l) / self.L) * s
        k_max = max(self.k_top, np.ceil(dyn_k))
        print(self.k_top, np.ceil(dyn_k))
        out = F.adaptive_avg_pool1d(X, k_max)
        return out

__Testing:__

In [30]:
X = torch.rand((3,3,3))
print(X)
dynMaxPool = DynamicKMaxPool(3,1)
print(dynMaxPool(X,1))

tensor([[[0.4414, 0.1711, 0.1116],
         [0.3068, 0.9092, 0.3535],
         [0.2447, 0.4120, 0.1908]],

        [[0.9152, 0.8266, 0.0474],
         [0.0749, 0.3115, 0.2892],
         [0.0097, 0.6011, 0.7238]],

        [[0.9302, 0.6493, 0.8912],
         [0.0756, 0.8426, 0.9217],
         [0.9185, 0.0433, 0.9925]]])
3 0.0
tensor([[[0.4414, 0.1711, 0.1116],
         [0.3068, 0.9092, 0.3535],
         [0.2447, 0.4120, 0.1908]],

        [[0.9152, 0.8266, 0.0474],
         [0.0749, 0.3115, 0.2892],
         [0.0097, 0.6011, 0.7238]],

        [[0.9302, 0.6493, 0.8912],
         [0.0756, 0.8426, 0.9217],
         [0.9185, 0.0433, 0.9925]]])
