In [80]:
import stanza

# Build the English stanza pipeline once and share it across the notebook.
# download_method=None stops stanza from downloading the model resources
# again every time this cell is re-run.
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)

def trunk_construction(str, parent_label = None):
    """Reduce a text to its "trunk" and return it as a space-joined string.

    The text is parsed with the module-level ``nlp`` pipeline and each
    sentence's constituency tree is filtered by ``construct_sentence``.

    Args:
        str: The raw text to parse. (The name shadows the builtin but is kept
            so existing keyword callers keep working.)
        parent_label: Label treated as the parent of each sentence's root
            node; forwarded unchanged to ``construct_sentence``.

    Returns:
        The kept words joined by single spaces, or ``''`` when the parser
        produces no sentences (e.g. for empty input).
    """
    doc = nlp(str)

    words = []
    # Walk every parsed sentence rather than only the first one, so that
    # multi-sentence inputs are not silently truncated and an input that
    # yields no sentences does not raise IndexError.
    for sentence in doc.sentences:
        words.extend(construct_sentence(sentence.constituency, parent_label))
    return ' '.join(words)

def construct_sentence(tree, parent_label = None, leave_pos=False):
    """Collect the words under noun/verb nodes of a constituency tree.

    A node contributes its leaf labels when its label contains 'NN' and its
    parent is labelled 'NP', or when its label contains 'VB' and its parent
    is labelled 'VP'. All children are then visited recursively, left to
    right, with this node's label as their parent label.

    Args:
        tree: A node exposing ``label``, ``children`` and ``leaf_labels()``.
        parent_label: Label of the node's parent, or None for the root.
        leave_pos: Accepted but not used by this function.

    Returns:
        A list of the collected leaf labels in traversal order.
    """
    label = tree.label
    keep_node = ('NN' in label and parent_label == 'NP') or \
                ('VB' in label and parent_label == 'VP')

    collected = []
    if keep_node:
        collected.extend(tree.leaf_labels())

    # Descend regardless of whether this node matched.
    for subtree in tree.children:
        collected.extend(construct_sentence(subtree, label))
    return collected

2023-04-01 20:29:12 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-01 20:29:12 INFO: Using device: cpu
2023-04-01 20:29:12 INFO: Loading: tokenize
2023-04-01 20:29:12 INFO: Loading: pos
2023-04-01 20:29:12 INFO: Loading: constituency
2023-04-01 20:29:12 INFO: Done loading processors!


In [81]:
def test_parser(str, valid_sentence):
    """Assert that ``trunk_construction(str)`` equals ``valid_sentence``.

    Args:
        str: The raw sentence passed to ``trunk_construction``.
        valid_sentence: The expected space-joined result.

    Raises:
        AssertionError: If the actual and expected strings differ.
    """
    assert trunk_construction(str) == valid_sentence

In [82]:
# Regression cases for trunk_construction: (input sentence, expected trunk).
parser_cases = [
    ('Syrian forces launch new attacks',
     "forces launch attacks"),
    ("""the flat tire was replaced by the driver""",
     "tire was replaced driver"),
    ("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
     "Amrozi accused brother called witness distorting evidence"),
    ("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
     "Shares Genentech company products market rose percent"),
    ("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
     "Gyorgy Heizler head disaster unit said coach was carrying passengers"),
    ("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
     "Referring witness Amrozi accused brother distorting evidence"),
    ("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
     "wife said was percent George Bush looked using years training war"),
]

# Run the cases in order; the first mismatch raises AssertionError.
for raw_sentence, expected_trunk in parser_cases:
    test_parser(raw_sentence, expected_trunk)


In [None]:
from torch.utils.data import DataLoader, Dataset

class MRPCDataset(Dataset):
    """Dataset of (string1, string2, quality) triples.

    The three inputs are stored as given and are indexed in parallel, so
    they are assumed to have the same length and to support integer
    indexing (e.g. lists) -- TODO confirm against the caller.
    """

    def __init__(self, string1, string2, quality):
        """Store the three parallel columns.

        Args:
            string1: First sentence of each pair.
            string2: Second sentence of each pair.
            quality: Label of each pair.
        """
        self.string1 = string1
        self.string2 = string2
        self.quality = quality

    def __len__(self):
        """Return the number of pairs, taken from the length of ``string1``."""
        return len(self.string1)

    def __getitem__(self, index):
        """Return the ``index``-th ``(string1, string2, quality)`` triple."""
        # Select the single requested sample; returning the whole columns
        # would hand every caller the full dataset for each index.
        return self.string1[index], self.string2[index], self.quality[index]

In [83]:
import pandas as pd

def read_file(file_name):
    """Read a tab-separated paraphrase file into a pandas DataFrame.

    The first line of the file is treated as a header and discarded; every
    remaining non-blank line is split on tabs into one row.

    Args:
        file_name: Path of the UTF-8 encoded, tab-separated text file.

    Returns:
        A DataFrame with the columns 'Quality', 'ID1', 'ID2', 'String1' and
        'String2'. All values are strings.
    """
    # Note: pd.read_csv was not usable here -- it complained about the
    # formatting of the tsv file -- so the columns are split manually.
    # NOTE(review): possibly caused by unbalanced quote characters in the
    # sentences; read_csv with quoting disabled may work -- unverified.
    rows = []
    with open(file_name, encoding="utf8") as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            # Drop the line terminator so it does not end up inside the
            # last field (String2).
            line = line.rstrip('\r\n')
            if not line:
                continue  # ignore blank lines instead of creating empty rows
            rows.append(line.split('\t'))

    return pd.DataFrame(rows, columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])

In [None]:
#def collate_fn()

In [104]:
import gensim

df = read_file('data/msr_paraphrase_train.txt')

# For the first 10 rows of each sentence column: reduce the sentence with
# trunk_construction, then tokenise the result with gensim's simple_preprocess.
processed_string1 = (
    df["String1"].iloc[:10]
    .apply(trunk_construction)
    .apply(gensim.utils.simple_preprocess)
)
processed_string2 = (
    df["String2"].iloc[:10]
    .apply(trunk_construction)
    .apply(gensim.utils.simple_preprocess)
)

In [105]:
# Sanity check: number of processed String1 rows.
len(processed_string1)

10

In [106]:
# Inspect the processed String2 rows.
processed_string2

0    [referring, witness, amrozi, accused, brother,...
1           [yucaipa, bought, dominick, sold, safeway]
2    [june, ship, owners, had, published, advertise...
3     [tab, shares, jumped, cents, set, closing, high]
4    [shares, jumped, percent, stock, exchange, fri...
5    [scandal, hanging, stewart, company, revenue, ...
6                     [ixic, rallied, points, percent]
7       [dvd, cca, appealed, decision, supreme, court]
8     [earnings, were, affected, tax, benefit, period]
9              [business, does, fit, growth, strategy]
Name: String2, dtype: object

In [140]:
from gensim.models import Word2Vec

# Single training corpus: the processed String1 rows followed by the
# processed String2 rows, re-indexed from 0.
corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)

# vector_size is the dimensionality of the word vectors, so it must not be
# derived from the number of sentences (that would make the embedding size
# change with the amount of data loaded). Use a fixed, tunable value.
EMBEDDING_DIM = 100

# Passing `sentences` to the constructor builds the vocabulary and trains in
# one step, so no separate build_vocab()/train() calls are needed.
# min_count=2 drops words that occur only once in the corpus.
model = Word2Vec(sentences=corpus, vector_size=EMBEDDING_DIM, min_count=2)

In [142]:
# List at most the first 120 vocabulary entries together with the vocabulary size.
for index, word in enumerate(model.wv.index_to_key[:120]):
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

word #0/44 is percent
word #1/44 is period
word #2/44 is year
word #3/44 is cents
word #4/44 is shares
word #5/44 is stock
word #6/44 is friday
word #7/44 is had
word #8/44 is tab
word #9/44 is sale
word #10/44 is offering
word #11/44 is june
word #12/44 is internet
word #13/44 is jumped
word #14/44 is were
word #15/44 is published
word #16/44 is safeway
word #17/44 is dominick
word #18/44 is yucaipa
word #19/44 is evidence
word #20/44 is distorting
word #21/44 is witness
word #22/44 is brother
word #23/44 is accused
word #24/44 is advertisement
word #25/44 is high
word #26/44 is set
word #27/44 is strategy
word #28/44 is growth
word #29/44 is company
word #30/44 is fit
word #31/44 is does
word #32/44 is business
word #33/44 is court
word #34/44 is supreme
word #35/44 is appealed
word #36/44 is cca
word #37/44 is dvd
word #38/44 is closing
word #39/44 is dropped
word #40/44 is quarter
word #41/44 is revenue
word #42/44 is exchange
word #43/44 is amrozi


In [462]:
# Experiment: using the spaCy library to extract subject-predicate-object (SPO) words.
# This is exploratory test code only; it is intentionally commented out and should stay that way.

# import spacy
#
# def get_spacy_subject_phrase(doc):
#     for token in doc:
#         if ("subj" in token.dep_):
#             return token
#             # subtree = list(token.subtree)
#             # start = subtree[0].i
#             # end = subtree[-1].i + 1
#             # return doc[start:end]
#
# def get_spacy_predicate_phrase(doc):
#     for token in doc:
#         if ("ROOT" in token.dep_):
#             subtree = list(token.subtree)
#             start = subtree[0].i
#             end = subtree[-1].i + 1
#             return token
#             # return doc[start:end]
#
# def get_spacy_object_phrase(doc):
#     for token in doc:
#         if ("dobj" in token.dep_):
#             return token
#             # subtree = list(token.subtree)
#             # start = subtree[0].i
#             # end = subtree[-1].i + 1
#             # return doc[start:end]
#
# def spacy_find_spo(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     sentence = next(doc.sents)
#     previous = None
#     new_sentence = []
#     for word in sentence:
#         print(f"{word} : {word.dep_}")
#         add_word = None
#         if "subj" in word.dep_:
#             add_word = word
#         if "ROOT" in word.dep_:
#             add_word = word
#         if "pobj" in word.dep_:
#             add_word = word
#         if "dobj" in word.dep_:
#             add_word = word
#         # if "prep" in word.dep_:
#         #     add_word = word
#         if "ccomp" in word.dep_:
#             add_word = word
#         if "pcomp" in word.dep_:
#             add_word = word
#         if add_word is not None:
#             if previous is not None and "compound" in previous.dep_:
#                 new_sentence.append(previous)
#             new_sentence.append(add_word)
#         previous = word
#     return new_sentence
#
# def find_spacy_subject(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     for token in doc:
#         # Check if the token is a verb and has a subject
#         if token.dep_ == "ROOT":
#             for child in token.children:
#                 if child.dep_ == "nsubj":
#                     #subject = child.text
#                     predicate = ' '.join(child.text for child in token.children)
#                 break
#     return predicate
#
# def find_spacy_object(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     for token in doc:
#         # Check if the token is a verb and has a subject
#         if token.dep_ == "ROOT":
#             for child in token.children:
#                 if child.dep_ == "nsubj":
#                     obj = child.text
#                 break
#     return obj
#
# #print(spacy_find_spo("Syrian forces launch new attack"))
# #print(spacy_find_spo("Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent"))
# #print(spacy_find_spo("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers."""))
# #print(spacy_find_spo("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence."""))
# #print(spacy_find_spo("""Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war."""))
# print(spacy_find_spo("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war."""))

His : poss
wife : nsubj
said : ROOT
he : nsubj
was : ccomp
" : punct
100 : nummod
percent : npadvmod
behind : prep
George : compound
Bush : pobj
" : punct
and : cc
looked : conj
forward : advmod
to : prep
using : pcomp
his : poss
years : dobj
of : prep
training : pobj
in : prep
the : det
war : pobj
. : punct
[wife, said, he, was, George, Bush, using, years, training, war]
