In [3]:
import stanza

# set 'download_method = None' to not download the resources over and over
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)


2023-04-01 14:57:32 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-01 14:57:32 INFO: Using device: cpu
2023-04-01 14:57:32 INFO: Loading: tokenize
2023-04-01 14:57:32 INFO: Loading: pos
2023-04-01 14:57:32 INFO: Loading: constituency
2023-04-01 14:57:33 INFO: Done loading processors!


In [23]:
def trunk_construction(tree, parent_label = None, leave_pos=False):
    sentences = []
    if 'NN' in tree.label:
        if parent_label == 'NP':
            # sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    if 'VB' in tree.label:
        if parent_label == 'VP':
            #sentences.append(tree)
            sentences = sentences + tree.leaf_labels()
    for child in tree.children:
        sentences = sentences + trunk_construction(child, tree.label)

    return sentences

In [25]:
def test_parser(str, valid_sentence):

    doc = nlp(str)
    tree = doc.sentences[0].constituency

    words = trunk_construction(tree)
    new_sentence = ' '.join(words)
    assert new_sentence == valid_sentence

In [34]:
# test_parser("""Amrozi accused his brother of deliberately distorting his evidence.""", "Amrozi", "distorting", "evidence")
test_parser('Syrian forces launch new attacks', "forces launch attacks")
test_parser("""the flat tire was replaced by the driver""","tire was replaced driver")
test_parser("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
           "Amrozi accused brother called witness distorting evidence")
test_parser("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
            "Shares Genentech company products market rose percent")
test_parser("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
             "Gyorgy Heizler head disaster unit said coach was carrying passengers")
test_parser("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
           "Referring witness Amrozi accused brother distorting evidence")
test_parser("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
            "wife said was percent George Bush looked using years training war")


In [321]:
from torch.utils.data import DataLoader, Dataset

class MRPCDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        sentence1, sentence2 = self.x[index]
        sim = self.y[index]
        return sentence1, sentence2, sim

In [322]:


sentences = [
    """Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
    """Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
    "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.",
    "Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998."
    """His wife said he was "100 percent" behind George Bush" and looked forward to using his years of training in the war."""
]

process_docs(sentences)
#w2v = word2vec(sentences, min_cound=1, size=5)

['(NNP Amrozi) (VBG distorting) (NN evidence)',
 '(NN witness) (VBG Referring) (NN witness)',
 '(NNP Yucaipa) (VBG selling) (NNP Safeway)',
 '(NNP Yucaipa) (VBD sold) (NNP Safeway)']

In [462]:
# Testing the spacy library to extract spo
# This is only test code and should not be uncommented.

# import spacy
#
# def get_spacy_subject_phrase(doc):
#     for token in doc:
#         if ("subj" in token.dep_):
#             return token
#             # subtree = list(token.subtree)
#             # start = subtree[0].i
#             # end = subtree[-1].i + 1
#             # return doc[start:end]
#
# def get_spacy_predicate_phrase(doc):
#     for token in doc:
#         if ("ROOT" in token.dep_):
#             subtree = list(token.subtree)
#             start = subtree[0].i
#             end = subtree[-1].i + 1
#             return token
#             # return doc[start:end]
#
# def get_spacy_object_phrase(doc):
#     for token in doc:
#         if ("dobj" in token.dep_):
#             return token
#             # subtree = list(token.subtree)
#             # start = subtree[0].i
#             # end = subtree[-1].i + 1
#             # return doc[start:end]
#
# def spacy_find_spo(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     sentence = next(doc.sents)
#     previous = None
#     new_sentence = []
#     for word in sentence:
#         print(f"{word} : {word.dep_}")
#         add_word = None
#         if "subj" in word.dep_:
#             add_word = word
#         if "ROOT" in word.dep_:
#             add_word = word
#         if "pobj" in word.dep_:
#             add_word = word
#         if "dobj" in word.dep_:
#             add_word = word
#         # if "prep" in word.dep_:
#         #     add_word = word
#         if "ccomp" in word.dep_:
#             add_word = word
#         if "pcomp" in word.dep_:
#             add_word = word
#         if add_word is not None:
#             if previous is not None and "compound" in previous.dep_:
#                 new_sentence.append(previous)
#             new_sentence.append(add_word)
#         previous = word
#     return new_sentence
#
# def find_spacy_subject(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     for token in doc:
#         # Check if the token is a verb and has a subject
#         if token.dep_ == "ROOT":
#             for child in token.children:
#                 if child.dep_ == "nsubj":
#                     #subject = child.text
#                     predicate = ' '.join(child.text for child in token.children)
#                 break
#     return predicate
#
# def find_spacy_object(str):
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(str)
#     for token in doc:
#         # Check if the token is a verb and has a subject
#         if token.dep_ == "ROOT":
#             for child in token.children:
#                 if child.dep_ == "nsubj":
#                     obj = child.text
#                 break
#     return obj
#
# #print(spacy_find_spo("Syrian forces launch new attack"))
# #print(spacy_find_spo("Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent"))
# #print(spacy_find_spo("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers."""))
# #print(spacy_find_spo("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence."""))
# #print(spacy_find_spo("""Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war."""))
# print(spacy_find_spo("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war."""))

His : poss
wife : nsubj
said : ROOT
he : nsubj
was : ccomp
" : punct
100 : nummod
percent : npadvmod
behind : prep
George : compound
Bush : pobj
" : punct
and : cc
looked : conj
forward : advmod
to : prep
using : pcomp
his : poss
years : dobj
of : prep
training : pobj
in : prep
the : det
war : pobj
. : punct
[wife, said, he, was, George, Bush, using, years, training, war]
