In [154]:
import stanza

# English pipeline with the tokenizer, POS tagger and constituency parser.
# download_method=None keeps Stanza from downloading the resources again
# each time this cell is run.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,constituency',
    download_method=None,
)


2023-03-30 09:47:36 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-03-30 09:47:36 INFO: Using device: cpu
2023-03-30 09:47:36 INFO: Loading: tokenize
2023-03-30 09:47:36 INFO: Loading: pos
2023-03-30 09:47:37 INFO: Loading: constituency
2023-03-30 09:47:37 INFO: Done loading processors!


In [155]:
# Parse a sample headline and print the constituency tree of every sentence
# the pipeline found in it.
doc = nlp('Syrian forces launch new attacks')
for sentence in doc.sentences:
    print(sentence.constituency)

(ROOT (S (NP (JJ Syrian) (NNS forces)) (VP (VBP launch) (NP (JJ new) (NNS attacks)))))


In [156]:
# Keep the constituency tree of the first parsed sentence in a notebook-level variable.
tree = doc.sentences[0].constituency

In [157]:
def equals(a, b):
    """Comparator: return the result of ``a == b``."""
    return a == b

def starts_with(a, b):
    """Comparator: return ``a.startswith(b)``.

    When used with ``find_pos``, ``a`` is a node label and ``b`` is the
    wanted prefix (e.g. ``'NN'``).
    """
    return a.startswith(b)

def is_in(a, b) -> bool:
    """Comparator: return ``a in b``.

    When used with ``find_pos``, ``a`` is a node label and ``b`` is the
    collection of accepted labels (e.g. ``['NP', 'VP']``).
    """
    return a in b

def find_pos(tree, comparator, labels):
    """Return the first node, in depth-first pre-order, whose label matches.

    Args:
        tree: Root of the (sub)tree to search. It must expose ``label``,
            ``children`` and ``is_leaf()``. ``None`` is accepted and yields
            ``None``, so the result of one search can be fed straight into
            the next one without an intermediate check.
        comparator: Callable invoked as ``comparator(node.label, labels)``;
            a truthy result marks ``node`` as the match.
        labels: Passed unchanged as the second argument of ``comparator``
            (e.g. a list of labels for ``is_in``, a prefix for ``starts_with``).

    Returns:
        The matching node, or ``None`` when no node in the tree matches.
    """
    # A previous search may have found nothing; treat that as "no match"
    # instead of failing on ``None.label``.
    if tree is None:
        return None

    # The node itself is tested before its descendants (pre-order).
    if comparator(tree.label, labels):
        return tree

    if not tree.is_leaf():
        for child in tree.children:
            match = find_pos(child, comparator, labels)
            if match is not None:
                return match

    return None

def trunk_construction(tree):
    """Extract the (subject, predicate, object) trunk of a constituency tree.

    The subject is the first ``NN*`` node inside the first ``NP`` of the
    tree, the predicate is the first ``VB*`` node inside the first ``VP``,
    and the object is the first ``NN*`` node found in the children of that
    ``VP``.

    Args:
        tree: Constituency tree of one sentence (nodes expose ``label``,
            ``children`` and ``is_leaf()``), or ``None``.

    Returns:
        A ``(subject, predicate, object)`` tuple of tree nodes. Any element
        that cannot be found is ``None``.
    """
    # Every slot defaults to None so a missing part looks the same as a
    # failed find_pos lookup. The local is named ``obj`` to avoid shadowing
    # the builtin ``object``.
    subject = predicate = obj = None
    if tree is None:
        return subject, predicate, obj

    noun_phrase = find_pos(tree, is_in, ["NP"])
    if noun_phrase is not None:
        subject = find_pos(noun_phrase, starts_with, 'NN')

    verb_phrase = find_pos(tree, is_in, ["VP"])
    if verb_phrase is None:
        return subject, predicate, obj

    predicate = find_pos(verb_phrase, starts_with, 'VB')

    for child in verb_phrase.children:
        # Prefer a noun that sits inside a nested NP/VP of this child ...
        np_vp = find_pos(child, is_in, ['NP', 'VP'])
        candidate = None
        if np_vp is not None:
            candidate = find_pos(np_vp, starts_with, 'NN')
        # ... and only fall back to any noun in the child when there is none.
        if candidate is None:
            candidate = find_pos(child, starts_with, 'NN')

        if candidate is not None:
            # Stop at the first hit so a later noun-less child (adverb,
            # punctuation, ...) cannot wipe out the object already found.
            obj = candidate
            break

    return subject, predicate, obj


In [159]:
def test_stanza_parser():
    """Check the trunk extracted from a known headline.

    Parses 'Syrian forces launch new attacks' with the notebook-level
    ``nlp`` pipeline, prints the extracted subject / predicate / object and
    asserts the POS tag and the word of each of them.
    """
    parsed = nlp('Syrian forces launch new attacks')
    root = parsed.sentences[0].constituency

    subj, pred, obj = trunk_construction(root)

    print(f"subject: {subj}")
    print(f"predicate: {pred}")
    print(f"object: {obj}")

    # (extracted node, expected POS tag, expected word), checked in order.
    expectations = (
        (subj, 'NNS', 'forces'),
        (pred, 'VBP', 'launch'),
        (obj, 'NNS', 'attacks'),
    )
    for node, tag, word in expectations:
        assert node.label == tag
        assert node.children[0].label == word


test_stanza_parser()

subject: (NNS forces)
predicate: (VBP launch)
object: (NNS attacks)


'forces'