In [1]:
# imports
import json
import stanza
import warnings

from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from stanza.server import CoreNLPClient

## 1.1 Head word extraction

In [2]:
def get_head_words(text, endpoint='8000'):
    """
    Extracts single-noun head words from an article.

    1) Extracts all noun phrases, tagged with 'NP' by the CoreNLPClient,
    2) Removes multiples (NPs that contain other NPs), so only the innermost NPs remain,
    3) Reduces every remaining NP to its head word: the word the Stanza pipeline labels 'root'.

    Relies on the notebook-level Stanza pipeline `nlp` being defined before the call.

    Parameters:
    text (string): Raw article text
    endpoint (string): Port to use for the CoreNLPClient

    Returns:
    list: [sentence ID, word ID] pairs of the head words, in document order, without duplicates
    """

    # nothing to extract: avoid starting a CoreNLP server for empty input
    if not text or not text.strip():
        return []

    # extract the noun phrases (tregex) and their character offsets
    # NOTE(review): the .props file name looks auto-generated - confirm it exists in the working directory
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', endpoint=f'http://localhost:{endpoint}', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=text, pattern='NP')

    spans = [(match['characterOffsetBegin'], match['characterOffsetEnd'])
             for sentence in matches['sentences']
             for match in sentence.values()]
    # the same span can be matched more than once; keep the first occurrence only
    spans = list(dict.fromkeys(spans))

    # remove multiples: drop every span that fully contains a *different* span
    innermost = [outer for outer in spans
                 if not any(inner != outer and outer[0] <= inner[0] and inner[1] <= outer[1]
                            for inner in spans)]

    # turn multi-word noun phrases into the character span of their single head word
    head_spans = set()
    for begin, end in innermost:
        # parse the exact substring of the input so that offsets inside the phrase
        # line up with offsets in `text`
        phrase = text[begin:end]
        for sent in nlp(phrase).sentences:
            for word in sent.words:
                if word.deprel != "root":
                    continue
                # prefer the parser's own offset: str.find would return the first
                # substring hit (e.g. "man" inside "manly") instead of the root itself
                offset = getattr(word, 'start_char', None)
                if offset is None:
                    offset = phrase.find(word.text)
                    if offset < 0:
                        continue
                start = begin + offset
                head_spans.add((start, start + len(word.text)))

    # map the character spans back to sentence/word IDs of the full document
    doc = nlp(text)
    return [[sent.id, word.id]
            for sent in doc.sentences
            for word in sent.words
            if (word.start_char, word.end_char) in head_spans]

## 1.2 Trigger extraction

In [3]:
def get_triggers(sents, hw):
    """
    Extracts head words that are the subject/object/preposition of a verb or an eventive noun.

    1) Extract all verbs
    2) Get eventive nouns: lemma names of all hyponyms of the WordNet synsets 'event.n.01' and 'act.n.02'
    3) Combine 1 and 2 into a set of trigger candidates
    4) Find all head words whose syntactic head is a trigger candidate
    5) For every verb that depends on another word, copy the subjects of that word (same sentence only) to the verb

    Relies on the notebook-level Stanza pipeline `nlp` being defined before the call.

    Parameters:
    sents (string or list): Article text; a list/tuple of sentences is joined with single spaces before parsing
    hw (list): [sentence ID, word ID] pairs of the head words

    Returns:
    list: [sentence ID, head word ID, trigger ID, relation] lists - e.g., [[0, 3, 4, 'nsubj']]
    """

    # parse the text that was passed in (not a notebook global)
    article_text = ' '.join(sents) if isinstance(sents, (list, tuple)) else sents
    doc = nlp(article_text)

    # verbs stay an ordered list because step 5 walks them in document order
    verbs = [(sent.id, word.id) for sent in doc.sentences for word in sent.words if word.upos == 'VERB']

    # generate the set of eventive nouns (a set keeps the per-word lookup cheap)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        eventive_nouns = {
            name
            for root in ('event.n.01', 'act.n.02')
            for synset in wn.synset(root).closure(lambda s: s.hyponyms())
            for name in synset.lemma_names()
        }

    # trigger candidates: all verbs plus every word whose lemma is an eventive noun
    candidates = set(verbs)
    candidates.update((sent.id, word.id)
                      for sent in doc.sentences
                      for word in sent.words
                      if word.lemma in eventive_nouns)

    head_word_ids = {tuple(pair) for pair in hw}

    # finds all head word - trigger dyads and their syntactic relation
    triggers = []
    for sent in doc.sentences:
        for word in sent.words:
            if (sent.id, word.head) in candidates and (sent.id, word.id) in head_word_ids:
                if "IN" in word.xpos:
                    triggers.append([sent.id, word.id, word.head, word.xpos])
                elif any(rel in word.deprel for rel in ("subj", "obj")):
                    triggers.append([sent.id, word.id, word.head, word.deprel])

    # a verb attached to another word inherits that word's subjects
    for sent_id, verb_id in verbs:
        head = doc.sentences[sent_id].words[verb_id - 1].head
        if head == 0:  # the verb is the sentence root: nothing to inherit from
            continue
        # iterate over a snapshot because the list grows inside the loop
        for trigger_sent, noun, trigger, rel in list(triggers):
            # word IDs restart in every sentence, so the sentence has to match as well
            if trigger_sent == sent_id and trigger == head and 'subj' in rel:
                triggers.append([sent_id, noun, verb_id, rel])

    return triggers

## 1.3 Extract attributes of the remaining head words

# Testing section

In [4]:
# Sample article: only the first 423 characters of the first record's content are kept.
with open('database_dump_drugs/0.json') as file:
    article = json.load(file)[0]['article_content']
    article = article[:423]

# Stanza pipeline that get_head_words and get_triggers look up as the global `nlp`.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma,depparse',
    verbose=False,
)

In [5]:
# testing 1.1: head words of a two-sentence example
text = (
    'Two armed men attacked the police station and killed a policeman. '
    'An innocent young man was also wounded'
)
head_words = get_head_words(text)
head_words

[[0, 3], [0, 7], [0, 11], [1, 4]]

In [6]:
# testing 1.2: triggers for the example text and its head words
tr = get_triggers(text, head_words)
tr

[[0, 3, 4, 'nsubj'],
 [0, 7, 4, 'obj'],
 [0, 11, 9, 'obj'],
 [1, 4, 7, 'nsubj:pass'],
 [0, 3, 9, 'nsubj']]

In [8]:
# for demonstration purposes: resolve the IDs stored in `tr` back to the words they refer to
doc = nlp(text)
print(f'Input text: {text}')
for sent_id, head_id, trigger_id, relation in tr:
    words = doc.sentences[sent_id].words  # word IDs are 1-based, list positions 0-based
    print(f'Head word: {words[head_id - 1].text}\t Trigger: {words[trigger_id - 1].text}\t Relation: {relation}')

Input text: Two armed men attacked the police station and killed a policeman. An innocent young man was also wounded
Head word: men	 Trigger: attacked	 Relation: nsubj
Head word: station	 Trigger: attacked	 Relation: obj
Head word: policeman	 Trigger: killed	 Relation: obj
Head word: man	 Trigger: wounded	 Relation: nsubj:pass
Head word: men	 Trigger: killed	 Relation: nsubj
