In [1]:
# imports
import json
import stanza
import warnings

from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from stanza.server import CoreNLPClient

## 1.1 Head word extraction

In [2]:
def get_head_words(sents, endpoint='8000'):
    """
    Extracts single-noun head words from the first sentence of an article.

    1) Extracts all noun phrases, tagged with 'NP' by the CoreNLPClient,
    2) Removes multiples (NPs that contain other NPs),
    3) Transforms multi-word NPs into single-word head nouns based on a simple Stanza Pipeline.

    Only sents[0] is analysed; any further sentences are ignored.

    Parameters:
    sents (list): Sentence-tokenized article
    endpoint (string): Port to use for the CoreNLPClient

    Returns:
    list: Head word ids (Stanza word ids for sents[0]); empty list if sents is empty
    """

    if not sents:
        return []
    sentence = sents[0]

    # extract the noun phrases (tregex) and their indices
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', endpoint=f'http://localhost:{endpoint}', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=sentence, pattern='NP')

    # One (begin, end) character span per matched NP; dict.fromkeys drops repeated
    # spans while keeping the match order.
    spans = list(dict.fromkeys(
        (match['characterOffsetBegin'], match['characterOffsetEnd'])
        for sentence_matches in matches['sentences']
        for match in sentence_matches.values()
    ))

    # remove multiples: drop every NP whose span encloses a different NP's span
    innermost = [
        (outer_begin, outer_end)
        for outer_begin, outer_end in spans
        if not any(
            (begin, end) != (outer_begin, outer_end) and outer_begin <= begin and end <= outer_end
            for begin, end in spans
        )
    ]

    # turn multi-word noun phrases into single head words
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse', verbose=False)
    head_spans = []
    for begin, end in innermost:
        # Parse the exact slice of the sentence so that offsets inside the phrase
        # map one-to-one onto offsets in the sentence.
        phrase = sentence[begin:end].lower()
        parsed = nlp(phrase)
        if not parsed.sentences:
            continue
        for word in parsed.sentences[0].words:
            if word.deprel == "root":
                # Use the parser's own character offset; a substring search could hit
                # an earlier word that merely contains the head ("artist's art").
                offset = word.start_char if word.start_char is not None else phrase.find(word.text)
                start = begin + offset  # start index of the root word in the original sentence
                head_spans.append((start, start + len(word.text)))

    # swap the word indices for their respective ids
    doc = nlp(sentence)
    id_by_span = {
        (word.start_char, word.end_char): word.id
        for sent in doc.sentences
        for word in sent.words
    }
    return [id_by_span[span] for span in head_spans if span in id_by_span]

## 1.2 Trigger extraction

In [11]:
def get_triggers(sents, hw, pipeline=None):
    """
    Extracts head words that have either a verb or an eventive noun as its subject/object/preposition.

    First extracts all triggers: verbs or eventive nouns based on the Wordnet Synsets indicated by the authors.
    Then goes through all head words in hw, and finds those that have a trigger as its subject/object/preposition.
    Finally, looks for transitive triggers, and extracts the correct subject from the root verb.

    Only sents[0] is analysed, and only its first parsed sentence, because word ids
    and head references are local to a sentence.

    Parameters:
    sents (list): sentence-tokenized article
    hw (list): list of head word IDs
    pipeline (callable, optional): Stanza pipeline used to parse sents[0];
        defaults to the notebook-level `nlp`

    Returns:
    list: Triplet lists of head word, trigger, and relation - e.g., [[4, 6, 'nsubj:pass']]
    """

    if not sents:
        return []

    # The notebook-level pipeline is only looked up when none is passed in.
    parse = nlp if pipeline is None else pipeline
    doc = parse(sents[0])
    if not doc.sentences:
        return []
    words = doc.sentences[0].words

    verbs = [word.id for word in words if word.upos == 'VERB']

    # get 'eventive' nouns from the two wordnet synsets mentioned in the article;
    # a set keeps the lemma lookups below cheap and removes duplicates
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        eventive_nouns = {
            lemma
            for root_synset in ('event.n.01', 'act.n.02')
            for synset in wn.synset(root_synset).closure(lambda s: s.hyponyms())
            for lemma in synset.lemma_names()
        }

    # merge the verbs and the eventive nouns into one set of trigger ids
    trigger_ids = set(verbs) | {word.id for word in words if word.lemma in eventive_nouns}
    head_word_ids = set(hw)

    # find all head words that are related to a trigger; keep the deprel for
    # subjects/objects and the xpos otherwise
    # NOTE(review): the preposition branch tests the head word's own xpos for 'IN' — confirm this is intended
    relations = []
    for word in words:
        if word.id not in head_word_ids or word.head not in trigger_ids:
            continue
        deprel = word.deprel or ""
        if "subj" in deprel or "obj" in deprel:
            relations.append([word.id, word.head, word.deprel])
        elif "IN" in (word.xpos or ""):
            relations.append([word.id, word.head, word.xpos])

    # check for transitive triggers and get the subject from the root
    for verb in verbs:
        governor = words[verb - 1].head
        if governor == 0:
            continue
        # Collect first, then extend, so the list is never modified while it is being iterated.
        inherited = [[noun, verb, rel] for noun, trigger, rel in relations if trigger == governor and rel == 'nsubj']
        relations.extend(inherited)

    return relations

## 1.3 Extract attributes of the remaining head words

# Testing section

In [4]:
# load sample article
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('database_dump_drugs/0.json', encoding='utf-8') as file:
    # NOTE(review): 423 looks like a hand-picked excerpt length — confirm
    article = json.load(file)[0]['article_content'][:423]
# Notebook-level pipeline; get_triggers and the demonstration cell read this `nlp`.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse', verbose=False)

In [5]:
# testing 1.1
# Tokenize the article, then overwrite its first sentence with a fixed example
# so the head-word extraction runs on a known input.
sentences = sent_tokenize(article)
sentences[0] = 'Two armed men attacked the police station and killed a policeman.'
head_words = get_head_words(sentences)
print(head_words)

[3, 7, 11]


In [12]:
# testing 1.2
# Relate the head words found in 1.1 to their triggers.
tr = get_triggers(sentences, head_words)

In [15]:
# for demonstration purposes:
# Re-parse the sentence so the ids in `tr` can be turned back into word texts
# (ids are 1-based, hence the -1 when indexing).
doc = nlp(sentences[0])
demo_words = doc.sentences[0].words
print(f'Input sentence: {sentences[0]}')
for head_id, trigger_id, relation in tr:
    print(f'Head word: {demo_words[head_id-1].text}\t Trigger: {demo_words[trigger_id-1].text}\t Relation: {relation}')

Input sentence: Two armed men attacked the police station and killed a policeman.
Head word: men	 Trigger: attacked	 Relation: nsubj
Head word: station	 Trigger: attacked	 Relation: obj
Head word: policeman	 Trigger: killed	 Relation: obj
Head word: men	 Trigger: killed	 Relation: nsubj
