In [2]:
# imports
import json
import stanza
import warnings

from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from stanza.server import CoreNLPClient

## 1.1 Head word extraction

In [3]:
def get_head_words(sents, endpoint='8000'):
    """Extract single-word head words from the noun phrases of the first sentence.

    Only ``sents[0]`` is analysed; any further entries of ``sents`` are ignored.

    Steps:
      1. A CoreNLP server (opened through ``CoreNLPClient``) is queried with the
         Tregex pattern ``NP`` to collect noun phrases and their character spans.
      2. Noun phrases whose span encloses another noun phrase are dropped, so
         only the innermost ones remain.
      3. Each remaining noun phrase is lower-cased and parsed with Stanza; the
         word whose dependency relation is ``root`` becomes its head word.
      4. The head words' character spans are matched against a Stanza parse of
         ``sents[0]`` to replace the spans with Stanza word ids.

    Args:
        sents: list of sentence strings; only the first one is used.
        endpoint: port of the local CoreNLP server, inserted into
            ``http://localhost:{endpoint}``.

    Returns:
        A list of ``[text, word_id]`` pairs (``text`` is lower-cased). Head words
        whose computed span matches no word of the parsed sentence are omitted.
        An empty ``sents`` yields an empty list.
    """
    if not sents:
        return []
    first_sentence = sents[0]

    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', endpoint=f'http://localhost:{endpoint}', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=first_sentence, pattern='NP')  # finds the noun phrases in the given text

    # extract the noun phrases and their character spans
    noun_phrases = [
        [sentence_matches[match_id]['spanString'],
         sentence_matches[match_id]['characterOffsetBegin'],
         sentence_matches[match_id]['characterOffsetEnd']]
        for sentence_matches in matches['sentences']
        for match_id in sentence_matches
    ]

    # find noun phrases that contain other noun phrases (let's call them multiples)
    multiples = []
    for inner_text, inner_begin, inner_end in noun_phrases:
        for outer_text, outer_begin, outer_end in noun_phrases:
            # compare the spans as tuples: `(a, b != c, d)` would be a 3-tuple that is always truthy
            is_other_phrase = inner_text != outer_text and (inner_begin, inner_end) != (outer_begin, outer_end)
            if is_other_phrase and outer_begin <= inner_begin and inner_end <= outer_end:
                multiples.append([outer_text, outer_begin, outer_end])
    noun_phrases = [phrase for phrase in noun_phrases if phrase not in multiples]

    # turn multi-word noun phrases into single head words
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse', verbose=False)
    head_words = []
    for text, phrase_begin, _phrase_end in noun_phrases:
        lowered = text.lower()
        parsed = nlp(lowered)
        if not parsed.sentences:
            continue  # nothing to parse in this phrase
        for word in parsed.sentences[0].words:
            if word.deprel == "root":
                # Prefer the parsed word's own offset: searching for its text could hit
                # the same letters inside an earlier, longer token of the phrase.
                offset = getattr(word, 'start_char', None)
                if offset is None:
                    offset = lowered.find(word.text)
                start = phrase_begin + offset  # start index of the root word in the original sentence
                head_words.append([word.text, start, start + len(word.text)])

    # swap the character spans for the ids of the words that occupy them
    doc = nlp(first_sentence)
    return [
        [text, word.id]
        for text, start, end in head_words
        for sent in doc.sentences
        for word in sent.words
        if word.start_char == start and word.end_char == end
    ]

## 1.2 Trigger extraction

In [4]:
def get_triggers(sents, head_words):
    """Pair head words with the trigger word they depend on in the first sentence.

    Only ``sents[0]`` is analysed. The function reads the notebook-level Stanza
    pipeline ``nlp``, which therefore has to exist before this is called.

    A word counts as a trigger when its universal POS tag is ``VERB`` or when
    its lemma is a lemma name of ``event.n.01`` / ``act.n.02`` or of any of
    their transitive hyponyms in WordNet.
    NOTE(review): the WordNet check is applied to every word regardless of its
    POS tag, not only to nouns -- confirm that this is intended.

    Args:
        sents: list of sentence strings; only the first one is used.
        head_words: list of ``[text, word_id]`` pairs as returned by
            ``get_head_words``.

    Returns:
        A list of ``[head_word_id, trigger_id, relation]`` triples in word
        order. ``relation`` is the dependency relation when it contains
        ``subj`` or ``obj``; otherwise it is the word's xpos tag (which then
        contains ``IN``). An empty ``sents`` yields an empty list.
    """
    if not sents:
        return []

    doc = nlp(sents[0])
    words = [word for sent in doc.sentences for word in sent.words]

    # 1) get verbs
    trigger_ids = {word.id for word in words if word.upos == 'VERB'}

    # 2) get 'eventive' nouns from the two wordnet synsets mentioned in the article
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # a set keeps the per-word membership test cheap; the lemma collection is large
        eventive_lemmas = {
            lemma_name
            for root_name in ('event.n.01', 'act.n.02')
            for synset in wn.synset(root_name).closure(lambda s: s.hyponyms())
            for lemma_name in synset.lemma_names()
        }
    trigger_ids.update(word.id for word in words if word.lemma in eventive_lemmas)

    # 3) create pairs of triggers with the respective head word that serves as its subject, object or preposition
    head_word_ids = {word_id for _text, word_id in head_words}
    pairs = []
    for word in words:
        is_argument = ("subj" in word.deprel) or ("obj" in word.deprel)
        if not (is_argument or ("IN" in word.xpos)):
            continue
        if word.head in trigger_ids and word.id in head_word_ids:
            # subjects/objects report their dependency relation, everything else its xpos tag
            pairs.append([word.id, word.head, word.deprel if is_argument else word.xpos])

    return pairs

## 1.3 Drop head words that are not related to at least one trigger

## 1.4 Extract attributes of the remaining head words

# Testing section 1

In [5]:
# load sample article
# JSON text is UTF-8; state it explicitly so the platform's default encoding is never used
with open('database_dump_drugs/0.json', encoding='utf-8') as file:
    # keep only the first 423 characters of the first record's article text
    article = json.load(file)[0]['article_content'][:423]
# notebook-level pipeline; get_triggers and the demonstration cell read it as `nlp`
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse', verbose=False)

In [6]:
# testing 1.1: split the article into sentences and extract the head words
sentences = list(sent_tokenize(article))
hw = get_head_words(sents=sentences)
print(hw)

[['national', 4], ['emzinoni', 8], ['drugs', 12], ['shop', 16], ['friday', 18]]


In [7]:
# testing 1.2: pair the head words in `hw` with their triggers
tr = get_triggers(sents=sentences, head_words=hw)

In [8]:
# for demonstration purposes: show the first triple returned by get_triggers
# tr[0] is [head word id, trigger id, relation]; each id is shifted by -1 to index the word list
doc = nlp(sentences[0])
print(f'Input sentence: {sentences[0]}')
print(f'Head word: {doc.sentences[0].words[tr[0][0]-1].text}\t Trigger: {doc.sentences[0].words[tr[0][1]-1].text}\t Relation: {tr[0][2]}')

Input sentence: An Ethiopian foreign national was arrested in eMzinoni for dealing in drugs at his tuck shop on Friday, 6 October.
Head word: national	 Trigger: arrested	 Relation: nsubj:pass
