In [1]:
# imports
import json
import stanza
import warnings

from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from stanza.server import CoreNLPClient

## 1.1 Head word extraction

In [2]:
def _drop_enclosing_phrases(noun_phrases):
    """Discard every noun phrase whose character span encloses a different span.

    Args:
        noun_phrases: list of ``[text, begin, end]`` entries.

    Returns:
        A new list, in the original order, holding only the innermost phrases,
        i.e. those that do not contain any other phrase's (different) span.
    """
    spans = {(begin, end) for _, begin, end in noun_phrases}
    innermost = []
    for text, begin, end in noun_phrases:
        encloses_other = any(
            (inner_begin, inner_end) != (begin, end)
            and begin <= inner_begin
            and inner_end <= end
            for inner_begin, inner_end in spans
        )
        if not encloses_other:
            innermost.append([text, begin, end])
    return innermost


def get_head_words(sents, endpoint='8000'):
    """Extract single-word heads of the innermost noun phrases of a sentence.

    The noun phrases are found with a CoreNLP tregex query (pattern ``NP``);
    phrases that enclose another noun phrase are dropped, and each remaining
    phrase is lower-cased and dependency-parsed with stanza to pick its root
    word.

    NOTE(review): only the first element of ``sents`` is analysed; the other
    sentences are ignored. Confirm whether all sentences should be processed.

    Args:
        sents: list of sentence strings. An empty list yields ``[]``.
        endpoint: where the CoreNLP server is reached. Either a bare port
            (the default ``'8000'`` resolves to ``http://localhost:8000``) or
            a full URL containing ``://``, which is used unchanged.

    Returns:
        A list of ``[word, start_char, end_char]`` entries, one per root word.
        The offsets are the ones stanza reports for the lower-cased noun
        phrase text, so they are relative to that phrase, not to the sentence.
    """
    if not sents:
        return []

    # accept both a bare port and a complete URL for the server location
    endpoint = str(endpoint)
    server_url = endpoint if '://' in endpoint else 'http://localhost:' + endpoint

    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', endpoint=server_url, memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=sents[0], pattern='NP')  # finds the noun phrases in the given text

    # extract the noun phrases and their character offsets
    noun_phrases = []
    for sentence in matches['sentences']:
        for match_id in sentence:
            match = sentence[match_id]
            noun_phrases.append([match['spanString'], match['characterOffsetBegin'], match['characterOffsetEnd']])

    # keep only the phrases that do not contain another noun phrase
    noun_phrases = _drop_enclosing_phrases(noun_phrases)
    if not noun_phrases:
        return []  # nothing to parse, so skip loading the stanza models

    # turn multi-word noun phrases into single head words
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse', verbose=False)
    head_words = []
    for text, _, _ in noun_phrases:
        # only the first parsed sentence is used; the slice avoids an
        # IndexError should the phrase produce no sentence at all
        for parsed_sentence in nlp(text.lower()).sentences[:1]:
            for word in parsed_sentence.words:
                if word.deprel == "root":
                    head_words.append([word.text, word.start_char, word.end_char])

    return head_words

## 1.2 Trigger extraction

In [3]:
def get_triggers(sents):
    """Collect candidate event triggers from a sentence.

    Currently the triggers are the surface forms of all words that stanza
    tags with the universal POS ``VERB``.

    NOTE(review): only the first element of ``sents`` is analysed; the other
    sentences are ignored. Confirm whether all sentences should be processed.

    Args:
        sents: list of sentence strings. An empty list yields ``[]``.

    Returns:
        A list of verb strings in order of occurrence.
    """
    if not sents:
        return []

    # 1) get verbs
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos', verbose=False)
    doc = nlp(sents[0])
    triggers = [word.text
                for sent in doc.sentences
                for word in sent.words
                if word.upos == 'VERB']

    # 2) get 'eventive' nouns from the two wordnet synsets
    # from the definitions and examples we can clearly see that we need the synsets 'event.n.01' and 'act.n.02'
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        evnouns = set()
        for root_name in ('event.n.01', 'act.n.02'):
            # lemma names of every synset in the hyponym closure of the root synset
            for synset in wn.synset(root_name).closure(lambda s: s.hyponyms()):
                evnouns.update(synset.lemma_names())

    # TODO: check if any eventive nouns occur in the sentence
    # (evnouns is prepared above but not used for the result yet)

    # TODO: 3) maintain only triggers where a head word serves as its subject, object or preposition

    return triggers

## 1.3 Drop head words that are not related to at least one trigger

## 1.4 Extract attributes of the remaining head words

# Testing section 1

In [4]:
# load sample article: the first 423 characters of the first record's 'article_content'
# (the encoding is given explicitly so decoding does not depend on the platform default)
with open('database_dump_drugs/0.json', encoding='utf-8') as file:
    article = json.load(file)[0]['article_content'][:423]

In [5]:
# testing 1.1: tokenize the sample article into sentences and extract head words
sentences = list(sent_tokenize(article))
get_head_words(sents=sentences)

2021-07-19 16:55:24 INFO: Starting server with command: java -Xmx8G -cp C:\Users\timjo\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 8000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-2e15724b8064491b.props -preload -outputFormat serialized


[['An Ethiopian foreign national', 0, 29], ['eMzinoni', 46, 54], ['drugs', 70, 75], ['his tuck shop', 79, 92], ['Friday, 6', 96, 105]]


[['national', 21, 29],
 ['emzinoni', 0, 8],
 ['drugs', 0, 5],
 ['shop', 9, 13],
 ['friday', 0, 6]]

In [6]:
# testing 1.2: run trigger extraction on the tokenized sample article
get_triggers(sentences)

['arrested', 'dealing']