In [33]:
# imports
import json
import numpy as np
import os
import signal
import stanza
import subprocess
import time

from nltk.tokenize import sent_tokenize
from stanza.server import CoreNLPClient

In [54]:
def clear_port_9000():
    # clear the 9000 port to be used by the CoreNLPClient
    command = "netstat -ano | findstr 9000"
    c = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr = subprocess.PIPE)
    stdout, stderr = c.communicate()
    if len(stdout) > 0:    
        # process is running on port 9000, terminate it
        print("Netstat response:")
        print(stdout)
        print("-----------------------------------------------------")
        pid = input("Enter the process ID of the process that should be terminated, or an alphanumeric character in case it is unnecessary.")
        if not pid.isalpha():
            os.kill(int(pid), signal.SIGTERM)
            time.sleep(5)  # wait for the process to close

## 1.1 Head word extraction

In [72]:
def get_head_words(sentences):
    # extracts single-noun head words from a list of sentences

    clear_port_9000()
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=sentences[0], pattern='NP')  # finds the noun phrases in the given text

    # extract the noun phrases and their indices
    noun_phrases = [[text, begin, end] for text, begin, end in
                zip([sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence],
                    [sentence[match_id]['characterOffsetBegin'] for sentence in matches['sentences'] for match_id in sentence],
                    [sentence[match_id]['characterOffsetEnd'] for sentence in matches['sentences'] for match_id in sentence])]

    # find head words that contain other head words (let's call them multiples)
    multiples = []
    for text1, i1, i2 in noun_phrases:
        for text2, j1, j2 in noun_phrases:
            if (text1 != text2) and (i1, i2 != j1, j2):
                if (i1 >= j1) & (i2 <= j2):
                    multiples.append([text2, j1, j2])
    noun_phrases = [[text, i1, i2] for [text, i1, i2] in noun_phrases if [text, i1, i2] not in multiples]

    # turn multi-word noun phrases into single head words
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse', verbose=False)
    [[text, i1, i2] for [text, i1, i2] in noun_phrases for sent in nlp(text.lower()).sentences for word in sent.words if word.deprel == "root"]

    return noun_phrases

## 1.2 Trigger extraction

In [None]:
def get_triggers():
    triggers = []
    # 1) get verbs
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos', verbose=False)
    doc = nlp('Barack Obama was born in Hawaii. Tim Jongenelen was born in Zegge, but currently lives in Eindhoven after having moved from Roermond.')
    for sent in doc.sentences:
        for word in sent.words:
            if word.upos == 'VERB':
                triggers.append(word.text)

    # 2) get 'eventive' nouns from the two wordnet synsets
    # from the definitions and examples we can clearly see that we need the synsets 'event.n.01' and 'act.n.02'
    events = list(set([w for s in wn.synset('event.n.01').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
    acts = list(set([w for s in wn.synset('act.n.02').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

    # 3) maintain only triggers where a head word serves as its subject, object or preposition

    return triggers

## 1.3 Drop head words that are not related to at least one trigger

## 1.4 Extract attributes of the remaining head words

# Testing section 1

In [73]:
# load sample article
with open('database_dump_drugs/0.json') as file:
    article = json.load(file)[0]['article_content'][:423]

In [74]:
# 1.1
sentences = [sent for sent in sent_tokenize(article)]
get_head_words(sentences=sentences)

[['An Ethiopian foreign national', 0, 29],
 ['eMzinoni', 46, 54],
 ['drugs', 70, 75],
 ['his tuck shop', 79, 92],
 ['Friday, 6', 96, 105]]