In [1]:
import json
import os
import signal
import stanza
import subprocess
import time

from nltk.corpus import wordnet as wn
from stanza.server import CoreNLPClient

In [2]:
# run once to install CoreNLP
stanza.install_corenlp()



# 1 - Entity extraction
"Entities are represented as a triple containing: a head word <i>h</i>, a list <i>A</A> of attribute relations, and a list <i>T</i> of trigger relations"

## 1.1 Head word extraction
The paper states that "Head words are extracted from noun phrases."  
Noun phrases are defined as: "A word or group of words containing a noun and functioning in a sentence as subject, object, or prepositional object."

In [3]:
def clear_port_9000():
    # clear the 9000 port to be used by the CoreNLPClient
    command = "netstat -ano | findstr 9000"
    c = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr = subprocess.PIPE)
    stdout, stderr = c.communicate()
    if len(stdout) > 0:    
        # process is running on port 9000, terminate it
        print("Netstat response:")
        print(stdout)
        print("-----------------------------------------------------")
        pid = input("Enter the process ID of the process that should be terminated, or an alphanumeric character in case it is unnecessary.")
        if not pid.isalpha():
            os.kill(int(pid), signal.SIGTERM)
            time.sleep(5)  # wait for the process to close

def get_head_words(text):
    # finds single head words from noun phrases in a given sentence

    # extract noun phrases
    clear_port_9000()
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=text, pattern='NP')
    head_words = {text: (begin, end) for text, begin, end in
                zip([sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence],
                    [sentence[match_id]['characterOffsetBegin'] for sentence in matches['sentences'] for match_id in sentence],
                    [sentence[match_id]['characterOffsetEnd'] for sentence in matches['sentences'] for match_id in sentence])}

    # find head words that contain other head words
    multiples = []
    for t1, i1 in head_words.items():
        for t2, i2 in head_words.items():
            if (t1 != t2) and (i1 != i2):
                if ((i1[0] >= i2[0]) & (i1[1] < i2[1])) or ((i1[0] > i2[0]) & (i1[1] <= i2[1])):
                    multiples.append(i2)
    multiples = set(multiples)
    # remove these 'multiples'
    head_words = {key: val for key, val in head_words.items() if val not in multiples}

    # turn multi-word noun phrases into single head words (the authors clearly state the only use single head words)
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse', verbose=False)
    for nphrase in head_words.keys():
        for sent in nlp(nphrase.lower()).sentences:
            for word in sent.words:
                if word.deprel == "root":
                    head_words[word.text] = head_words.pop(nphrase)

    return head_words

## 1.2 Trigger extraction

In [4]:
def get_triggers():
    triggers = []
    # 1) get verbs
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos', verbose=False)
    doc = nlp('Barack Obama was born in Hawaii. Tim Jongenelen was born in Zegge, but currently lives in Eindhoven after having moved from Roermond.')
    for sent in doc.sentences:
        for word in sent.words:
            if word.upos == 'VERB':
                triggers.append(word.text)

    # 2) get 'eventive' nouns from the two wordnet synsets
    # from the definitions and examples we can clearly see that we need the synsets 'event.n.01' and 'act.n.02'
    events = list(set([w for s in wn.synset('event.n.01').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
    acts = list(set([w for s in wn.synset('act.n.02').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

    # 3) maintain only triggers where a head word serves as its subject, object or preposition

    return triggers

## 1.3 Drop head words that are not related to at least one trigger

In [5]:
# some more code

## 1.4 Extract head word attributes

In [6]:
# some more code

## Testing section 1

In [7]:
# open a given article to have some text to experiment with
with open('database_dump_drugs/0.json') as file:
    article = json.load(file)[0]['article_content']
sentence = article[:114]  # first sentence of the article
sentence

'An Ethiopian foreign national was arrested in eMzinoni for dealing in drugs at his tuck shop on Friday, 6 October.'

In [8]:
get_head_words(sentence)

Netstat response:
b'  TCP    127.0.0.1:9000         0.0.0.0:0              LISTENING       18288\r\n  TCP    127.0.0.1:59353        127.0.0.1:9000         TIME_WAIT       0\r\n'
-----------------------------------------------------


2021-07-14 15:50:58 INFO: Starting server with command: java -Xmx8G -cp C:\Users\timjo\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-2e15724b8064491b.props -preload -outputFormat serialized


{'eMzinoni': (46, 54),
 'friday': (96, 105),
 'national': (0, 29),
 'drugs': (70, 75),
 'shop': (79, 92)}