In [1]:
import json
import os
import signal
import stanza
import subprocess


from nltk.corpus import wordnet as wn
from stanza.server import CoreNLPClient

In [2]:
# run once to install CoreNLP
stanza.install_corenlp()



# 1 - Entity extraction
"Entities are represented as a triple containing: a head word <i>h</i>, a list <i>A</A> of attribute relations, and a list <i>T</i> of trigger relations"

## 1.1 Head word extraction
The paper states that "Head words are extracted from noun phrases."  
Noun phrases are defined as: "A word or group of words containing a noun and functioning in a sentence as subject, object, or prepositional object."

In [2]:
def get_head_words(text):
    # finds single head words from noun phrases in a given sentence

    # extract noun phrases
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props') as client:
        pattern = 'NP'
        matches = client.tregex(text=text, pattern=pattern)
    noun_phrases = [sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence]

    # find which noun phrases contain multiple nouns
    doubles = []
    for sentence in matches['sentences']:
        match_ids = range(len(sentence))
        for m in match_ids:
            # for each noun phrase m
            for n in match_ids:
                # check if there is another noun phrase n that has both start >= and end <= m
                if (sentence[str(n)]['characterOffsetBegin'] >= sentence[str(m)]['characterOffsetBegin']) \
                        & (sentence[str(n)]['characterOffsetEnd'] <= sentence[str(m)]['characterOffsetEnd']) \
                        & (m != n) \
                        & (m not in doubles):
                    # if so, then m contains n, so we only want to keep n
                    doubles.append(m)
    for n in doubles:
        del(noun_phrases[n])

    # turn multi-word noun phrases into single head words
    head_words = []
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse')
    for phrase in noun_phrases:
        doc = nlp(phrase.lower())
        for sent in doc.sentences:
            for word in sent.words:
                if word.deprel == "root":
                    head_words.append(word.text)
    return head_words

## 1.2 Trigger extraction

In [4]:
def get_triggers():
    triggers = []
    # 1) get verbs
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')
    doc = nlp('Barack Obama was born in Hawaii. Tim Jongenelen was born in Zegge, but currently lives in Eindhoven after having moved from Roermond.')
    for sent in doc.sentences:
        for word in sent.words:
            if word.upos == 'VERB':
                triggers.append(word.text)

    # 2) get 'eventive' nouns from the two wordnet synsets
    # from the definitions and examples we can clearly see that we need the synsets 'event.n.01' and 'act.n.02'
    events = list(set([w for s in wn.synset('event.n.01').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
    acts = list(set([w for s in wn.synset('act.n.02').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

    # 3) maintain only triggers where a head word serves as its subject, object or preposition

    return triggers

## 1.3 Drop head words that are not related to at least one trigger

In [5]:
# some more code

## 1.4 Extract head word attributes

In [6]:
# some more code

## Testing section 1

In [3]:
# open a given article to have some text to experiment with
with open('database_dump_drugs/0.json') as file:
    article = json.load(file)[0]['article_content']
sentence = article[:114]  # first sentence of the article
sentence

'An Ethiopian foreign national was arrested in eMzinoni for dealing in drugs at his tuck shop on Friday, 6 October.'

In [12]:
# clear the 9000 port to be used by the CoreNLPClient
command = "netstat -ano | findstr 9000"
c = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr = subprocess.PIPE)
stdout, stderr = c.communicate()
if len(stdout) > 0:    
    # process is running on port 9000, terminate it
    print(stdout)
    pid = input("Enter the process ID of the process that should be terminated, or an alphanumeric character in case it is unnecessary.")
    if not pid.isalpha():
        os.kill(pid, signal.SIGTERM)

get_head_words(sentence)


b'  TCP    [::1]:57698            [::1]:9000             TIME_WAIT       0\r\n'


2021-07-14 12:11:24 INFO: Starting server with command: java -Xmx5G -cp C:\Users\timjo\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-2e15724b8064491b.props -preload -outputFormat serialized
2021-07-14 12:11:42 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-07-14 12:11:42 INFO: Use device: cpu
2021-07-14 12:11:42 INFO: Loading: tokenize
2021-07-14 12:11:42 INFO: Loading: pos
2021-07-14 12:11:42 INFO: Loading: lemma
2021-07-14 12:11:42 INFO: Loading: depparse
2021-07-14 12:11:42 INFO: Done loading processors!


['national', 'emzinoni', 'drugs', 'shop', 'friday']