In [1]:
import json
import stanza

from stanza.server import CoreNLPClient

In [2]:
# One-time setup: install CoreNLP through stanza.
# This cell only needs to be run once; skip it on later runs.
stanza.install_corenlp()

# 1 - Entity extraction
"Entities are represented as a triple containing: a head word <i>h</i>, a list <i>A</i> of attribute relations, and a list <i>T</i> of trigger relations"

In [2]:
def sample_article():
    """Return the content of the first article in the sample dump.

    Reads ``database_dump_drugs/0.json`` (relative to the current working
    directory) and returns the ``'article_content'`` value of its first entry.
    """
    # The context manager closes the file handle as soon as parsing is done;
    # passing a bare open() to json.load would leave it open.
    with open('database_dump_drugs/0.json') as dump_file:
        records = json.load(dump_file)
    return records[0]['article_content']

## 1.1 Head word extraction
The paper states that "Head words are extracted from noun phrases."  
Noun phrases are defined as: "A word or group of words containing a noun and functioning in a sentence as subject, object, or prepositional object."

In [3]:
article = sample_article()
sent = article[:114]  # the first sentence of the article is its first 114 characters

# Query the CoreNLP server for every subtree matching the Tregex pattern.
pattern = 'NP'
with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props') as client:
    matches = client.tregex(text=sent, pattern=pattern)

# Each sentence entry maps a match id to a match record; collect the matched text.
noun_phrases = [
    match['spanString']
    for sentence in matches['sentences']
    for match in sentence.values()
]

2021-07-05 18:38:50 INFO: Starting server with command: java -Xmx5G -cp C:\Users\s161158\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-2e15724b8064491b.props -preload -outputFormat serialized


In [4]:
noun_phrases

['An Ethiopian foreign national',
 'eMzinoni',
 'drugs at his tuck shop',
 'drugs',
 'his tuck shop',
 'Friday, 6']

The authors only use single words as head words. In the case of the first sentence of the first article, the head word of 'An Ethiopian foreign national' is 'national'.
Therefore, we will have to reduce the extracted noun phrases to single nouns.

In [5]:
# In order to find the head nouns, we need POS-tagging and dependency parsing:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp(sent)
# For every sentence, collect the ids of the words that govern at least one other
# word. `word.head` is the 1-based id of the governing word and 0 for the root;
# 0 is skipped because looking it up as `words[word.head - 1]` lands on index -1,
# i.e. the last token of the sentence, which is not a real head.
head_ids_per_sentence = [
    {word.head for word in sentence.words if word.head > 0}
    for sentence in doc.sentences
]
# Keep only the candidate heads that are nouns (xpos contains "NN"), in sentence
# order. Matching on the word id instead of the text prevents a non-head noun
# from slipping in just because it shares its text with a head elsewhere.
heads = [
    word.text
    for sentence, head_ids in zip(doc.sentences, head_ids_per_sentence)
    for word in sentence.words
    if word.id in head_ids and "NN" in word.xpos
]

2021-07-05 18:39:55 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-07-05 18:39:55 INFO: Use device: cpu
2021-07-05 18:39:55 INFO: Loading: tokenize
2021-07-05 18:39:55 INFO: Loading: pos
2021-07-05 18:39:56 INFO: Loading: lemma
2021-07-05 18:39:56 INFO: Loading: depparse
2021-07-05 18:39:57 INFO: Done loading processors!


In [6]:
heads

['national', 'eMzinoni', 'drugs', 'shop', 'Friday', 'October']

In [7]:
noun_phrases

['An Ethiopian foreign national',
 'eMzinoni',
 'drugs at his tuck shop',
 'drugs',
 'his tuck shop',
 'Friday, 6']

In [None]:
# Finally, output the candidate head words that occur in at least one noun phrase.
# The result goes into a new name so `noun_phrases` stays intact and the cell can
# be re-run; replacing phrases in the list while looping would lose track of
# phrases that contain more than one head (e.g. 'drugs at his tuck shop').
# NOTE(review): `head in phrase` is a substring test - confirm whether matching
# on whole tokens is wanted instead.
head_words = [
    head
    for head in dict.fromkeys(heads)  # drop duplicates, keep first-seen order
    if any(head in phrase for phrase in noun_phrases)
]
head_words


## 1.2 Trigger extraction

In [None]:
# 1) get verbs

# 2) get 'eventive' nouns from the two wordnet synsets

## 1.3 Drop head words that are not related to at least one trigger

In [None]:
# some more code

## 1.4 Extract head word attributes

In [None]:
# some more code