In [1]:
import json
import stanza
from stanza.server import CoreNLPClient

# 3 - Entity Representation
"Entities are represented as a triple containing: a head word <i>h</i>, a list <i>A</i> of attribute relations and a list <i>T</i> of trigger relations"

In [9]:
def create_pipeline(lang='en', processors='tokenize,pos,ner,lemma,depparse'):
    """Download the Stanza models for ``lang`` and build a pipeline from them.

    Args:
        lang: Language code passed to both ``stanza.download`` and
            ``stanza.Pipeline``. Defaults to ``'en'``.
        processors: Comma-separated processor list passed to both calls.
            Defaults to ``'tokenize,pos,ner,lemma,depparse'``.

    Returns:
        The object returned by ``stanza.Pipeline``; call it on raw text to
        obtain an annotated document.
    """
    # Share one set of options so the models that are downloaded always match
    # the processors the pipeline is asked to load.
    stanza_options = {'processors': processors, 'logging_level': 'WARN'}
    stanza.download(lang, **stanza_options)
    return stanza.Pipeline(lang, **stanza_options)

def sample_article(path='database_dump_drugs/0.json'):
    """Return the text of the first article stored in a JSON dump.

    Args:
        path: Location of the JSON dump to read. The top-level JSON value is
            indexed with ``[0]`` and the result with ``['article_content']``.
            Defaults to ``'database_dump_drugs/0.json'``.

    Returns:
        The ``'article_content'`` value of the first entry in the dump.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        IndexError, KeyError: If the dump is empty or the first entry has no
            ``'article_content'`` key.
    """
    # The context manager closes the handle even if parsing fails; the explicit
    # encoding keeps decoding independent of the platform's default locale.
    with open(path, encoding='utf-8') as dump_file:
        articles = json.load(dump_file)
    return articles[0]['article_content']


In [10]:
# Build the Stanza pipeline (create_pipeline also triggers the model download).
pipeline = create_pipeline()
# Load the text of the first article from the JSON dump.
article = sample_article()
# Run every configured processor over the article; `doc` holds the annotations.
doc = pipeline(article)

## 3.1 Head word extraction
The paper states that "Head words are extracted from noun phrases."  
Noun phrases are defined as: "A word or group of words containing a noun and functioning in a sentence as subject, object, or prepositional object."  
The authors only use single words as head words. So, in the case of the first sentence of the first article, the head word of 'An Ethiopian foreign national' is 'national'.  
  
To do this in Python, we use Stanza's dependency parser, following https://stanfordnlp.github.io/stanza/depparse.html

In [11]:
# Display the parsed document. The output lists, per token, fields such as
# id, text, lemma, upos, xpos, feats, head, deprel and ner.
# NOTE(review): this prints the whole document; consider showing only the
# first sentence to keep the cell output short.
doc

[
  [
    {
      "id": 1,
      "text": "An",
      "lemma": "a",
      "upos": "DET",
      "xpos": "DT",
      "feats": "Definite=Ind|PronType=Art",
      "head": 4,
      "deprel": "det",
      "start_char": 0,
      "end_char": 2,
      "ner": "O"
    },
    {
      "id": 2,
      "text": "Ethiopian",
      "lemma": "Ethiopian",
      "upos": "ADJ",
      "xpos": "JJ",
      "feats": "Degree=Pos",
      "head": 4,
      "deprel": "amod",
      "start_char": 3,
      "end_char": 12,
      "ner": "S-NORP"
    },
    {
      "id": 3,
      "text": "foreign",
      "lemma": "foreign",
      "upos": "ADJ",
      "xpos": "JJ",
      "feats": "Degree=Pos",
      "head": 4,
      "deprel": "amod",
      "start_char": 13,
      "end_char": 20,
      "ner": "O"
    },
    {
      "id": 4,
      "text": "national",
      "lemma": "national",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "head": 6,
      "deprel": "nsubj:pass",
      "start_char": 21,
      "en