In [14]:
# imports
import warnings

from itertools import combinations
from json import load
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from stanza import Pipeline
from stanza.server import CoreNLPClient

## 1.1 Head word extraction

In [2]:
def get_head_words(text, endpoint='8000'):
    """
    Extracts single-noun head words from an article.

    1) Extracts all noun phrases, tagged with 'NP' by the CoreNLPClient,
    2) Removes multiples (NPs that contain other NPS),
    3) Transforms multi-word NPs into single-word head nouns based on a simple Stanza Pipeline.
    4) Converts the list to a general sentence ID + word ID representation

    Parameters:
    text (string): Article

    Returns:
    list: Head word sentence IDs & word IDs
    """

    # extract all noun phrases from the article
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', endpoint=f'http://localhost:{endpoint}', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=text, pattern='NP')
    # reformat the data structure into a list of lists
    noun_phrases = [[text, sent, begin, end] for text, sent, begin, end in
                    zip([sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence],
                        [sentence[match_id]['sentIndex'] for sentence in matches['sentences'] for match_id in sentence],
                        [sentence[match_id]['characterOffsetBegin'] for sentence in matches['sentences'] for match_id in sentence],
                        [sentence[match_id]['characterOffsetEnd'] for sentence in matches['sentences'] for match_id in sentence])]

    # remove 'multiples'
    _ = [np1 for np1 in noun_phrases for np2 in noun_phrases if (np1 != np2) and (np1[1] == np2[1]) and ((np1[2] <= np2[2]) and (np1[3] >= np2[3]))]
    noun_phrases = [np for np in noun_phrases if np not in _]

    # convert multi-word noun phrases into single-word head nouns
    doc = nlp(text)
    head_words = [[np[2] + np[0].find(word.text), np[2] + np[0].find(word.text) + len(word.text)] for np in noun_phrases for sent in nlp(np[0]).sentences for word in sent.words if word.deprel == "root"]

    # convert the word indices into sentence ID and word ID pairs
    head_words = [[sent.id, word.id] for sent in doc.sentences for word in sent.words for [i, j] in head_words if word.start_char == i and word.end_char == j]
    
    return head_words

## 1.2 Trigger extraction

In [3]:
def get_triggers(text, hw):
    """
    Creates head word-trigger pairings.

    1) Extract all verbs
    2) Get eventive nouns based on the Wordnet Synsets indicated by the authors
    3) Combine 1 and 2 into a list of trigger candidates
    4) Finds all head words that have a trigger as its subject/object/preposition.
    5) Looks for transitive triggers, and extracts the correct subject from the root verb.

    Parameters:
    text (string): article
    hw (list): list of head word IDs

    Returns:
    list: List of head word, trigger, and relations - e.g., [[4, 6, 'nsubj']]
    """

    # parse the text and extract all verbs and lemmas
    doc = nlp(text)
    verbs = [[sent.id, word.id] for sent in doc.sentences for word in sent.words if word.upos == 'VERB']
    lemmas = [[word.lemma, sent.id, word.id] for sent in doc.sentences for word in sent.words]

    # generate a list of eventive nouns
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wn_evnouns = list(set([w for s in wn.synset('event.n.01').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
        wn_evnouns += list(set([w for s in wn.synset('act.n.02').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

    # generates a list of trigger candidates based on the verbs and eventive nouns in the text
    candidates = verbs + [[s,w] for [lemma, s, w] in lemmas if lemma in wn_evnouns and [s, w] not in verbs]

    # finds all head word - trigger dyads and their syntactic relation
    triggers = []
    for sent in doc.sentences:
        for word in sent.words:
            if ([sent.id, word.head] in candidates) and ([sent.id, word.id] in hw):
                if ("IN" in word.xpos):
                    triggers.append([sent.id, word.id, word.head, word.xpos])
                elif any(_ in word.deprel for _ in ["subj", "obj"]):
                    triggers.append([sent.id, word.id, word.head, word.deprel])

    for [sent, word] in verbs:
        head = doc.sentences[sent].words[word-1].head
        if head != 0:
            for [_, noun, verb, rel] in triggers:
                if verb == head and ('subj' in rel):
                    triggers.append([sent, noun, word, rel])


    return triggers

## 1.3 Attribute extraction

In [16]:
def get_attributes(text, hw):
    """
    Adds attributes to the head word-trigger pairings to create entity triplets

    1) Find modifier candidates (deprel == amod, vmod, or nmod)
    2) If a candidate is related to a head word, keep it as the adjective
    
    Parameters:
    text (string): article
    hw (list): list of head word IDs + trigger IDs + relations

    Returns:
    list: 'Triplet' lists of head word, trigger/relation, and attribute 
    [[sent_id, hw_id, trig_id, relation, attr_id]] - e.g., [[1, 4, 6, 'nsubj', 3]]
    """
    
    # get candidate modifiers
    doc = nlp(text)
    candidates = [[sent.id, word.head, word.id] for sent in doc.sentences for word in sent.words if any(_ in word.deprel for _ in ['nmod', 'amod', 'vmod'])]

    # keep only the modifier that is closest per head word
    _ = []
    for a, b, in combinations(candidates, 2):
        if (a[0], a[1]) == (b[0], b[1]) and (abs(a[1]-a[2]) > abs(b[1]-b[2])):
            _.append(a)
        elif (a[0], a[1]) == (b[0], b[1]) and (abs(a[1]-a[2]) == abs(b[1]-b[2])):
            _.append(b)
    candidates = [cand for cand in candidates if cand not in _]    
    
    for w in hw:
        for c in candidates:
            if c[0:2] == w[0:2]:
                w.append(c[2])
    
    return hw
    
    

## 1.4 Add coreference information

In [5]:
# parked for later, mention IDs are added to the text after annotating it with a CoreNLPClient
# simply a matter of figuring out which number exactly we need, and how we are going to 'merge' the corefering mentions

# Testing section

In [6]:
# general purpose testing variables
nlp = Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse', verbose=False)  # stanza pipeline
with open('database_dump_drugs/0.json') as file:
    articles = list(set([a['article_content'] for a in load(file)]))
article = articles[0]  # sample article

In [7]:
# testing 1.1
head_words = get_head_words(text=article)
head_words

[[0, 1],
 [0, 7],
 [0, 10],
 [0, 14],
 [0, 16],
 [0, 27],
 [0, 31],
 [0, 37],
 [0, 39],
 [0, 47],
 [0, 51],
 [0, 53],
 [0, 56],
 [0, 61],
 [1, 2],
 [1, 12],
 [1, 15],
 [1, 18],
 [1, 21],
 [1, 27],
 [1, 28],
 [1, 32],
 [1, 37],
 [2, 2],
 [2, 6],
 [2, 8],
 [3, 2],
 [3, 8],
 [3, 12],
 [3, 14],
 [3, 16],
 [3, 23],
 [3, 26],
 [3, 33],
 [3, 36],
 [3, 38],
 [3, 42],
 [3, 48],
 [4, 1],
 [4, 7],
 [4, 15],
 [4, 16],
 [4, 25],
 [5, 2],
 [5, 6],
 [5, 12],
 [6, 9],
 [6, 13],
 [6, 16],
 [6, 18],
 [6, 21],
 [6, 25],
 [6, 29],
 [6, 32],
 [6, 34],
 [6, 36],
 [6, 40],
 [6, 44],
 [6, 47],
 [6, 50],
 [6, 52],
 [6, 61],
 [6, 66],
 [7, 4],
 [7, 10],
 [7, 12],
 [7, 14],
 [7, 17],
 [7, 20],
 [8, 5],
 [8, 11],
 [8, 13],
 [8, 17],
 [8, 20],
 [8, 23],
 [8, 28],
 [8, 31],
 [8, 36],
 [9, 2],
 [9, 5],
 [9, 8],
 [9, 12],
 [9, 14],
 [9, 18],
 [9, 23],
 [9, 26],
 [10, 1],
 [10, 4],
 [10, 6],
 [10, 9],
 [10, 11],
 [10, 14],
 [10, 17],
 [10, 23],
 [10, 30],
 [10, 32],
 [10, 33],
 [10, 39],
 [11, 8],
 [11, 12],
 [11, 17]

In [8]:
# testing 1.2
tr = get_triggers(text=article, hw=head_words)
tr
# does not work properly, see issue #14

[[0, 7, 4, 'obj'],
 [0, 14, 20, 'nsubj'],
 [0, 27, 47, 'nsubj'],
 [1, 2, 3, 'nsubj'],
 [2, 2, 6, 'nsubj'],
 [2, 8, 9, 'nsubj'],
 [3, 2, 4, 'nsubj'],
 [3, 48, 44, 'obj'],
 [4, 1, 2, 'nsubj'],
 [4, 7, 5, 'obj'],
 [5, 2, 10, 'nsubj:pass'],
 [6, 21, 38, 'nsubj'],
 [6, 40, 38, 'obj'],
 [6, 44, 56, 'nsubj:pass'],
 [7, 4, 6, 'nsubj'],
 [7, 10, 8, 'obj'],
 [7, 17, 18, 'nsubj'],
 [7, 20, 18, 'obj'],
 [8, 5, 7, 'nsubj'],
 [8, 36, 33, 'obj'],
 [9, 5, 3, 'nsubj'],
 [9, 8, 13, 'nsubj'],
 [9, 14, 13, 'obj'],
 [9, 26, 27, 'nsubj'],
 [10, 1, 2, 'nsubj'],
 [10, 4, 2, 'obj'],
 [10, 9, 21, 'nsubj'],
 [10, 11, 10, 'obj'],
 [10, 14, 10, 'obj'],
 [10, 23, 21, 'obj'],
 [10, 33, 34, 'nsubj'],
 [11, 8, 10, 'nsubj'],
 [11, 12, 13, 'nsubj'],
 [12, 4, 11, 'nsubj'],
 [12, 13, 11, 'obj'],
 [12, 23, 18, 'obj'],
 [13, 1, 2, 'nsubj'],
 [13, 4, 2, 'obj'],
 [13, 10, 11, 'nsubj'],
 [13, 17, 14, 'obj'],
 [13, 22, 21, 'obj'],
 [13, 26, 29, 'nsubj:pass'],
 [14, 4, 5, 'nsubj'],
 [14, 10, 9, 'obj'],
 [15, 3, 4, 'nsubj'],
 [15

In [15]:
# testing 1.3
triplets = get_attributes(text=article, hw=tr)
triplets

[[0, 3, 2],
 [0, 7, 10],
 [0, 13, 12],
 [0, 14, 13],
 [0, 14, 16],
 [0, 14, 18],
 [0, 20, 21],
 [0, 27, 26],
 [0, 27, 31],
 [0, 37, 34],
 [0, 47, 46],
 [0, 51, 50],
 [0, 51, 53],
 [0, 56, 55],
 [0, 53, 56],
 [1, 6, 5],
 [1, 8, 6],
 [1, 12, 11],
 [1, 15, 14],
 [1, 12, 15],
 [1, 18, 21],
 [1, 27, 26],
 [1, 32, 30],
 [2, 2, 1],
 [2, 2, 5],
 [2, 13, 12],
 [2, 14, 13],
 [3, 8, 7],
 [3, 14, 13],
 [3, 8, 14],
 [3, 23, 20],
 [3, 26, 25],
 [3, 29, 28],
 [3, 23, 29],
 [3, 33, 32],
 [3, 33, 36],
 [3, 36, 38],
 [3, 42, 41],
 [3, 48, 45],
 [4, 2, 3],
 [4, 7, 9],
 [4, 15, 14],
 [5, 2, 6],
 [5, 10, 9],
 [6, 4, 3],
 [6, 6, 4],
 [6, 9, 13],
 [6, 13, 16],
 [6, 29, 28],
 [6, 25, 29],
 [6, 29, 32],
 [6, 32, 34],
 [6, 44, 43],
 [6, 47, 46],
 [6, 44, 47],
 [6, 50, 52],
 [6, 56, 54],
 [6, 60, 59],
 [7, 4, 3],
 [7, 12, 14],
 [8, 7, 2],
 [8, 11, 9],
 [8, 17, 16],
 [8, 20, 19],
 [8, 17, 20],
 [8, 20, 23],
 [8, 36, 34],
 [9, 5, 4],
 [9, 12, 11],
 [9, 18, 23],
 [10, 14, 13],
 [10, 17, 16],
 [10, 14, 17],
 [10, 27