In [1]:
# imports
import warnings

from json import load
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize
from stanza import Pipeline
from stanza.server import CoreNLPClient

## 1.1 Head word extraction

In [2]:
def get_head_words(text, endpoint='8000'):
    """
    Extracts single-noun head words from an article.

    1) Extracts all noun phrases, tagged with 'NP' by the CoreNLPClient,
    2) Removes multiples (NPs that contain other NPS),
    3) Transforms multi-word NPs into single-word head nouns based on a simple Stanza Pipeline.
    4) Converts the list to a general sentence ID + word ID representation

    Parameters:
    text (string): Article

    Returns:
    list: Head word sentence IDs & word IDs
    """

    # extract all noun phrases from the article
    with CoreNLPClient(properties='corenlp_server-2e15724b8064491b.props', endpoint=f'http://localhost:{endpoint}', memory='8G', be_quiet=True) as client:
        matches = client.tregex(text=text, pattern='NP')
    # reformat the data structure into a list of lists
    noun_phrases = [[text, sent, begin, end] for text, sent, begin, end in
                    zip([sentence[match_id]['spanString'] for sentence in matches['sentences'] for match_id in sentence],
                        [sentence[match_id]['sentIndex'] for sentence in matches['sentences'] for match_id in sentence],
                        [sentence[match_id]['characterOffsetBegin'] for sentence in matches['sentences'] for match_id in sentence],
                        [sentence[match_id]['characterOffsetEnd'] for sentence in matches['sentences'] for match_id in sentence])]

    # remove 'multiples'
    _ = [np1 for np1 in noun_phrases for np2 in noun_phrases if (np1 != np2) and (np1[1] == np2[1]) and ((np1[2] <= np2[2]) and (np1[3] >= np2[3]))]
    noun_phrases = [np for np in noun_phrases if np not in _]

    # convert multi-word noun phrases into single-word head nouns
    doc = nlp(text)
    head_words = [[np[2] + np[0].find(word.text), np[2] + np[0].find(word.text) + len(word.text)] for np in noun_phrases for sent in nlp(np[0]).sentences for word in sent.words if word.deprel == "root"]

    # convert the word indices into sentence ID and word ID pairs
    head_words = [[sent.id, word.id] for sent in doc.sentences for word in sent.words for [i, j] in head_words if word.start_char == i and word.end_char == j]
    
    return head_words

## 1.2 Trigger extraction

In [10]:
def get_triggers(text, hw):
    """
    Extracts head words that have either a verb or an eventive noun as its subject/object/preposition.

    1) Extract all verbs
    2) Get eventive nouns based on the Wordnet Synsets indicated by the authors
    3) Combine 1 and 2 into a list of trigger candidates
    4) Finds all head words that have a trigger as its subject/object/preposition.
    5) Looks for transitive triggers, and extracts the correct subject from the root verb.

    Parameters:
    text (string): article
    head_words (list): list of head word IDs

    Returns:
    list: Triplet lists of head word, trigger, and relation - e.g., [[4, 6, 'nsubj']]
    """

    # parse the text and extract all verbs and lemmas
    doc = nlp(text)
    verbs = [[sent.id, word.id] for sent in doc.sentences for word in sent.words if word.upos == 'VERB']
    lemmas = [[word.lemma, sent.id, word.id] for sent in doc.sentences for word in sent.words]

    # generate a list of eventive nouns
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wn_evnouns = list(set([w for s in wn.synset('event.n.01').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
        wn_evnouns += list(set([w for s in wn.synset('act.n.02').closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))

    # generates a list of trigger candidates based on the verbs and eventive nouns in the text
    candidates = verbs + [[s,w] for [lemma, s, w] in lemmas if lemma in wn_evnouns and [s, w] not in verbs]

    # finds all head word - trigger dyads and their syntactic relation
    triggers = []
    for sent in doc.sentences:
        for word in sent.words:
            if ([sent.id, word.head] in candidates) and ([sent.id, word.id] in hw):
                if ("IN" in word.xpos):
                    triggers.append([sent.id, word.id, word.head, word.xpos])
                elif any(_ in word.deprel for _ in ["subj", "obj"]):
                    triggers.append([sent.id, word.id, word.head, word.deprel])

    for [sent, word] in verbs:
        head = doc.sentences[sent].words[word-1].head
        if head != 0:
            for [_, noun, verb, rel] in triggers:
                if verb == head and ('subj' in rel):
                    triggers.append([sent, noun, word, rel])


    return triggers

## 1.3 Attribute extraction

## 1.4 Add coreference information

# Testing section

In [3]:
# general purpose testing variables
nlp = Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse', verbose=False)  # stanza pipeline
with open('database_dump_drugs/0.json') as file:
    articles = list(set([a['article_content'] for a in load(file)]))
article = articles[0]  # sample article

In [5]:
# testing 1.1
head_words = get_head_words(text=article)
head_words

[[0, 2],
 [0, 11],
 [0, 15],
 [0, 17],
 [0, 26],
 [1, 3],
 [1, 8],
 [1, 11],
 [1, 12],
 [1, 15],
 [1, 22],
 [1, 24],
 [1, 27],
 [1, 29],
 [1, 32],
 [1, 36],
 [1, 41],
 [2, 2],
 [2, 5],
 [2, 8],
 [2, 10],
 [3, 2],
 [3, 7],
 [3, 11],
 [3, 13],
 [4, 1],
 [4, 4],
 [4, 7],
 [5, 1],
 [5, 5],
 [5, 8],
 [5, 12],
 [5, 16],
 [5, 18],
 [6, 2],
 [6, 8],
 [6, 10],
 [7, 2],
 [7, 10],
 [7, 15],
 [7, 19],
 [8, 1],
 [8, 5],
 [8, 9],
 [8, 11],
 [8, 14],
 [9, 1],
 [9, 5],
 [9, 7],
 [9, 11],
 [9, 13],
 [10, 3],
 [10, 6],
 [10, 8],
 [10, 11],
 [10, 13],
 [10, 18],
 [10, 23],
 [11, 2],
 [11, 6],
 [11, 9],
 [11, 14],
 [11, 19],
 [11, 21],
 [12, 2],
 [12, 7],
 [12, 11],
 [12, 17],
 [12, 20],
 [12, 22],
 [13, 3],
 [13, 7],
 [13, 11],
 [13, 17],
 [14, 3],
 [14, 6],
 [14, 10],
 [14, 14],
 [14, 16],
 [14, 19],
 [14, 23],
 [14, 29],
 [15, 3],
 [15, 8],
 [15, 10],
 [15, 13],
 [15, 18],
 [15, 22]]

In [11]:
# testing 1.2
tr = get_triggers(text=article, hw=head_words)
tr

[[2, 2, 3, 'nsubj'],
 [3, 2, 5, 'nsubj'],
 [3, 7, 9, 'nsubj'],
 [4, 1, 2, 'nsubj'],
 [5, 1, 3, 'nsubj'],
 [5, 5, 3, 'obj'],
 [5, 8, 11, 'nsubj'],
 [5, 12, 11, 'obj'],
 [5, 18, 14, 'obj'],
 [6, 2, 4, 'nsubj'],
 [6, 8, 6, 'obj'],
 [7, 2, 3, 'nsubj'],
 [7, 15, 16, 'nsubj'],
 [7, 19, 21, 'nsubj'],
 [8, 1, 3, 'nsubj'],
 [8, 5, 3, 'obj'],
 [9, 1, 2, 'nsubj'],
 [10, 3, 5, 'nsubj'],
 [10, 6, 5, 'obj'],
 [10, 13, 16, 'nsubj'],
 [10, 18, 16, 'obj'],
 [10, 23, 25, 'nsubj'],
 [11, 2, 3, 'nsubj'],
 [11, 6, 7, 'nsubj'],
 [11, 9, 7, 'obj'],
 [11, 21, 23, 'nsubj'],
 [12, 20, 18, 'nsubj'],
 [13, 3, 4, 'nsubj'],
 [13, 7, 8, 'nsubj'],
 [13, 11, 8, 'obj'],
 [13, 17, 15, 'obj'],
 [14, 3, 1, 'obj'],
 [14, 16, 18, 'nsubj'],
 [14, 19, 21, 'nsubj'],
 [14, 23, 21, 'obj'],
 [14, 29, 27, 'obj'],
 [15, 3, 4, 'nsubj'],
 [15, 13, 15, 'nsubj'],
 [15, 22, 24, 'nsubj'],
 [0, 6, 27, 'nsubj'],
 [3, 2, 9, 'nsubj'],
 [3, 3, 9, 'nsubj'],
 [5, 2, 11, 'nsubj'],
 [5, 1, 11, 'nsubj'],
 [5, 2, 11, 'nsubj'],
 [5, 1, 11, 'nsubj'],

In [13]:
# for demonstration purposes:
doc = nlp(article)
print(f'Input text: {article}')
for x in range(len(tr)):
    print(f'Head word: {doc.sentences[tr[x][0]].words[tr[x][1]-1].text}\t Trigger: {doc.sentences[tr[x][0]].words[tr[x][2]-1].text}\t Relation: {tr[x][3]}')

Input text: An 18-year-old man was found to be in possession of 16 cannabis deals by Garda, last week’s sitting of Portlaoise District Court heard.
Before the court was Juraj Haenkovic (18) of 5 Carmody Way, Fairgreen, Portlaoise who was charged with possession of drugs, and possession of drugs for unlawful sale or supply on June 24 at 5 Carmody Way.
The court heard the value of the drugs was €352.
Defence solicitor Barry Fitzgerald said the defendant had moved to Ireland 14 years ago. He lives at home with his mother.
Mr Fitzgerald told the court that his client had recently finished school, getting a graded Leaving Certificate.
The accused now hopes to get an apprenticeship in carpentry.
The defendant started using “weed” about a year ago and his “habit escalated”, Mr Fitzgerald said.
Mr Fitzgerald told the court that the peer group the defendant is in uses drugs. He added that the amount of drugs was for the group of friends.
While the defendant still has difficulties in relation to

IndexError: list index out of range