Setup
cd into ../stanford-corenlp-4.2.0 and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer`

In [53]:
from os import listdir
from xml.dom.minidom import parse
# import nltk CoreNLP module (just once)
from nltk.parse.corenlp import CoreNLPDependencyParser
# connect to your CoreNLP server (just once)
corenlp_parser = CoreNLPDependencyParser(url="http://localhost:9000")

from evaluator import *

# DDI

In [54]:
def get_offsets(word, s):
    '''
    Task:
        Given a word and sentence, returns its starting end ending index in the sentence.
    
    Input:
        word: word to find offsets for
        s: sentence containing the word
    
    Output:
        Returns a tuple containing the start and end offset.
    '''
    start = s.find(word)
    end = start + len(word) - 1
    return start, end

In [55]:
def preprocess(s):
    '''
    Task:
        Helper function
    '''
    # because otherwise CoreNLP throws 500
    return s.replace("%", "<percentage>")

In [56]:
def analyze(s):
    '''
    Task:
        Given one sentence, sends it to CoreNLP to obtain the tokens, tags,
        and dependency tree. It also adds the start/end offsets to each token.
    
    Input:
        s: string containing the text for one sentence
    
    Output:
        Returns the nltk DependencyGraph object produced by CoreNLP, enriched with token  offsets.

    '''
    s = s.replace("%", "<percentage>")
    tree, = corenlp_parser.raw_parse(s)
    for n in tree.nodes.items():
        node = n[1]
        if node['word']:
            start, end = get_offsets(node['word'], s)
            node['start'] = start
            node['end'] = end
            
    return tree

In [57]:
def find_entity_in_tree(entity, tree):
    for n in tree.nodes.items():
        node = n[1]
        try:
            if node["word"] and (node["start"] == int(entity[0])) and (node["end"] == int(entity[1])):
                return node
        except:
            continue
    # PROBLEM: Two-word entities?
    # ValueError: invalid literal for int() with base 10: '34;50'
    return None

In [78]:
def check_interaction(analysis, entities, e1, e2):
    '''
    Task:
        Decide whether a sentence is expressing a DDI between two drugs.
    
    Input:
        analysis: a DependencyGraph object with all sentence information
        entities: a list of all entities in the sentence (id and offsets)
        e1, e2: ids of the two entities to be checked
    
    Output:
        Returns the type of interaction ('effect', 'mechanism', 'advice', 'int') between e1 and e2
        expressed by the sentence, or 'None' if no interaction is described.
    '''
    tree = analysis.tree()
    
    entity1 = entities[e1]
    entity2 = entities[e2]
    
    e1_node = find_entity_in_tree(entity1, analysis)
    e2_node = find_entity_in_tree(entity2, analysis)
    
    e1_tag = e1_node['tag'] if e1_node else None
    e2_tag = e2_node['tag'] if e2_node else None
    
    h_e1 = e1_node['head'] if e1_node else None      # position of the head
    head_e1 = analysis.nodes[h_e1]['lemma'] if e1_node else None
    tag_head_e1 = analysis.nodes[h_e1]['tag'][0].lower() if e1_node else None
    
    h_e2 = e2_node['head'] if e2_node else None      # position of the head
    head_e2 = analysis.nodes[h_e2]['lemma'] if e2_node else None
    tag_head_e2 = analysis.nodes[h_e2]['tag'][0].lower() if e2_node else None
    
    if head_e1 == head_e2 and tag_head_e1 == 'v' and tag_head_e2 == 'v':
        under_same_verb = True
    else:
        under_same_verb = False
        
    if head_e1 == head_e2 and head_e1 != None and head_e2 != None:
        under_same_word = True
    else:
        under_same_word = False
    
    if head_e1 == entity2 and head_e1 != None:
        e1_under_e2 = True
    else: 
        e1_under_e2 = False
    
    if head_e2 == entity1 and head_e2 != None:
        e2_under_e1 = True
    else:
        e2_under_e1 = False
        
    # --- RULES ---
    if head_e1  == 'response' and head_e2 in ['man', 'alcohol', 'steroid']:
        return 'effect'
    if head_e1 == 'administer' and under_same_word:
        return 'advise'
    if head_e1 in ['response', 'effect', 'enhance', 'diminish']:
        return 'effect'
    if head_e1 in ['concentration', 'presence', 'dose', 'absorption', 'interfere']:
        return 'mechanism'
    if head_e1 in ['interact', 'interaction']: # removed action & agent -> improved 2.7% points
        return 'int'
    if head_e1 in ['co-administration', 'take', 'coadminister', 'treatment', 'therapy', 'tell']:
        return 'advise'
    if head_e2 in ['effect', 'alcohol', 'action', 'use', 'combination', 'inhibitor']:
        return 'effect'
    if head_e2 in ['metabolism', 'concentration', 'clearance', 'level', 'absorption', 'dose']:
        return 'mechanism'
    if head_e2 in ['interact', 'interaction']:
        return 'int'
    #if under_same_word:
     #   return 'mechanism' # or "int"  ## --> removing improves f1 to 4.5% points
    #if e1_under_e2:
     #   return 'effect'
    

    return None

In [59]:
datadir = "../../labAHLT/data/devel"

# process each file in directory
for f in listdir (datadir):
    # parse XML file , obtaining a DOM tree
    tree = parse ( datadir + "/" + f)
    # process each sentence in the file
    sentences = tree.getElementsByTagName("sentence")
    for s in sentences:

        sid = s.attributes ["id"].value # get sentence id
        stext = s.attributes ["text"].value # get sentence text
        
        # CoreNLP throws error for empty sentences
        if len(stext) == 0:
            continue

        # load sentence entities into a dictionary
        entities = {}
        ents = s.getElementsByTagName("entity")
        for e in ents:
            eid = e . attributes ["id"].value
            entities[eid] = e.attributes["charOffset"].value.split("-")

        # Tokenize, tag, and parse sentence
        analysis = analyze(stext)

        # for each pair in the sentence , decide whether it is DDI and its type
        pairs = s.getElementsByTagName("pair")
        for p in pairs:
            id_e1 = p.attributes["e1"].value
            id_e2 = p.attributes["e2"].value
            ddi_type = check_interaction(analysis, entities , id_e1 , id_e2 )
            if ddi_type != None:
                pass
                #print(sid +"|"+ id_e1 +"|"+ id_e2 +"|"+ ddi_type)
         

In [79]:
datadir = "../../labAHLT/data/devel"
outf = open('results.txt', "w")

# process each file in directory
for f in listdir(datadir):
    
    # parse XML file , obtaining a DOM tree
    tree = parse(datadir + "/" + f)
    # process each sentence in the file
    sentences = tree.getElementsByTagName("sentence")
    for s in sentences:

        sid = s.attributes["id"].value # get sentence id
        stext = s.attributes["text"].value # get sentence text
        
        # CoreNLP throws error for empty sentences
        if len(stext) == 0:
            continue

        # load sentence entities into a dictionary
        entities = {}
        ents = s.getElementsByTagName("entity")
        for e in ents:
            eid = e.attributes["id"].value
            entities[eid] = e.attributes["charOffset"].value.split("-")

        # Tokenize, tag, and parse sentence
        analysis = analyze(stext)

        # for each pair in the sentence , decide whether it is DDI and its type
        pairs = s.getElementsByTagName("pair")
        for p in pairs:
            id_e1 = p.attributes["e1"].value
            id_e2 = p.attributes["e2"].value
            ddi_type = check_interaction(analysis, entities, id_e1, id_e2)
            if ddi_type != None:
                entity = sid +"|"+ id_e1 +"|"+ id_e2 +"|"+ ddi_type
            outf.write(entity + '\n')
outf.close()

In [80]:
# TODO: Check meaning -> Ignoring duplicated entity in system predictions file: ' + line

evaluate('DDI', datadir, 'results.txt')

Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-DrugBank.d14.s0.e1|effect
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d14.s0|DDI-DrugBank.d14.s0.e0|DDI-Dr

Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-DrugBank.d81.s11.e4|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d81.s11|DDI-DrugBank.d81.s11.e1|DDI-

Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.e1|DDI-DrugBank.d446.s0.e13|int
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d446.s0|DDI-DrugBank.d446.s0.

In [60]:
#StanfordCoreNLP throws error 500 when a sentence contains '%':
#    Illegal hex characters in escape (%) pattern - Error at index 0 in: " a"
#  java.base/java.net.URLDecoder.decode(URLDecoder.java:232)
#  java.base/java.net.URLDecoder.decode(URLDecoder.java:142)