Setup
Change into `../stanford-corenlp-4.2.0` and start the CoreNLP server (the notebook connects to it at `http://localhost:9000`) by running:

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer`

In [17]:
from os import listdir
from xml.dom.minidom import parse
# import nltk CoreNLP module (just once)
from nltk.parse.corenlp import CoreNLPDependencyParser
# connect to the CoreNLP server started in the Setup step (just once);
# the URL must match the host/port that server is listening on
corenlp_parser = CoreNLPDependencyParser(url="http://localhost:9000")


# DDI

In [75]:
def get_offsets(word, s, begin=0):
    '''
    Task:
        Given a word and a sentence, returns the starting and ending index of the
        first occurrence of the word found at or after position `begin`.
    
    Input:
        word: word to find offsets for
        s: sentence containing the word
        begin: index in s where the search starts (default 0, i.e. the whole
               sentence). Pass the end of the previous match + 1 to locate
               repeated words one after another.
    
    Output:
        Returns a tuple containing the start and end offset (end is inclusive).
        Returns (-1, -1) if the word is empty or does not occur in s[begin:].
    '''
    # An empty word "matches" everywhere; report it as not found instead of
    # returning the meaningless pair (0, -1).
    if not word:
        return -1, -1

    start = s.find(word, begin)
    # str.find signals failure with -1; do not derive an end offset from it.
    if start < 0:
        return -1, -1

    return start, start + len(word) - 1

In [97]:
def preprocess(s):
    '''
    Task:
        Helper function that makes a sentence safe to send to CoreNLP.

    Input:
        s: string containing the text for one sentence

    Output:
        Returns a copy of s where every '%' has been substituted by the
        placeholder '<percentage>'.
    '''
    # A raw '%' makes the CoreNLP server answer with HTTP 500, so each one is
    # swapped for a placeholder: cut the text at every '%' and glue the pieces
    # back together with the placeholder in between.
    pieces = s.split("%")
    return "<percentage>".join(pieces)

In [109]:
def analyze(s):
    '''
    Task:
        Given one sentence, sends it to CoreNLP to obtain the tokens, tags,
        and dependency tree. It also adds the start/end offsets to each token.
    
    Input:
        s: string containing the text for one sentence
    
    Output:
        Returns the nltk DependencyGraph object produced by CoreNLP, enriched with
        token offsets: every node that has a word gets a 'start' and an 'end'
        key (end is inclusive).

    Notes:
        Offsets are relative to the text returned by preprocess(s), not to s
        itself, so they are shifted after any character preprocess() expands.
        NOTE(review): confirm whether callers compare these offsets against
        offsets of the original text.
    '''
    text = preprocess(s)
    tree, = corenlp_parser.raw_parse(text)

    # Walk the tokens in sentence order (node address = token position) and
    # search each word only after the previous match. Searching from index 0
    # every time would give all repetitions of a word ("of", "the", ...) the
    # offsets of its first occurrence.
    cursor = 0
    for address in sorted(tree.nodes):
        node = tree.nodes[address]
        word = node['word']
        if not word:
            # the artificial root node has no word and gets no offsets
            continue

        start = text.find(word, cursor)
        if start >= 0:
            end = start + len(word) - 1
            cursor = end + 1
        else:
            # The token text was not found after the cursor (presumably the
            # parser altered it - TODO confirm). Fall back to a search over
            # the whole sentence and leave the cursor where it is.
            start, end = get_offsets(word, text)

        node['start'] = start
        node['end'] = end

    return tree

In [110]:
# Smoke test for analyze(): parse one example sentence and dump every node of
# the resulting graph, including the 'start'/'end' offsets added by analyze().
# An alternative example sentence (without '%') is kept commented out:
# tree = analyze("Interaction between oxytocin and antidiuretic hormone and its effect on the milk secretion by alveoli of the mammary gland of lactating rats.")
# print(tree.nodes.items())
# This sentence contains '%', so it also exercises the preprocess() workaround.
s2 = "Co-administration of oral ketoconazole 200 mg twice daily increased retapamulin geometric mean AUC(0-24) and Cmax by 81% after topical application of retapamulin ointment, 1% on the abraded skin of healthy adult males. "
tree = analyze(s2)
print(tree.nodes.items())

dict_items([(0, {'address': 0, 'word': None, 'lemma': None, 'ctag': 'TOP', 'tag': 'TOP', 'feats': None, 'head': None, 'deps': defaultdict(<class 'list'>, {'ROOT': [1]}), 'rel': None}), (1, {'address': 1, 'word': 'Co-administration', 'lemma': 'co-administration', 'ctag': 'NN', 'tag': 'NN', 'feats': '_', 'head': 0, 'deps': defaultdict(<class 'list'>, {'nmod': [4], 'dep': [13], 'punct': [41]}), 'rel': 'ROOT', 'start': 0, 'end': 16}), (2, {'address': 2, 'word': 'of', 'lemma': 'of', 'ctag': 'IN', 'tag': 'IN', 'feats': '_', 'head': 4, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'case', 'start': 18, 'end': 19}), (4, {'address': 4, 'word': 'ketoconazole', 'lemma': 'ketoconazole', 'ctag': 'NN', 'tag': 'NN', 'feats': '_', 'head': 1, 'deps': defaultdict(<class 'list'>, {'case': [2], 'amod': [3]}), 'rel': 'nmod', 'start': 26, 'end': 37}), (3, {'address': 3, 'word': 'oral', 'lemma': 'oral', 'ctag': 'JJ', 'tag': 'JJ', 'feats': '_', 'head': 4, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'amod

In [111]:
def check_interaction(analysis, entities, e1, e2):
    '''
    Task:
        Decide whether a sentence expresses a drug-drug interaction (DDI)
        between two given drugs.

    Input:
        analysis: a DependencyGraph object with all sentence information
        entities: a dictionary with all entities in the sentence
                  (entity id -> offsets)
        e1, e2: ids of the two entities to be checked

    Output:
        Returns the type of interaction ('effect', 'mechanism', 'advice', 'int')
        between e1 and e2 expressed by the sentence, or None if no interaction
        is described.

        Placeholder implementation: no rules are applied yet, so the answer is
        always None.
    '''
    # No detection rules implemented yet: every pair is reported as "no DDI".
    ddi_type = None
    return ddi_type

In [112]:
outf = "output.txt"
datadir = "../../labAHLT/data/devel"

# Open the output file once for the whole run. print(..., file=...) needs a
# writable file object; passing the file *name* (a str) raises AttributeError
# as soon as the first interaction is reported.
with open(outf, "w") as out:
    # process each file in directory
    for f in listdir(datadir):
        # parse XML file, obtaining a DOM tree
        tree = parse(datadir + "/" + f)
        # process each sentence in the file
        sentences = tree.getElementsByTagName("sentence")
        for s in sentences:

            sid = s.attributes["id"].value      # get sentence id
            stext = s.attributes["text"].value  # get sentence text

            # CoreNLP throws an error for empty sentences; whitespace-only
            # text has nothing to parse either, so skip both.
            if not stext.strip():
                continue

            # load sentence entities into a dictionary:
            # entity id -> pieces of the "charOffset" attribute split on "-"
            # (kept as strings, exactly as they appear in the XML)
            entities = {}
            ents = s.getElementsByTagName("entity")
            for e in ents:
                eid = e.attributes["id"].value
                entities[eid] = e.attributes["charOffset"].value.split("-")

            # Tokenize, tag, and parse sentence
            analysis = analyze(stext)

            # for each pair in the sentence, decide whether it is DDI and its type
            pairs = s.getElementsByTagName("pair")
            for p in pairs:
                id_e1 = p.attributes["e1"].value
                id_e2 = p.attributes["e2"].value
                ddi_type = check_interaction(analysis, entities, id_e1, id_e2)
                # one output line per detected interaction: sid|e1|e2|type
                if ddi_type is not None:
                    print(sid + "|" + id_e1 + "|" + id_e2 + "|" + ddi_type, file=out)

                
                

In [108]:
#StanfordCoreNLP throws error 500 when a sentence contains '%':
#    Illegal hex characters in escape (%) pattern - Error at index 0 in: " a"
#  java.base/java.net.URLDecoder.decode(URLDecoder.java:232)
#  java.base/java.net.URLDecoder.decode(URLDecoder.java:142)
