# DDI-ML. Relation Extraction

Setup
cd into ../stanford-corenlp-4.2.0 and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer`

In [1]:
from os import listdir
from xml.dom.minidom import parse
# import nltk CoreNLP module (just once)
from nltk.parse.corenlp import CoreNLPDependencyParser
# connect to your CoreNLP server (just once)
corenlp_parser = CoreNLPDependencyParser(url="http://localhost:9000")

from evaluator import *

## Feature Extractor

In [5]:
def get_offsets(word, s):
    '''
    Task:
        Given a word and sentence, returns its starting end ending index in the sentence.
    
    Input:
        word: word to find offsets for
        s: sentence containing the word
    
    Output:
        Returns a tuple containing the start and end offset.
    '''
    start = s.find(word)
    end = start + len(word) - 1
    return start, end

In [6]:
def preprocess(s):
    '''
    Task:
        Helper function
    '''
    # because otherwise CoreNLP throws 500
    return s.replace("%", "<percentage>")

In [7]:
def analyze(s):
    '''
    Task:
        Given one sentence, sends it to CoreNLP to obtain the tokens, tags,
        and dependency tree. It also adds the start/end offsets to each token.
    
    Input:
        s: string containing the text for one sentence
    
    Output:
        Returns the nltk DependencyGraph object produced by CoreNLP, enriched with token  offsets.

    '''
    s = s.replace("%", "<percentage>")
    tree, = corenlp_parser.raw_parse(s)
    for n in tree.nodes.items():
        node = n[1]
        if node['word']:
            start, end = get_offsets(node['word'], s)
            node['start'] = start
            node['end'] = end
            
    return tree

In [152]:
def find_entity_in_tree(eid, entities, tree):
    start_e1 = entities[eid][0]
    end_e1 = entities[eid][1]
    
    for n in tree.nodes.items():
        node = n[1]
        if node['word'] and (node['start'] == int(start_e1) and node['end'] == int(end_e1)):
            return node
    
    # TODO: handle two-word entities like "beta-endorphin"
    # We can extract here word, lemma, tag

In [154]:
def find_head(tree, entity):
    for n in tree.nodes.items():
            node = n[1]
            if  node['address'] == entity['head']:
                return node
    # We can extract here word, lemma, tag

In [157]:
e = find_entity_in_tree(id_e2, entities, analysis)
h = find_head(analysis, e)
print(e['word'])
print(e)
print(h)

morphine
{'address': 11, 'word': 'morphine', 'lemma': 'morphine', 'ctag': 'NN', 'tag': 'NN', 'feats': '_', 'head': 9, 'deps': defaultdict(<class 'list'>, {'cc': [10]}), 'rel': 'conj', 'start': 57, 'end': 64}
{'address': 9, 'word': 'endorphin', 'lemma': 'endorphin', 'ctag': 'JJ', 'tag': 'JJ', 'feats': '_', 'head': 16, 'deps': defaultdict(<class 'list'>, {'obl:npmod': [7], 'punct': [8, 12], 'conj': [11], 'advmod': [13]}), 'rel': 'ccomp', 'start': 43, 'end': 51}


In [77]:
# file for initial checks
file = '/Users/mponsclo/Documents/Master/labAHLT/data/train/3155550.xml'
tree = parse(file)
sentences = tree.getElementsByTagName("sentence")
for s in sentences:
    sid = s.attributes["id"].value
    stext = s.attributes["text"].value
    
    entities = {}
    ents = s.getElementsByTagName("entity")
    for e in ents:
        eid = e . attributes["id"].value
        entities[eid] = e.attributes["charOffset"].value.split("-")
    if len(entities) > 1: analysis = analyze(stext)
    
    pairs = s.getElementsByTagName("pair")
    for p in pairs:
        # get ground truth
        ddi = p.attributes["ddi"].value
        dditype = p.attributes["type"].value if ddi == "true" else "null"
            
        # target entities
        id_e1 = p.attributes["e1"].value
        id_e2 = p.attributes["e2"].value

print(entities)
print(entities[id_e1][0])
print(analysis)

{'DDI-MedLine.d63.s8.e0': ['38', '51'], 'DDI-MedLine.d63.s8.e1': ['57', '64']}
38
defaultdict(<function DependencyGraph.__init__.<locals>.<lambda> at 0x7ff42c945310>,
            {0: {'address': 0,
                 'ctag': 'TOP',
                 'deps': defaultdict(<class 'list'>, {'ROOT': [1]}),
                 'feats': None,
                 'head': None,
                 'lemma': None,
                 'rel': None,
                 'tag': 'TOP',
                 'word': None},
             1: {'address': 1,
                 'ctag': 'VBG',
                 'deps': defaultdict(<class 'list'>,
                                     {'obj': [2],
                                      'punct': [17]}),
                 'end': 9,
                 'feats': '_',
                 'head': 0,
                 'lemma': 'descend',
                 'rel': 'ROOT',
                 'start': 0,
                 'tag': 'VBG',
                 'word': 'descending'},
             2: {'address': 2,
      

In [162]:
def extract_features(tree, entities, e1, e2) :
    '''
    Task:
        Given an analyzed sentence and two target entities , compute a feature
        vector for this classification example .
    Input:
        tree: a DependencyGraph object with all sentence information .
        entities: A list of all entities in the sentence (id and offsets).
        e1, e2: ids of the two entities to be checked for an interaction
    Output:
        A vector of binary features .
        Features are binary and vectors are in sparse representation (i.e. only
        active features are listed)
   '''
        
    e1_node = find_entity_in_tree(e1, entities, tree)
    e2_node = find_entity_in_tree(e2, entities, tree)
    
    e1_head = find_head(tree, e1_node) if e1_node else None
    e2_head = find_head(tree, e2_node) if e2_node else None
    
    h1_lemma = e1_head['lemma'] if e1_node else None
    h2_lemma = e2_head['lemma'] if e2_node else None

    
    #e1_tag = e1_node['tag'] if e1_node else None
    #e2_tag = e2_node['tag'] if e2_node else None
    
    #h_e1 = e1_node['head'] if e1_node else None      
    #l_h1 = analysis.nodes[h_e1]['lemma'] if e1_node else None
    #w_h1 = analysis.nodes[h_e1]['word'] if e1_node else None
    #tag_head_e1 = analysis.nodes[h_e1]['tag'][0].lower() if e1_node else None
    
    #h_e2 = e2_node['head'] if e2_node else None
    #l_h2 = analysis.nodes[h_e2]['lemma'] if e2_node else None
    #tag_head_e2 = analysis.nodes[h_e2]['tag'][0].lower() if e2_node else None
    
    # --- FEATURES ---
    features = ['h1_lemma=%s' %h1_lemma,
                #'h1_word=%s' %e1_head['word'],
                'h2_lemma=%s' %h2_lemma,
                #'h2_word=%s' %e1_head['word']
                ]
    
    print(features)


In [163]:
extract_features(analysis, entities, id_e1, id_e2)

['h1_lemma=None', 'h2_lemma=endorphin']


In [None]:
datadir = "../../labAHLT/data/devel"

# process each file in directory
for f in listdir(datadir):
    # parse XML file , obtaining a DOM tree
    tree = parse(datadir + "/" + f)
    # process each sentence in the file
    sentences = tree.getElementsByTagName("sentence")
    for s in sentences:

        sid = s.attributes["id"].value # get sentence id
        stext = s.attributes["text"].value # get sentence text
        
        # CoreNLP throws error for empty sentences
        if len(stext) == 0:
            continue

        # load sentence ground truth entities
        entities = {}
        ents = s.getElementsByTagName("entity")
        for e in ents:
            eid = e . attributes["id"].value
            entities[eid] = e.attributes["charOffset"].value.split("-")

        # analyze sentence if there is at least a pair of entities
        if len(entities) > 1: analysis = analyze(stext)

        # for each pair in the sentence , decide whether it is DDI and its type
        pairs = s.getElementsByTagName("pair")
        for p in pairs:
            # get ground truth
            ddi = p.attributes["ddi"].value
            dditype = p.attributes["type"].value if ddi == "true" else "null"
            
            # target entities
            id_e1 = p.attributes["e1"].value
            id_e2 = p.attributes["e2"].value
            
            # feature extraction
            feats = extract_features(analysis, entities, id_e1, id_e2)
            
            # resulting feature vector
            print(sid, id_e1, id_e2, dditype, "\t".join(feats), sep="\t")    