# DDI-ML. Relation Extraction

Setup
cd into ../stanford-corenlp-4.2.0 and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer`

In [1]:
from os import listdir
from xml.dom.minidom import parse
import networkx
# import nltk CoreNLP module (just once)
from nltk.parse.corenlp import CoreNLPDependencyParser
# connect to your CoreNLP server (just once)
corenlp_parser = CoreNLPDependencyParser(url="http://localhost:9000")

from evaluator import *

## Feature Extractor

In [2]:
def get_offsets(word, s):
    '''
    Task:
        Given a word and sentence, returns its starting end ending index in the sentence.
    
    Input:
        word: word to find offsets for
        s: sentence containing the word
    
    Output:
        Returns a tuple containing the start and end offset.
    '''
    start = s.find(word)
    end = start + len(word) - 1
    return start, end

In [3]:
def preprocess(s):
    '''
    Task:
        Helper function
    '''
    # because otherwise CoreNLP throws 500
    return s.replace("%", "<percentage>")

In [4]:
def analyze(s):
    '''
    Task:
        Given one sentence, sends it to CoreNLP to obtain the tokens, tags,
        and dependency tree. It also adds the start/end offsets to each token.
    
    Input:
        s: string containing the text for one sentence
    
    Output:
        Returns the nltk DependencyGraph object produced by CoreNLP, enriched with token  offsets.

    '''
    s = s.replace("%", "<percentage>")
    tree, = corenlp_parser.raw_parse(s)
    for n in tree.nodes.items():
        node = n[1]
        if node['word']:
            start, end = get_offsets(node['word'], s)
            node['start'] = start
            node['end'] = end
            
    return tree

In [5]:
def find_entity_in_tree(eid, entities, tree):
    start_e1 = entities[eid][0]
    end_e1 = entities[eid][1]
    
    for n in tree.nodes.items():
        node = n[1]
        if node['word'] and node['start'] == int(start_e1): # and node['end'] == int(end_e1)):
            return node
    
    # TODO: handle two-word entities like "beta-endorphin"
    # We can extract here word, lemma, tag

In [6]:
def find_head(tree, entity):
    for n in tree.nodes.items():
            node = n[1]
            if  node['address'] == entity['head']:
                return node
    # We can extract here word, lemma, tag

In [7]:
# file for initial checks
file = '../../labAHLT/data/train/Dexamethasone_ddi.xml'
tree = parse(file)
sentences = tree.getElementsByTagName("sentence")
for s in sentences:
    sid = s.attributes["id"].value
    stext = s.attributes["text"].value
    
    entities = {}
    ents = s.getElementsByTagName("entity")
    for e in ents:
        eid = e . attributes["id"].value
        entities[eid] = e.attributes["charOffset"].value.split("-")
    if len(entities) > 1: analysis = analyze(stext)
    
    pairs = s.getElementsByTagName("pair")
    for p in pairs:
        # get ground truth
        ddi = p.attributes["ddi"].value
        dditype = p.attributes["type"].value if ddi == "true" else "null"
            
        # target entities
        id_e1 = p.attributes["e1"].value
        id_e2 = p.attributes["e2"].value

In [48]:
def traverse_path(path, tree):
    if len(path) == 0:
        return None
    path_nodes = [tree.nodes[x] for x in path]
    str_path = ""
    # traverse from e1 up
    current_node = path_nodes[0]
    while (current_node['head'] in path):
        
        rel = current_node['rel']
        current_node = tree.nodes[current_node['head']]
        str_path += (rel + '<')
    
    str_path += current_node['lemma'] 
    # traverse from e2 up
    current_node = path_nodes[-1]
    while(current_node['head'] in path):
        rel = current_node['rel']
        current_node = tree.nodes[current_node['head']]
        str_path += ('>' + rel)
        
    return str_path
    

In [49]:
CLUE_VERBS = ['administer', 'enhance', 'interact', 'coadminister', 'increase', 'decrease'] # add more?

def find_clue_verbs(path, tree):
    path_nodes = [tree.nodes[x]['lemma'] for x in path]
    feats = []
    for pn in path_nodes:
        if pn in CLUE_VERBS:
            feats.append('lemmainbetween=%s' % pn)
            
    return feats

def negative_words(path, tree):
    negative_words = ['No', 'not', 'neither', 'without', 'lack', 'fail', 'unable', 'abrogate',
                      'absence', 'prevent','unlikely', 'unchanged', 'rarely']
    path_nodes = [tree.nodes[x]['word'] for x in path]
    count = 0
    for pn in path_nodes:
        if pn in negative_words or pn[-3:] == "n't":
            count += 1
    return count

In [50]:
def extract_features(tree, entities, e1, e2) :
    '''
    Task:
        Given an analyzed sentence and two target entities , compute a feature
        vector for this classification example .
    Input:
        tree: a DependencyGraph object with all sentence information .
        entities: A list of all entities in the sentence (id and offsets).
        e1, e2: ids of the two entities to be checked for an interaction
    Output:
        A vector of binary features .
        Features are binary and vectors are in sparse representation (i.e. only
        active features are listed)
   '''
      
    e1_node = find_entity_in_tree(e1, entities, tree)
    e2_node = find_entity_in_tree(e2, entities, tree)
    
    e1_head = find_head(tree, e1_node) if e1_node else None
    e2_head = find_head(tree, e2_node) if e2_node else None
    
    h1_lemma = e1_head['lemma'] if e1_node else None
    h2_lemma = e2_head['lemma'] if e2_node else None
    
    tag_head_e1 = e1_head['tag'] if e1_head else None
    tag_head_e2 = e2_head['tag'] if e2_head else None
    
    nxgraph = tree.nx_graph().to_undirected()
    shortest_path = networkx.shortest_path(nxgraph, e1_node['address'], e2_node['address']) if (e1_node and e2_node) else []
    path = traverse_path(shortest_path, analysis)
    find_clue_verbs(shortest_path, analysis)
    
    # --- FEATURES ---
    features = ['h1_lemma=%s' %h1_lemma,
                'h2_lemma=%s' %h2_lemma,
                'h1_tag=%s' %tag_head_e1,
                'h2_tag=%s' %tag_head_e2,
                'path=%s' % path,
                'negative_words=%s' %count_neg  # only 28 with 1, 1 with 2
                ] + find_clue_verbs(shortest_path, analysis)
    
    
    if (e1_head and e2_head):
        if h1_lemma == h2_lemma:  # should use address? 
            features.append('under_same=True') # 5609 occurrences
            if tag_head_e1[0].lower() == 'v':
                features.append('under_same_verb=True') # 173 occurrences
            else:
                features.append('under_same_verb=False')
        else:
            features.append('under_same=False')
            features.append('under_same_verb=False')
            
        if h1_lemma == e2_node['lemma']:
            features.append('1under2=True') # 136 occ
        else:
            features.append('1under2=False')
            
        if h2_lemma == e1_node['lemma']:
            features.append('2under1=True') # 1953 occ
        else:
            features.append('2under1=False')
    else:
        None
        
    return features

In [51]:
# file for initial checks
file = '../../labAHLT/data/train/Dexamethasone_ddi.xml'
tree = parse(file)
sentences = tree.getElementsByTagName("sentence")
for s in sentences:
    sid = s.attributes["id"].value
    stext = s.attributes["text"].value
    
    entities = {}
    ents = s.getElementsByTagName("entity")
    for e in ents:
        eid = e . attributes["id"].value
        entities[eid] = e.attributes["charOffset"].value.split("-")
    if len(entities) > 1: analysis = analyze(stext)
    
    pairs = s.getElementsByTagName("pair")
    for p in pairs:
        # get ground truth
        ddi = p.attributes["ddi"].value
        dditype = p.attributes["type"].value if ddi == "true" else "null"
            
        # target entities
        id_e1 = p.attributes["e1"].value
        id_e2 = p.attributes["e2"].value
        extract_features(analysis, entities, id_e1, id_e2)

In [55]:
datadir = "../../labAHLT/data/train"

outfile = "devel_feats.dat"

# process each file in directory
for f in listdir(datadir):
    # parse XML file , obtaining a DOM tree
    tree = parse(datadir + "/" + f)
    # process each sentence in the file
    sentences = tree.getElementsByTagName("sentence")
    for s in sentences:

        sid = s.attributes["id"].value # get sentence id
        stext = s.attributes["text"].value # get sentence text
        
        # CoreNLP throws error for empty sentences
        if len(stext) == 0:
            continue

        # load sentence ground truth entities
        entities = {}
        ents = s.getElementsByTagName("entity")
        for e in ents:
            eid = e . attributes["id"].value
            entities[eid] = e.attributes["charOffset"].value.split("-")

        # analyze sentence if there is at least a pair of entities
        if len(entities) > 1: analysis = analyze(stext)

        # for each pair in the sentence , decide whether it is DDI and its type
        pairs = s.getElementsByTagName("pair")
        for p in pairs:
            # get ground truth
            ddi = p.attributes["ddi"].value
            dditype = p.attributes["type"].value if ddi == "true" else "null"
            
            # target entities
            id_e1 = p.attributes["e1"].value
            id_e2 = p.attributes["e2"].value
            
            # feature extraction
            feats = extract_features(analysis, entities, id_e1, id_e2)
            
            # resulting feature vector
            print(sid, id_e1, id_e2, dditype, "|".join(feats), sep="|") 
            

DDI-DrugBank.d481.s0|DDI-DrugBank.d481.s0.e0|DDI-DrugBank.d481.s0.e1|mechanism|h1_lemma=rich|h2_lemma=absorption|h1_tag=JJ|h2_tag=NN|path=obl:npmod<amod<conj<nsubj<impair>nmod>obj
DDI-DrugBank.d10.s1|DDI-DrugBank.d10.s1.e0|DDI-DrugBank.d10.s1.e1|null|h1_lemma=treatment|h2_lemma=corticosteroid|h1_tag=NN|h2_tag=NN|path=corticosteroid>conj
DDI-DrugBank.d10.s1|DDI-DrugBank.d10.s1.e0|DDI-DrugBank.d10.s1.e2|null|h1_lemma=treatment|h2_lemma=receive|h1_tag=NN|h2_tag=VBG|path=compound<nsubj:pass<administer>obj>dep|lemmainbetween=administer
DDI-DrugBank.d10.s1|DDI-DrugBank.d10.s1.e1|DDI-DrugBank.d10.s1.e2|null|h1_lemma=corticosteroid|h2_lemma=receive|h1_tag=NN|h2_tag=VBG|path=conj<compound<nsubj:pass<administer>obj>dep|lemmainbetween=administer
DDI-DrugBank.d10.s2|DDI-DrugBank.d10.s2.e0|DDI-DrugBank.d10.s2.e1|mechanism|h1_lemma=administration|h2_lemma=elimination|h1_tag=NN|h2_tag=NN|path=compound<nsubj<lead>amod>nmod>obl
DDI-DrugBank.d10.s2|DDI-DrugBank.d10.s2.e0|DDI-DrugBank.d10.s2.e2|null|h1_l

## Learner

In [56]:
from itertools import groupby

In [57]:
def parse_string(line):
    split_data = line[:-1].split('|')
    print(split_data)
    sentence_id = split_data[0]
    e1_id = split_data[1]
    e2_id = split_data[2]
    interaction = split_data[3]
    features = split_data[4:]
    return e1_id, e2_id, features, interaction

In [58]:
def read_feature_file(filepath):
    
    f = open(filepath, 'r')
    lines = f.readlines()

    data = []

    for line in lines:
        e1, e2, features, interaction = parse_string(line)
        data.append((features, interaction))
    return data

In [62]:
#read_feature_file("./train_feats.txt")