feature extractor

In [5]:
from os import listdir
from xml.dom.minidom import parse
from nltk import pos_tag
import networkx
import argparse
import string
import nltk
# import nltk CoreNLP module (just once)
from nltk.parse.corenlp import CoreNLPDependencyParser
# connect to your CoreNLP server (just once)
corenlp_parser = CoreNLPDependencyParser(url="http://localhost:9000")


def do_indices_overlap(start1, end1, start2, end2):
    if start1 == start2 and end1==end2:
        return True

def find_entity_in_tree(eid, entities, tree):
    start_e1 = int(entities[eid]['offsets'][0])
    end_e1 = int(entities[eid]['offsets'][1].split(';')[0])

    for n in tree.nodes.items():
        node = n[1]
        if node['word'] and (node['start'] == start_e1 or node['end'] == end_e1):
            return node

def find_other_entities(eid1, eid2, sid, entities, tree):
    other_entities = [(entity['eid'], entity['type']) for _, entity in entities.items() if entity['sid'] == sid and entity['eid'] not in [eid1, eid2]]
    return [(find_entity_in_tree(eid, entities, tree),e_type) for eid, e_type in other_entities]

def get_offsets(word, s):
    '''
    Task:
        Given a word and sentence, returns its starting end ending index in the sentence.
    
    Input:
        word: word to find offsets for
        s: sentence containing the word
    
    Output:
        Returns a tuple containing the start and end offset.
    '''
    start = s.find(word)
    end = start + len(word) - 1
    return start, end

def preprocess(s):
    '''
    Task:
        Helper function
    '''
    # because otherwise CoreNLP throws 500
    return s.replace("%", "<percentage>")

def analyze(s):
    '''
    Task:
        Given one sentence, sends it to CoreNLP to obtain the tokens, tags,
        and dependency tree. It also adds the start/end offsets to each token.
    
    Input:
        s: string containing the text for one sentence
    
    Output:
        Returns the nltk DependencyGraph object produced by CoreNLP, enriched with token  offsets.

    '''
    s = s.replace("%", "<percentage>")
    tree, = corenlp_parser.raw_parse(s)
    for n in tree.nodes.items():
        node = n[1]
        if node['word']:
            start, end = get_offsets(node['word'], s)
            node['start'] = start
            node['end'] = end
            
    return tree

CLUE_VERBS = ['administer', 'enhance', 'interact', 'coadminister', 'increase', 'decrease']
NEGATIVE_WORDS = ['No', 'not', 'neither', 'without','lack', 'fail', 'unable', 'abrogate',
                  'absence', 'prevent','unlikely', 'unchanged', 'rarely', 'inhibitor']

def find_clue_verbs(path, tree):
    path_nodes = [tree.nodes[x]['lemma'] for x in path]
    feats = []
    for pn in path_nodes:
        if pn in CLUE_VERBS:
            feats.append('lemmainbetween=%s' % pn)
            
    return feats

def negative_words_path(path, tree):
    path_nodes = [tree.nodes[x]['word'] for x in path]
    count = 0
    for pn in path_nodes:
        if pn in NEGATIVE_WORDS or pn[-3:] == "n't":
            count += 1
    return count

def negative_words_sentence(tree):
    count = 0
    for n in tree.nodes.items():
        word = n[1]['word']
        if word in NEGATIVE_WORDS:
            count += 1
    return count

def traverse_path(path, tree):
    if len(path) == 0:
        return None, None
    path_nodes = [tree.nodes[x] for x in path]
    str_path = ""
    # traverse from e1 up
    current_node = path_nodes[0]
    while (current_node['head'] in path):
        rel = current_node['rel']
        current_node = tree.nodes[current_node['head']]
        str_path += (rel + '<')

    tag_path = str_path + current_node['tag']
    str_path += current_node['lemma']
    # traverse from e2 up
    current_node = path_nodes[-1]
    while(current_node['head'] in path):
        rel = current_node['rel']
        current_node = tree.nodes[current_node['head']]
        str_path += ('>' + rel)
        tag_path += ('>' + rel)

    return str_path, tag_path

def find_words_outside_path(path, tree):
    if len(path) < 1:
        return [], []
    words_before = []
    words_after = []
    nodes_before = [node[1] for node in tree.nodes.items()][:path[0]]
    nodes_after = [node[1] for node in tree.nodes.items()][path[-1]:]

    for node in nodes_before:
        if node['address'] not in path and node['lemma'] and node['lemma'] not in string.punctuation and not node['lemma'].isdigit():
            words_before.append(node['lemma'])
    for node in nodes_after:
        if node['address'] not in path and node['lemma'] and node['lemma'] not in string.punctuation and not node['lemma'].isdigit():
            words_after.append(node['lemma'])
    return words_before, words_after



def find_head(tree, entity):
    for n in tree.nodes.items():
            node = n[1]
            if  node['address'] == entity['head']:
                return node
    
    


In [6]:
def main(datadir):
    pairs = []
    for f in listdir(datadir):
        # parse XML file , obtaining a DOM tree
        tree = parse(datadir + "/" + f)
        # process each sentence in the file
        sentences = tree.getElementsByTagName("sentence")
        
        
        for s in sentences:

            sid = s.attributes["id"].value # get sentence id
            stext = s.attributes["text"].value # get sentence text
            
            # CoreNLP throws error for empty sentences
            if len(stext) == 0:
                continue

            # load sentence ground truth entities
            entities = {}
            ents = s.getElementsByTagName("entity")
            for e in ents:
                eid = e . attributes["id"].value
                entities[eid] = {"offsets": e.attributes["charOffset"].value.split("-"), "type": e.attributes["type"].value, 'sid': sid, 'eid': eid}

            # analyze sentence if there is at least a pair of entities
            if len(entities) > 1:
                analysis = analyze(stext)

            # for each pair in the sentence , decide whether it is DDI and its type
                pairs.append((analysis, entities, s.getElementsByTagName("pair")))
    return pairs



In [7]:
def extract(sentence_pairs, outputf):
    out = open(outputf, 'w')
    for analysis, entities, pairs in sentence_pairs:
        for p in pairs:
            
            # get ground truth
            ddi = p.attributes["ddi"].value
            dditype = p.attributes["type"].value if ddi == "true" else "null"

            # target entities
            id_e1 = p.attributes["e1"].value
            id_e2 = p.attributes["e2"].value

            sid = '.'.join(id_e1.split('.')[:-1])

            # feature extraction
            feats = extract_features(analysis, entities, id_e1, id_e2, sid)

            # resulting feature vector
            out.write('\t'.join([sid, id_e1, id_e2, dditype, "\t".join(feats), '\n']))

In [8]:
traindir = "../../labAHLT/data/train"
testdir = "../../labAHLT/data/test"
develdir = "../../labAHLT/data/devel"



trainpairs = main(traindir)
testpairs = main(testdir)
develpairs = main(develdir)

In [9]:
trainfeats = "feats.dat"
testfeats = "feats_test.dat"
develfeats = "feats_devel.dat"

In [10]:
def extract_features(tree, entities, e1, e2, sid) :
    '''
    Task:
        Given an analyzed sentence and two target entities , compute a feature
        vector for this classification example .
    Input:
        tree: a DependencyGraph object with all sentence information .
        entities: A list of all entities in the sentence (id and offsets).
        e1, e2: ids of the two entities to be checked for an interaction
        sid: sentence id
    Output:
        A vector of binary features .
        Features are binary and vectors are in sparse representation (i.e. only
        active features are listed)
   '''
    int_verbs = ['interact', 'interaction']
    mech_verbs = ['metabolism', 'concentration', 'clearance', 'level', 'absorption', 'dose',
                 'presence', 'interfere']
    adv_verbs = ['co-administration', 'take', 'coadminister', 'treatment', 'therapy', 'tell']
    eff_verbs = ['effect', 'alcohol', 'action','use', 'combination', 'inhibitor',
                'response', 'effect', 'enhance', 'diminish']
      
    e1_node = find_entity_in_tree(e1, entities, tree)
    e2_node = find_entity_in_tree(e2, entities, tree)
    
    e1_head = find_head(tree, e1_node) if e1_node else None
    e2_head = find_head(tree, e2_node) if e2_node else None
    
    h1_lemma = e1_head['lemma'] if e1_node else None
    h2_lemma = e2_head['lemma'] if e2_node else None
    
    tag_head_e1 = e1_head['tag'] if e1_head else None
    tag_head_e2 = e2_head['tag'] if e2_head else None
    
    nxgraph = tree.nx_graph().to_undirected()
    shortest_path = networkx.shortest_path(nxgraph, e1_node['address'], e2_node['address']) if (e1_node and e2_node) else []
    path_with_word, path_with_tag = traverse_path(shortest_path, tree)
    find_clue_verbs(shortest_path, tree)
    count_neg_p = negative_words_path(shortest_path, tree)
    count_neg_s = negative_words_sentence(tree)

    
    # --- FEATURES ---
    features = ['h1_lemma=%s' %h1_lemma,
                'h2_lemma=%s' %h2_lemma,
                'h1_tag=%s' %tag_head_e1,
                'h2_tag=%s' %tag_head_e2,
#                 'path=%s' % path_with_word,
                'tagpath=%s' % path_with_tag,
                # 'neg_words_p=%s' %count_neg_p,  # only 28 with 1, 1 with 2
                'neg_words_s=%s' %count_neg_s,  # 3144 with 1, 270 with 2, 4 with 3 
                'e1_type=%s' % entities[e1]['type'],
                'e2_type=%s' % entities[e2]['type'],
                ] + find_clue_verbs(shortest_path, tree)
    
    
    if (e1_head and e2_head):
        if h1_lemma == h2_lemma:  # should use address? 
            features.append('under_same=True') # 5609 occurrences
            if tag_head_e1[0].lower() == 'v':
                features.append('under_same_verb=True') # 173 occurrences
            else:
                features.append('under_same_verb=False')
        else:
            features.append('under_same=False')
            features.append('under_same_verb=False')
            
        if h1_lemma == e2_node['lemma']:
            features.append('1under2=True') # 136 occ
        else:
            features.append('1under2=False')
            
        if h2_lemma == e1_node['lemma']:
            features.append('2under1=True') # 1953 occ
        else:
            features.append('2under1=False')
        
        if h1_lemma in int_verbs or h2_lemma in int_verbs:
            features.append('intVerbs=True') # 458
        else:
            features.append('intVerbs=False')
            
        if h1_lemma in mech_verbs or h2_lemma in mech_verbs:
            features.append('mechVerbs=True') # 1030
        else:
            features.append('mechVerbs=False')

        if h1_lemma in adv_verbs or h2_lemma in adv_verbs:
            features.append('advVerbs=True') # 569
        else:
            features.append('advVerbs=False')

        if h1_lemma in eff_verbs or h2_lemma in eff_verbs:
            features.append('effVerbs=True') # 3480
        else:
            features.append('effVerbs=False')
        
    else:
        None

    words_before, words_after = find_words_outside_path(shortest_path, tree)
    for word in words_before:
        features.append(f'lemmabefore={word}')
        features.append(f'tagbefore={pos_tag(word)[0][1]}')
    for word in words_after:
        features.append(f'lemmaafter={word}')
        features.append(f'tagafter={pos_tag(word)[0][1]}')

    other_entities = find_other_entities(e1, e2, sid, entities, tree)
    for e_node, e_type in other_entities:
        if e_node and e_node['address'] in shortest_path:
            features.append('entityinbetween=%s' % e_type)
        else:
            features.append('entityother=%s' % e_type)
        
        
    return features



In [11]:
from itertools import groupby

# helper functions for crf-learner and crf-classifier

def parse_string(line):
    split_data = line[:-1].split('\t')
    sentence_id = split_data[0]
    e1_id = split_data[1]
    e2_id = split_data[2]
    interaction = split_data[3]
    feats = split_data[4:]
    features = dict()
    for f in feats:
        features[f] = True
    
    return sentence_id, e1_id, e2_id, features, interaction

def read_feature_file(filepath):
    '''
    Task:
        Given the path to the file containing tokenized sentences, read it and return the necessary data structures
    Input:
        filepath: Path to the data
    Output:
        tokens_by_sentence: list of tuples: (sentence_id, list_of_tokens). Each tuple represents a sentence,
            where in the list_of_tokens each token is represented by a tuple (word, offset_from, offset_to)
        features: list of lists of features per sentence
        tags: list of lists of B-I-O tags per sentence
    '''
    
    f = open(filepath, 'r')
    lines = f.readlines()

    data = []

    for line in lines:
        if len(line) > 1:
            sentence_id, e1, e2, features, interaction = parse_string(line)
            data.append((features, interaction))
    return data

def read_test_feature_file(filepath):
    
    f = open(filepath, 'r')
    lines = f.readlines()

    data = []

    for line in lines:
        if len(line) > 1:
            sentence_id, e1, e2, features, interaction = parse_string(line)
            data.append((sentence_id, e1, e2, features))
    return data


In [12]:

extract(trainpairs, trainfeats)
extract(testpairs, testfeats)
extract(develpairs, develfeats)

learner

In [22]:

train_data = read_feature_file(trainfeats)
test_data = read_test_feature_file(testfeats)
nltk.classify.megam.config_megam('/home/zosia/uni-dev/ahlt/2. DDI/megam_i686.opt')

mymodel = nltk.classify.MaxentClassifier.train(train_data, 'megam', gaussian_prior_sigma=10)

output_f = open('output.dat', 'w')

for sid, e1, e2, feats in test_data:
    prediction = mymodel.classify(feats)
    if prediction != "null":
        # print ( sid , e1 , e2 , prediction , sep ="|")
        output_f.write(f'{sid}|{e1}|{e2}|{prediction}\n')

In [23]:
!python3 evaluator.py DDI ../../labAHLT/data/test output.dat

                   tp	  fp	  fn	#pred	#exp	P	R	F1
------------------------------------------------------------------------------
advise             68	  28	 144	  96	 212	70.8%	32.1%	44.2%
effect            122	  77	 161	 199	 283	61.3%	43.1%	50.6%
int                 3	   3	  15	   6	  18	50.0%	16.7%	25.0%
mechanism         140	  87	 197	 227	 337	61.7%	41.5%	49.6%
------------------------------------------------------------------------------
M.avg            -	-	-	-	-	61.0%	33.3%	42.4%
------------------------------------------------------------------------------
m.avg             333	 195	 517	 528	 850	63.1%	39.2%	48.3%
m.avg(no class)   374	 154	 476	 528	 850	70.8%	44.0%	54.3%


In [24]:
devel_data = read_test_feature_file(develfeats)
output_f = open('output.dat', 'w')

for sid, e1, e2, feats in devel_data:
    prediction = mymodel.classify(feats)
    if prediction != "null":
        # print ( sid , e1 , e2 , prediction , sep ="|")
        output_f.write(f'{sid}|{e1}|{e2}|{prediction}\n')

In [25]:
!python3 evaluator.py DDI ../../labAHLT/data/devel output.dat

                   tp	  fp	  fn	#pred	#exp	P	R	F1
------------------------------------------------------------------------------
advise             63	  42	  75	 105	 138	60.0%	45.7%	51.9%
effect            112	  63	 203	 175	 315	64.0%	35.6%	45.7%
int                28	  12	   7	  40	  35	70.0%	80.0%	74.7%
mechanism          85	  55	 179	 140	 264	60.7%	32.2%	42.1%
------------------------------------------------------------------------------
M.avg            -	-	-	-	-	63.7%	48.4%	53.6%
------------------------------------------------------------------------------
m.avg             288	 172	 464	 460	 752	62.6%	38.3%	47.5%
m.avg(no class)   319	 141	 433	 460	 752	69.3%	42.4%	52.6%
