<a href="https://colab.research.google.com/github/zoftar/ahlt/blob/main/3.%20NN/NER-NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Neural Network NER

In [2]:
import numpy as np
from os import listdir
import pickle
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.corpus import stopwords

# import tensorflow_addons as tfa
from tensorflow.keras.optimizers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras import Model, Input, Sequential
from keras.layers import *
# from keras_contrib.layers import CRF
#from keras_crf import CRFModel


stopwords = set(stopwords.words("english"))
from evaluator import *

%matplotlib inline
# %tensorflow_version 1.x

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Functions Learner

In [3]:
def find_multi_token_entity(ents, token):
    keys = ents.keys()
    for k in keys:
        if token in k:
            return ents[k]

def load_data(datadir):
    '''
    Task: Load XML files in given directory, tokenize each sentence, and extract
    ground truth BIO labels for each token.
    
    Input: 
        datadir: A directory containing XML files
        
    Output: A directory containing the dataset. Dictionary key is sentence_id, and the 
            value is a list of token tuples (word, start, end, ground truth).
            
    Example: 
            >>> load_data('data/Train')
            {'DDI-DrugBank.d370.s0': [('as', 0, 1,'O'), ('differin', 3, 10,'B-brand'),
                     ('gel', 12, 14,'O'), ... , ('with', 343, 346, 'O'),
                     ('caution', 348, 354, 'O'), ('.', 355, 355, 'O')],
            'DDI-DrugBank.d370.s1': [('particular', 0, 9, 'O'), ('caution', 11, 17, 'O'),
                     ('should', 19, 24, 'O'), ... , ('differin', 130, 137, 'B-brand'),
                     ('gel', 139, 141, 'O'), ('.', 142, 142, 'O')], ... }
    '''
    
    dict_dataset = {}
    
    for f in listdir(datadir):
        tree = parse(datadir + "/" + f)
        
        sentences = tree.getElementsByTagName("sentence")
        for s in sentences: 
            sid = s.attributes["id"].value
            
            stext = s.attributes["text"].value
            #stext = stext.replace("-"," ") if used we lose beta-endorphin, if not use we lose calcium-rich tag
            
            ents = {}
            entities = s.getElementsByTagName("entity")
            for e in entities:
                offset = e.attributes["charOffset"].value
                offsets = offset.split(';')
                name = e.attributes["text"].value
                e_type = e.attributes["type"].value
                ents[name] = {"type": e_type, "offsets": []}
                for offset in offsets:
                    start = offset.split('-')[0]
                    end = offset.split('-')[1]
                    ents[name]["offsets"].append((start, end))
            
            punct = [",",";",":","?","!", "(", ")"] # removed "."
            tokens = word_tokenize(stext)
            tokens_cleaned = []
            for t in tokens:
                if t not in punct and t not in stopwords:
                    tokens_cleaned.append(t)
            
            tags = []
            tokens = []
            for t in tokens_cleaned:
                offsetFrom = stext.find(t)
                offsetTo = offsetFrom + len(t) - 1
                # 1-token entities
                if t in ents:
                    if (int(ents[t]["offsets"][0][0]) == offsetFrom):
                        tag = "B-"+ents[t]["type"] # TODO: ents after .?
                    else:
                        tag = "I-"+ents[t]["type"]                    
                else:
                    multi_token_ent = find_multi_token_entity(ents, t)
                    if multi_token_ent:
                        if (int(multi_token_ent["offsets"][0][0]) == offsetFrom):
                            tag = "B-"+multi_token_ent["type"] # TODO: ents after .?
                        else:
                            tag = "I-"+multi_token_ent["type"]
                    else:
                        tag = "O"
                tags.append(tag)
                tupl = (t, offsetFrom, offsetTo, tag)
                tokens.append(tupl)
            
            dict_dataset[sid] = tokens
        
    return dict_dataset

# -- TODO: handle multi-token entities # I think I've done it :D
# tricyclic antidepressants - 'O' should be Group
# chondroitin ABC lyase - 'O' should be drug_n
# heparinase III - 'O' should be drug_n
# hyaluronan lyase - 'O' should be drug_n
# Mercaptopurine/Azathioprine - 'O' should be drug
# muscle relaxants 

In [4]:
path_train = "../../labAHLT/data/train"
path_dev = "../../labAHLT/data/devel"
# path_train = "drive/MyDrive/UPC/labAHLT/data/train"
# path_dev = "drive/MyDrive/UPC/labAHLT/data/devel"


train_dataset = load_data(path_train)
devel_dataset = load_data(path_dev)

print(train_dataset['DDI-DrugBank.d661.s5'])

[('Interactions', 0, 11, 'O'), ('observed', 23, 30, 'O'), ('nondepolarizing', 43, 57, 'B-group'), ('muscle', 59, 64, 'I-group'), ('relaxants', 66, 74, 'I-group'), ('administered', 86, 97, 'O'), ('succession', 102, 111, 'O'), ('.', 112, 112, 'O')]


In [5]:
def create_index(dataset, max_length):
    '''
    Task: Create index dictionaries both for input (words) and output (labels) from given dataset
    Input: 
        dataset: dataset produced by load_data.
        max_length: maximum length of a sentence (longer sentences will be cut, shorter ones will be padded).
        
    Output: A dictionary where each key is an index name (e.g. "words", "labels"), and the value is a 
            dictionary mapping each word/label to a number. An entry with the value for maxlen is also stored
    Example: 
        >>> create_indx(traindata)
        {'words': {'<PAD>':0, '<UNK>':1, '11-day':2, 'murine':3, 'criteria':4,
                   'stroke':5, ... ,'levodopa':8511, 'terfenadine': 8512}
         'labels': {'<PAD>':0, 'B-group':1, 'B-drug_n':2, 'I-drug_n':3, 'O':4, 
                    'I-group':5, 'B-drug':6, 'I-drug':7, 'B-brand':8, 'I-brand':9}
         'maxlen': 100 }
    '''
    
    index_words = {'<PAD>':0, '<UNK>':1}
    i = 2
    
    index_labels = {'<PAD>':0}
    j = 1
    
    
    
    for key, item in dataset.items():
        for t in item:
            word = t[0].lower() # use lower case words? 
            tag = t[3]
            if word not in index_words:
                index_words[word] = i
                i += 1
            if tag not in index_labels:
                index_labels[tag] = j
                j += 1

    indexs = {'words': index_words, 'labels': index_labels, 'maxlen':max_length}
    
    return indexs    

In [6]:
idx = create_index(train_dataset, 20)
idx['labels']

{'<PAD>': 0,
 'O': 1,
 'B-brand': 2,
 'B-group': 3,
 'B-drug': 4,
 'I-brand': 5,
 'B-drug_n': 6,
 'I-drug': 7,
 'I-group': 8,
 'I-drug_n': 9}

In [7]:
def encode_words(dataset, idx):
    '''
    Task: Encode the words in a sentence dataset formed by lists of tokens into lists of indexes
          suitable for NN input.
    Input: 
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_indexs, containing word and label indexes, as well
             as the maximum sentence length.
             
    Output: The dataset encoded as a list of sentence, each of them is a list of word indices.
            If the word is not in the index, <UNK> code is used. If the sentence is shorter than
            max_len it is padded with <PAD> code.
    Example: 
        >>> encode_words(traindata, idx)
            [ [6882 1049 4911 ... 0 0 0 ]
            [  2290 7548 8069 ... 0 0 0 ]
               ...
            [  2002 6582 7518 ... 0 0 0 ] ]
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = str(t[0]).lower() # When using lower case words
            if w in idx['words']:
                i = idx['words'][w]
            else:
                i = idx['words']['<UNK>']
            aux.append(i)
        seq.append(aux)  
    
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    
    return seq_padded

def encode_labels(dataset, idx):
    '''
    Task: Encode the ground truth labels in a dataset formed by lists of tokens into lists of indexes
        suitable for NN output.
    Input:
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_index, containing word and label indexes, as well as the maximum length.
        
    Output: The dataset encoded as a list of sentence, each of them is a list of BIO label indices. If the sentence
            is shorter than max_len it is padded with <PAD> code. 
    
    Example :
     >>> encode_labels ( traindata , idx )
        [[ [4] [6] [4] [4] [4] [4] ... [0] [0] ]
        [  [4] [4] [8] [4] [6] [4] ... [0] [0] ]
          ...
        [
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = t[3]
            i = idx['labels'][w]
            aux.append(i)
        seq.append(aux)
    seq_padded = [x  for x in pad_sequences(maxlen = max_length, sequences = seq, padding = 'post', truncating="post")]
    seq_categ = [to_categorical(i, num_classes = 10) for i in seq_padded]  # 9 classes + 1 PAD
    
    return seq_padded, seq_categ

In [8]:
def create_suffix_index(word_index):
    suffix_dict = {}
    i = 0
    for word in word_index['words']:
        suf = word[-4:]
        if suf not in suffix_dict:
            suffix_dict[suf] = i
            i+=1
    return suffix_dict

suffix_index = create_suffix_index(idx)

def encode_suffixes(dataset, idx, suf_index):
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = str(t[0]).lower()[-4:] # When using lower case words
            if w in suf_index:
                i = suf_index[w]
            else:
                i = suf_index['UNK>']
            aux.append(i)
        seq.append(aux)  
    
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    
    return seq_padded    

In [358]:
pos_tags = ['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']
def create_pos_index():
    pos_dict = {'<UNK>': 0}
    i = 1
    for t in pos_tags:
        pos_dict[t] = i
        i+=1
    return pos_dict

pos_index = create_pos_index()

def encode_pos_tags(dataset, idx, pos_tag_index):
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        sentence = [t[0] for t in item]
        pos_tags = nltk.pos_tag(sentence)
        for w, tag in pos_tags:
             # When using lower case words
            if tag in pos_tag_index:
                i = pos_tag_index[tag]
            else:
                i = pos_tag_index['<UNK>']
            aux.append(i)
        seq.append(aux)  
    
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    
    return seq_padded   

In [359]:
X_train = encode_words(train_dataset, idx)
Y, Y_train = encode_labels(train_dataset, idx)
X_train_suf = encode_suffixes(train_dataset, idx, suffix_index)
X_train_pos = encode_pos_tags(train_dataset, idx, pos_index)

X_dev = encode_words(devel_dataset, idx)
Ydev, Y_dev = encode_labels(devel_dataset, idx)
X_dev_suf = encode_suffixes(devel_dataset, idx, suffix_index)
X_dev_pos = encode_pos_tags(devel_dataset, idx, pos_index)


In [360]:
print(X_train)
print(X_train_pos)

[[   2    3    4 ...    0    0    0]
 [   2   12    3 ...    0    0    0]
 [  19   20   21 ...    0    0    0]
 ...
 [   2  101 3387 ...    0    0    0]
 [1159  910 5135 ...    0    0    0]
 [  19  910 5135 ... 4830   66   11]]
[[13 41 39 ...  0  0  0]
 [13 12 41 ...  0  0  0]
 [30 39 29 ...  0  0  0]
 ...
 [13 12  7 ...  0  0  0]
 [39  8 41 ...  0  0  0]
 [30  8 41 ... 12  3 22]]


In [319]:
print(Y[0])
print(Y_train[0][0:10])

[1 1 2 1 1 3 1 1 3 1 0 0 0 0 0 0 0 0 0 0]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [320]:
## GloVe
# http://nlp.stanford.edu/data/glove.6B.zip
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index['words']) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index['words']:
                idx = word_index['words'][word] 
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
glove_path = "../../labAHLT/data/glove.6B/glove.6B.300d.txt"
#glove_path = "drive/MyDrive/UPC/labAHLT/data/glove.6B/glove.6B.300d.txt"

embedding_matrix = create_embedding_matrix(glove_path, idx, 64)


In [321]:

def create_suffix_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
glove_path = "../../labAHLT/data/glove.6B/glove.6B.300d.txt"
#glove_path = "drive/MyDrive/UPC/labAHLT/data/glove.6B/glove.6B.300d.txt"



suffix_embedding_matrix = create_suffix_embedding_matrix(glove_path, suffix_index, 64)


***

In [398]:
# https://github.com/luozhouyang/keras-crf
def build_network(idx):
    '''
    Task: Create network for the learner. 
    Input:
        idx: index dictionary with word/labels codes, plus maximum sentence length.
    Output: Returns a compiled Keras neural network with the specified layers
    '''
    
    #sizes
    n_words = len(idx['words'])
    n_labels = len(idx['labels'])#+1
    max_len = idx['maxlen']
    suffix_dict = create_suffix_index(idx)
    n_suf = len(suffix_dict)

    model = Sequential()
    
    # create network layers
    inp = Input(shape=(max_len,), name="words")
    inp_suf = Input(shape=(max_len,), name="suf")
    inp_pos = Input(shape=(max_len,), name="pos_tags")
    model1 = Embedding(input_dim=n_words+1, output_dim=64, weights=[embedding_matrix], input_length=max_len)(inp)
    model2 = Embedding(input_dim =n_suf+1, output_dim=64, weights=[suffix_embedding_matrix], input_length=max_len)(inp_suf)
    model3 = Embedding(input_dim =n_suf+1, output_dim=64, input_length=max_len)(inp_pos)

    #     model.add(emb_layer)
    
    #model = Embedding(input_dim=n_words + 1, output_dim = n_labels, input_length = max_len)(inp)
    model1 = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.1, dropout=0.2))(model1)
    model2 = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.1, dropout=0.2))(model2)
    model3 = Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.1, dropout=0.2))(model3)
    model = Concatenate()([model1, model2, model3])
    model = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))(model)
    model = Bidirectional(SimpleRNN(64, return_sequences=True))(model)
    # model = add([model, model2])
    model = Dense(n_labels, activation="softmax")(model)
    print(model.shape)
    #crf = tfa.layers.CRF(n_labels)
#     crf = Layer(CRF(n_labels))
#     model.add(crf)

#     model.add(dense)
    
    # create and compile model
#     model = Model(inp, model)
                    
    #model = CRFModel(model, n_labels)
    
    model = Model(inputs=[inp, inp_suf, inp_pos], outputs=model)
    optimiz = Adam(lr=0.005, amsgrad=True, epsilon=1e-7)
    model.compile(optimizer=optimiz, loss='categorical_crossentropy', metrics=["accuracy"])
    
    return model

***

In [399]:
import tensorflow as tf
model = build_network(idx)
model.summary()

(None, 20, 10)
Model: "model_80"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words (InputLayer)              [(None, 20)]         0                                            
__________________________________________________________________________________________________
suf (InputLayer)                [(None, 20)]         0                                            
__________________________________________________________________________________________________
pos_tags (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding_136 (Embedding)       (None, 20, 64)       531520      words[0][0]                      
____________________________________________________________________________



In [400]:
history = model.fit({"words": X_train, "suf": X_train_suf, "pos_tags": X_train_pos}, np.array(Y_train),
                   batch_size=16,
                   epochs=1,
                   verbose=1, 
                   validation_data=({"words": X_dev, "suf": X_dev_suf, "pos_tags": X_dev_pos}, np.array(Y_dev)))



In [328]:
def save_model_and_indexes(model, idx):#, filename):
    '''
    Task: Save given model and indexs to disk
    Input: 
        model: Keras model created by _build_network, and trained.
        idx: A dictionary produced by create_indexs, containing word and label indexes, 
             as well as the maximum sentence length. 
        filename: Saves the mode into filename.nn and the indexes into filename.idx
    '''
    model.save("ner-nn.nn")
    
    file = open("index_ner.pkl", "wb")
    pickle.dump(idx, file)
    file.close()

## `Learner()`

In [401]:
def learner(traindir, validationdir):#, modelname):
    '''
    Learns a NN model using traindir as training data, and validationdir as validation data.
    Saves learnt model in a file named modelname
    '''
    # load train and validation data in a suitable form
    train_dataset = load_data(traindir)
    val_dataset = load_data(validationdir)
    
    # create indexes from trainindg data 
    max_len = 20
    idx = create_index(train_dataset, max_len)
    
    # build network 
    model = build_network(idx)
    
    # encode datasets
    Xtrain = encode_words(train_dataset, idx)
    Y, Ytrain = encode_labels(train_dataset, idx)
    Xval = encode_words(val_dataset, idx)
    Yv, Yval = encode_labels(val_dataset, idx)
    
    X_train_suf = encode_suffixes(train_dataset, idx, suffix_index)
    X_train_pos = encode_pos_tags(train_dataset, idx, pos_index)
    X_val_suf = encode_suffixes(val_dataset, idx, suffix_index)
    X_val_pos = encode_pos_tags(val_dataset, idx, pos_index)

    # train model
#     model.fit(Xtrain, np.array(Ytrain),
#               batch_size=16,
#               epochs=3,
#               verbose=1,
#               validation_data=(Xval, np.array(Yval)))

    model.fit({"words": Xtrain, "suf": X_train_suf, "pos_tags": X_train_pos}, np.array(Y_train),
                   batch_size=16,
                   epochs=3,
                   verbose=1, 
                   validation_data=({"words": Xval, "suf": X_val_suf, "pos_tags": X_val_pos}, np.array(Y_dev)))
    
    # save model and indexs, for later use in prediction
    save_model_and_indexes(model, idx)#, path_dev.split('/')[-1]+"NER-learned")

In [402]:
path_train = "../../labAHLT/data/train"
path_dev = "../../labAHLT/data/devel"
learner(path_train, path_dev)

(None, 20, 10)
Epoch 1/3
Epoch 2/3
Epoch 3/3
INFO:tensorflow:Assets written to: ner-nn.nn/assets


INFO:tensorflow:Assets written to: ner-nn.nn/assets


***
## Functions Classifier

In [403]:
def load_model_and_indexs():
    '''
    Task: Load model and associate indexs from disk.
    Input:
        filename: filename to be loaded
    Output: Loads a model from filename.nn and its indexes from filename.idx
            Returns the loaded model and indexes. 
    '''
    model = load_model("ner-nn.nn")
    index = open("index_ner.pkl", "rb")
    idx = pickle.load(index)
    
    return model, idx

In [404]:
def output_entities(dataset, preds, outfile):
    '''
    Task: Output detected entities in the format expected by the evaluator
    Input: 
        dataset: A dataset produced by load_data.
        preds: For each sentence in dataset, a list with the labels for each sentence token, 
               as predicted by the model.
    Output: prints the detected entities to stdout in the format required by the evaluator. 
    '''
    outf = open(outfile, 'w')
    for sentence, pred in zip(dataset.items(), preds):
#         print(sentence, pred)
        sid = sentence[0]
        tokens = sentence[1]
        for i in range(min(len(tokens), len(pred))):
            token = tokens[i]
            label = pred[i]
            if label[0] == 'B':
                offset_from = str(token[1])
                offset_to = str(token[2])
                tag_name = label[2:]
                entity = token[0]
                j = i+1
                while j < len(tokens) and len(tokens[j]) >=3 and j>len(pred): ## I added j>len(pred) because an error of index range arised
                    token_next = tokens[j]
                    word_next = token_next[0]
                    offset_from_next = str(token_next[1])
                    offset_to_next = str(token_next[2])
                    tag_next = pred[j]
                    j += 1
                    if int(offset_from_next) - int(offset_to) > 3 or tag_next[0] != 'I':
                        break
                    if tag_next[2:] == tag_name:
                        entity = entity + ' ' + word_next
                        offset_to = offset_to_next
                outf.write(sid + "|" + offset_from + '-' + offset_to + "|" + entity + "|" + tag_name+'\n')
                print(sid + "|" + offset_from + '-' + offset_to + "|" + entity + "|" + tag_name)

In [405]:
# Y_pred = model.predict(X_dev)
# Y = np.argmax(Y_pred, axis=-1)
# # for y in Y_pred:
# #     for s in y:
# #         print(s)
# #         print(np.argmax(s))
# key_list = list(idx['labels'].keys())
# val_list = list(idx['labels'].values())
# for y in Y:
#     for t in y:
#         if t in [3,4,5,6,7,8,9]: 
#             print(key_list[t])
#         break

## `Classifier()`

In [406]:
def predict(datadir, outfile):
    '''
    Loads a NN model from a file 'modelname' and uses it to extract drugs in datadir. Saves
    results to 'outfile' in the appropriate format
    '''
    
    # load model and associated encoding data
    model, idx = load_model_and_indexs()
    
    # load data to annotate
    testdata = load_data(datadir)
#     print(idx)
    
    # encode dataset
    X = encode_words(testdata, idx)
    X_suf = encode_suffixes(testdata, idx, suffix_index)
    X_pos = encode_pos_tags(testdata, idx, pos_index)
    
    # tag sentences in dataset
    Y = model.predict({"words": X, "suf": X_suf, "pos_tags": X_pos})
    Y = [[find_label(idx, np.argmax(y)) for y in s] for s in Y]
    
    # extract entities and dump them to output file
    output_entities(testdata, Y, outfile)
    
    # evaluate using official evaluator
    evaluate("NER", datadir, outfile)

In [407]:
def find_label(idx, predicted):
    for label, i in idx["labels"].items():
        if i == predicted:
            return label

In [408]:
path_test = "../../labAHLT/data/test"
predict(path_test, "NER-result2")

DDI-DrugBank.d485.s0|0-6|Ethinyl|drug
DDI-DrugBank.d485.s2|0-12|Acetaminophen|drug
DDI-DrugBank.d485.s2|52-60|synthetic|group
DDI-DrugBank.d485.s3|12-19|hormonal|group
DDI-DrugBank.d485.s3|82-94|acetaminophen|drug
DDI-DrugBank.d485.s4|0-8|Acitretin|drug
DDI-DrugBank.d485.s5|20-33|progestational|group
DDI-DrugBank.d485.s6|0-16|Aminoglutethimide|drug
DDI-DrugBank.d485.s6|50-59|progestins|group
DDI-DrugBank.d485.s6|93-105|contraceptive|group
DDI-DrugBank.d485.s7|9-19|nonhormonal|group
DDI-DrugBank.d485.s9|9-19|nonhormonal|group
DDI-DrugBank.d485.s10|0-13|Anticoagulants|group
DDI-DrugBank.d485.s10|28-35|hormonal|group
DDI-DrugBank.d485.s10|92-99|coumarin|group
DDI-DrugBank.d485.s11|12-19|hormonal|group
DDI-DrugBank.d485.s12|0-14|Anticonvulsants|group
DDI-DrugBank.d485.s12|17-29|carbamazepine|drug
DDI-DrugBank.d485.s12|32-40|felbamate|drug
DDI-DrugBank.d485.s12|43-55|phenobarbital|drug
DDI-DrugBank.d485.s12|58-66|phenytoin|drug
DDI-DrugBank.d485.s12|69-78|topiramate|drug
DDI-DrugBank.d485.s

Ignoring duplicated entity in system predictions file: DDI-DrugBank.d485.s16|0-11|Atorvastatin|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d485.s23|0-11|Griseofulvin|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d485.s36|0-7|Rifampin|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d485.s43|62-69|caffeine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d167.s2|66-76|anesthetics|group
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d480.s10|22-32|epinephrine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d480.s15|47-55|Clozapine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d480.s18|42-54|carbamazepine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d480.s18|28-36|Clozapine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d480.s20|66-74|clozapine|drug
Ignoring dup