## Neural Network NER

In [1]:
import numpy as np
from os import listdir
import pickle
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from keras.models import Model, Input
from keras.layers import *
#from keras_contrib.layers import CRF

stopwords = set(stopwords.words("english"))
from evaluator import *

%matplotlib inline

## Functions Learner

In [18]:
def load_data(datadir):
    '''
    Task: Load XML files in given directory, tokenize each sentence, and extract
    ground truth BIO labels for each token.
    
    Input: 
        datadir: A directory containing XML files
        
    Output: A directory containing the dataset. Dictionary key is sentence_id, and the 
            value is a list of token tuples (word, start, end, ground truth).
            
    Example: 
            >>> load_data('data/Train')
            {'DDI-DrugBank.d370.s0': [('as', 0, 1,'O'), ('differin', 3, 10,'B-brand'),
                     ('gel', 12, 14,'O'), ... , ('with', 343, 346, 'O'),
                     ('caution', 348, 354, 'O'), ('.', 355, 355, 'O')],
            'DDI-DrugBank.d370.s1': [('particular', 0, 9, 'O'), ('caution', 11, 17, 'O'),
                     ('should', 19, 24, 'O'), ... , ('differin', 130, 137, 'B-brand'),
                     ('gel', 139, 141, 'O'), ('.', 142, 142, 'O')], ... }
    '''
    
    dict_dataset = {}
    
    for f in listdir(datadir):
        tree = parse(datadir + "/" + f)
        
        sentences = tree.getElementsByTagName("sentence")
        for s in sentences: 
            sid = s.attributes["id"].value
            
            stext = s.attributes["text"].value
            #stext = stext.replace("-"," ") if used we lose beta-endorphin, if not use we lose calcium-rich tag
            
            ents = {}
            entities = s.getElementsByTagName("entity")
            for e in entities:
                offset = e.attributes["charOffset"].value
                start = offset.split('-')[0]
                end = offset.split('-')[1]
                name = e.attributes["text"].value
                e_type = e.attributes["type"].value
                ents[name] = [e_type, start, end]
            
            punct = [",",";",":","?","!", "(", ")"] # removed "."
            tokens = word_tokenize(stext)
            tokens_cleaned = []
            for t in tokens:
                if t not in punct:
                    tokens_cleaned.append(t)
            
            tags = []
            tokens = []
            for t in tokens_cleaned:
                offsetFrom = stext.find(t)
                offsetTo = offsetFrom + len(t) - 1
                if t in ents:
                    if (int(ents[t][1]) == 0):
                        tag = "B-"+ents[t][0] # TODO: ents after .?
                    else:
                        tag = "I-"+ents[t][0]
                else:
                    tag = "O"
                tags.append(tag)
                tupl = (t, offsetFrom, offsetTo, tag)
                tokens.append(tupl)
            
            dict_dataset[sid] = tokens
        
    return dict_dataset

# -- TODO: handle multi-token entities
# tricyclic antidepressants - 'O' should be Group
# chondroitin ABC lyase - 'O' should be drug_n
# heparinase III - 'O' should be drug_n
# hyaluronan lyase - 'O' should be drug_n
# Mercaptopurine/Azathioprine - 'O' should be drug
# muscle relaxants 

In [49]:
path_train = "../../labAHLT/data/train"
path_dev = "../../labAHLT/data/devel"

train_dataset = load_data(path_train)
devel_dataset = load_data(path_dev)

print(train_dataset['DDI-DrugBank.d661.s5'])

[('Interactions', 0, 11, 'O'), ('have', 13, 16, 'O'), ('been', 18, 21, 'O'), ('observed', 23, 30, 'O'), ('when', 32, 35, 'O'), ('other', 37, 41, 'O'), ('nondepolarizing', 43, 57, 'O'), ('muscle', 59, 64, 'O'), ('relaxants', 66, 74, 'O'), ('have', 13, 16, 'O'), ('been', 18, 21, 'O'), ('administered', 86, 97, 'O'), ('in', 55, 56, 'O'), ('succession', 102, 111, 'O'), ('.', 112, 112, 'O')]


In [30]:
def create_index(dataset, max_length):
    '''
    Task: Create index dictionaries both for input (words) and output (labels) from given dataset
    Input: 
        dataset: dataset produced by load_data.
        max_length: maximum length of a sentence (longer sentences will be cut, shorter ones will be padded).
        
    Output: A dictionary where each key is an index name (e.g. "words", "labels"), and the value is a 
            dictionary mapping each word/label to a number. An entry with the value for maxlen is also stored
    Example: 
        >>> create_indx(traindata)
        {'words': {'<PAD>':0, '<UNK>':1, '11-day':2, 'murine':3, 'criteria':4,
                   'stroke':5, ... ,'levodopa':8511, 'terfenadine': 8512}
         'labels': {'<PAD>':0, 'B-group':1, 'B-drug_n':2, 'I-drug_n':3, 'O':4, 
                    'I-group':5, 'B-drug':6, 'I-drug':7, 'B-brand':8, 'I-brand':9}
         'maxlen': 100 }
    '''
    
    index_words = {'<PAD>':0, '<UNK>':1}
    i = 2
    
    index_labels = {'<PAD>':0}
    j = 1
    
    for key, item in dataset.items():
        for t in item:
            word = t[0].lower() # use lower case words? 
            tag = t[3]
            if word not in index_words:
                index_words[word] = i
                i += 1
            if tag not in index_labels:
                index_labels[tag] = j
                j += 1

    indexs = {'words': index_words, 'labels': index_labels, 'maxlen':max_length}
    
    return indexs    

In [31]:
idx = create_index(train_dataset, 100)
idx['labels']

{'<PAD>': 0,
 'O': 1,
 'I-brand': 2,
 'I-group': 3,
 'I-drug': 4,
 'I-drug_n': 5,
 'B-drug_n': 6,
 'B-brand': 7,
 'B-drug': 8,
 'B-group': 9}

In [71]:
def encode_words(dataset, idx):
    '''
    Task: Encode the words in a sentence dataset formed by lists of tokens into lists of indexes
          suitable for NN input.
    Input: 
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_indexs, containing word and label indexes, as well
             as the maximum sentence length.
             
    Output: The dataset encoded as a list of sentence, each of them is a list of word indices.
            If the word is not in the index, <UNK> code is used. If the sentence is shorter than
            max_len it is padded with <PAD> code.
    Example: 
        >>> encode_words(traindata, idx)
            [ [6882 1049 4911 ... 0 0 0 ]
            [  2290 7548 8069 ... 0 0 0 ]
               ...
            [  2002 6582 7518 ... 0 0 0 ] ]
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = str(t[0]).lower() # When using lower case words
            if w in idx['words']:
                i = idx['words'][w]
            else:
                i = idx['words']['<UNK>']
            aux.append(i)
        seq.append(aux)  
    
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    
    return seq_padded

def encode_labels(dataset, idx):
    '''
    Task: Encode the ground truth labels in a dataset formed by lists of tokens into lists of indexes
        suitable for NN output.
    Input:
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_index, containing word and label indexes, as well as the maximum length.
        
    Output: The dataset encoded as a list of sentence, each of them is a list of BIO label indices. If the sentence
            is shorter than max_len it is padded with <PAD> code. 
    
    Example :
     >>> encode_labels ( traindata , idx )
        [[ [4] [6] [4] [4] [4] [4] ... [0] [0] ]
        [  [4] [4] [8] [4] [6] [4] ... [0] [0] ]
          ...
        [
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = t[3]
            i = idx['labels'][w]
            aux.append(i)
        seq.append(aux)
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    seq_categ = [to_categorical(i, num_classes = 10) for i in seq_padded]  # 9 classes + 1 PAD
    
    return seq_padded, seq_categ

In [72]:
X_train = encode_words(train_dataset, idx)
Y, Y_train = encode_labels(train_dataset, idx)

X_dev = encode_words(devel_dataset, idx)
Ydev, Y_dev = encode_labels(devel_dataset, idx)

In [73]:
print(X_train)

[[   2    2    3 ...    0    0    0]
 [  16   17   18 ...    0    0    0]
 [  25   26   27 ...    0    0    0]
 ...
 [1000   70   47 ...    0    0    0]
 [ 187  958  959 ...    0    0    0]
 [   9 1505  180 ...    0    0    0]]


In [74]:
print(Y[0])
print(Y_train[0][0:10])

[1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [76]:
def build_network(idx):
    '''
    Task: Create network for the learner. 
    Input:
        idx: index dictionary with word/labels codes, plus maximum sentence length.
    Output: Returns a compiled Keras neural network with the specified layers
    '''
    
    #sizes
    n_words = len(idx['words'])
    n_labels = len(idx['labels'])#+1
    max_len = idx['maxlen']
    
    # create network layers
    inp = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim = n_labels, input_length = max_len)(inp)
    model = Dropout(0.2)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_labels, activation="softmax"))(model)
    
    # create and compile model
    model = Model(inp, out)
    
    optimiz = Adam(lr=0.01, decay=1e-6)
    model.compile(optimizer=optimiz, loss="categorical_crossentropy", metrics=["accuracy"])
    
    return model

In [77]:
model = build_network(idx)
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 10)           83410     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 10)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 200)          88800     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 100, 10)           2010      
Total params: 174,220
Trainable params: 174,220
Non-trainable params: 0
_________________________________________________________________


In [78]:
history = model.fit(X_train, np.array(Y_train),
                   batch_size=32,
                   epochs=2,
                   verbose=1, 
                   validation_data=(X_dev, np.array(Y_dev)))

Epoch 1/2
Epoch 2/2


In [82]:
def save_model_and_indexes(model, idx, filename):
    '''
    Task: Save given model and indexs to disk
    Input: 
        model: Keras model created by _build_network, and trained.
        idx: A dictionary produced by create_indexs, containing word and label indexes, 
             as well as the maximum sentence length. 
        filename: Saves the mode into filename.nn and the indexes into filename.idx
    '''
    model.save("ner-nn.nn")
    
    file = open("index_ner.pkl", "wb")
    pickle.dump(idx, file)
    file.close()

## `Learner()`

In [80]:
def learner(traindir, validationdir):#, modelname):
    '''
    Learns a NN model using traindir as training data, and validationdir as validation data.
    Saves learnt model in a file named modelname
    '''
    # load train and validation data in a suitable form
    train_dataset = load_data(traindir)
    val_dataset = load_data(validationdir)
    
    # create indexes from trainindg data 
    max_len = 100
    idx = create_index(train_dataset, max_len)
    
    # build network 
    model = build_network(idx)
    
    # encode datasets
    Xtrain = encode_words(train_dataset, idx)
    Y, Ytrain = encode_labels(train_dataset, idx)
    Xval = encode_words(val_dataset, idx)
    Yv, Yval = encode_labels(val_dataset, idx)
    
    # train model
    model.fit(Xtrain, np.array(Ytrain),
              batch_size=32,
              epochs=2,
              verbose=1,
              validation_data=(Xval, np.array(Yval)))
    
    # save model and indexs, for later use in prediction
    save_model_and_indexs(model, idx)

In [81]:
path_train = "../../labAHLT/data/train"
path_dev = "../../labAHLT/data/devel"
learner(path_train, path_dev)

Epoch 1/2
Epoch 2/2


***
## Functions Classifier

In [None]:
def load_model_and_indexs(filename):
    '''
    Task: Load model and associate indexs from disk.
    Input:
        filename: filename to be loaded
    Output: Loads a model from filename.nn and its indexes from filename.idx
            Returns the loaded model and indexes. 
    '''
    model = load_model("ner-nn.nn")
    index = open("index_ner.pkl", "rb")
    idx = pickle.load(index)
    
    return model, idx

In [None]:
def output_entities(dataset, preds):
    '''
    Task: Output detected entities in the format expected by the evaluator
    Input: 
        dataset: A dataset produced by load_data.
        preds: For each sentence in dataset, a list with the labels for each sentence token, 
               as predicted by the model.
    Output: prints the detected entities to stdout in the format required by the evaluator. 
    '''

In [117]:
#Y = model.predict(X_dev)
#Y = np.argmax(Y, axis=-1)
key_list = list(idx['labels'].keys())
val_list = list(idx['labels'].values())
for y in Y:
    for t in y:
        if t in [2,3,4,5,6,7,8,9]: 
            print(key_list[t])
        break

## `Classifier()`

In [None]:
def predict(modelname, datadir, outfile):
    '''
    Loads a NN model from a file 'modelname' and uses it to extract drugs in datadir. Saves
    results to 'outfile' in the appropriate format
    '''
    
    # load model and associated encoding data
    model, idx = load_model_and_indexs(modelname)
    
    # load data to annotate
    testdata = load_data(datadir)
    
    # encode dataset
    X = encode_words(testdata, idx)
    
    # tag sentences in dataset
    Y = model.predict(X)
    # get most likely tag for each word
    Y = [[idx['labels'][np.argmax(y)] for y in s] for s in Y]
    
    # extract entities and dump them to output file
    output_entities(testdata, Y, outfile)
    
    # evaluate using official evaluator
    evaluation(datadir, outfile)