## Neural Network NERC

In [2]:
import numpy as np
from os import listdir
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
#from keras_contrib.layers import CRF

%matplotlib inline

## Functions Learner

In [3]:
def load_data(datadir):
    '''
    Task: Load XML files in given directory, tokenize each sentence, and extract
    ground truth BIO labels for each token.
    
    Input: 
        datadir: A directory containing XML files
        
    Output: A directory containing the dataset. Dictionary key is sentence_id, and the 
            value is a list of token tuples (word, start, end, ground truth).
            
    Example: 
            >>> load_data('data/Train')
            {'DDI-DrugBank.d370.s0': [('as', 0, 1,'O'), ('differin', 3, 10,'B-brand'),
                     ('gel', 12, 14,'O'), ... , ('with', 343, 346, 'O'),
                     ('caution', 348, 354, 'O'), ('.', 355, 355, 'O')],
            'DDI-DrugBank.d370.s1': [('particular', 0, 9, 'O'), ('caution', 11, 17, 'O'),
                     ('should', 19, 24, 'O'), ... , ('differin', 130, 137, 'B-brand'),
                     ('gel', 139, 141, 'O'), ('.', 142, 142, 'O')], ... }
    '''
    
    sentences_ids = []
    sentences_dataset = []
    labels = []
    dict_dataset = {}
    dict_labels = {}
    
    for f in listdir(datadir):
        tree = parse(datadir + "/" + f)
        
        sentences = tree.getElementsByTagName("sentence")
        for s in sentences: 
            sid = s.attributes["id"].value
            sentences_ids.append(sid)
            
            stext = s.attributes["text"].value
            stext = stext.replace("-"," ")
            sentences_dataset.append(stext)
            
            ents = {}
            entities = s.getElementsByTagName("entity")
            for e in entities:
                offset = e.attributes["charOffset"].value
                start = offset.split('-')[0]
                end = offset.split('-')[1]
                name = e.attributes["text"].value
                e_type = e.attributes["type"].value
                ents[name] = [e_type, start]
            
            punct = [".",",",";",":","?","!"]
            tokens = word_tokenize(stext)
            tokens_cleaned = []
            for t in tokens:
                if t not in punct:
                    tokens_cleaned.append(t)
            
            tags = []
            tokens = []
            for t in tokens_cleaned:
                offsetFrom = stext.find(t)
                offsetTo = offsetFrom + len(t) - 1
                if t in ents:
                    if (int(ents[t][1]) == 0):
                        tag = "B-"+ents[t][0] # TODO: ents after .?
                    else:
                        tag = "I-"+ents[t][0]
                else:
                    tag = "O"
                tags.append(tag)
                tupl = (t, offsetFrom, offsetTo, tag)
                tokens.append(tupl)
            
            dict_dataset[sid] = tokens
            dict_labels[sid] = tags
            labels.append(tags)
        
    return dict_dataset, dict_labels, sentences_dataset, sentences_ids, labels

In [4]:
path_train = "/Users/mponsclo/Documents/Master/labAHLT/data/train"
train_dataset, dict_labels, train_sentences, train_ids, train_labels = load_data(path_train)
n_words = sum([len(t) for t in train_sentences]) ; n_words # total number of words in train dataset
n_tags = 9 

In [5]:
def create_index(dataset, max_length):
    '''
    Task: Create index dictionaries both for input (words) and output (labels) from given dataset
    Input: 
        dataset: dataset produced by load_data.
        max_length: maximum length of a sentence (longer sentences will be cut, shorter ones will be padded).
        
    Output: A dictionary where each key is an index name (e.g. "words", "labels"), and the value is a 
            dictionary mapping each word/label to a number. An entry with the value for maxlen is also stored
    Example: 
        >>> create_indx(traindata)
        {'words': {'<PAD>':0, '<UNK>':1, '11-day':2, 'murine':3, 'criteria':4,
                   'stroke':5, ... ,'levodopa':8511, 'terfenadine': 8512}
         'labels': {'<PAD>':0, 'B-group':1, 'B-drug_n':2, 'I-drug_n':3, 'O':4, 
                    'I-group':5, 'B-drug':6, 'I-drug':7, 'B-brand':8, 'I-brand':9}
         'maxlen': 100 }
    '''
    
    tokenizer = Tokenizer(num_words=8000, lower = True, char_level=False, oov_token="<UNK>")
    tokenizer_labs = Tokenizer(num_words=12, lower=False, char_level=False, oov_token="<UNK>")
    
    tokenizer.fit_on_texts(train_sentences)
    word_index = tokenizer.word_index
    #print(word_index)

    tokenizer_labs.fit_on_texts(train_labels)
    word_index_labs = tokenizer_labs.word_index
    #print(word_index_labs)

    indexs = {}
    indexs['words'] = word_index
    indexs['labels'] = word_index_labs
    indexs['maxlen'] = max_length
    
    return indexs    

In [6]:
idx = create_index(train_sentences, 100)
#idx['words']
idx['labels']
#idx['maxlen']

{'<UNK>': 1,
 'O': 2,
 'I-drug': 3,
 'I-group': 4,
 'I-brand': 5,
 'B-drug': 6,
 'I-drug_n': 7,
 'B-group': 8,
 'B-brand': 9,
 'B-drug_n': 10}

In [7]:
def encode_words(dataset, idx):
    '''
    Task: Encode the words in a sentence dataset formed by lists of tokens into lists of indexes
          suitable for NN input.
    Input: 
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_indexs, containing word and label indexes, as well
             as the maximum sentence length.
             
    Output: The dataset encoded as a list of sentence, each of them is a list of word indices.
            If the word is not in the index, <UNK> code is used. If the sentence is shorter than
            max_len it is padded with <PAD> code.
    Example: 
        >>> encode_words(traindata, idx)
            [ [6882 1049 4911 ... 0 0 0 ]
            [  2290 7548 8069 ... 0 0 0 ]
               ...
            [  2002 6582 7518 ... 0 0 0 ] ]
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in train_dataset.items():
        aux = []
        for t in item:
            w = str(t[0]).lower()
            if w in idx['words']:
                i = idx['words'][w]
            else:
                i = idx['words']['<UNK>']
            aux.append(i)
        seq.append(aux)  
    
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    
    return seq_padded

def encode_labels(dataset, idx):
    '''
    Task: Encode the ground truth labels in a dataset formed by lists of tokens into lists of indexes
        suitable for NN output.
    Input:
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_index, containing word and label indexes, as well as the maximum length.
        
    Output: The dataset encoded as a list of sentence, each of them is a list of BIO label indices. If the sentence
            is shorter than max_len it is padded with <PAD> code. 
    
    Example :
     >>> encode_labels ( traindata , idx )
        [[ [4] [6] [4] [4] [4] [4] ... [0] [0] ]
        [  [4] [4] [8] [4] [6] [4] ... [0] [0] ]
          ...
        [
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dict_labels.items():
        aux = []
        for t in item:
            w = str(t)
            i = idx['labels'][w]
            aux.append(i)
        seq.append(aux)
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    seq_categ = [to_categorical(i, num_classes = 11) for i in seq_padded]  # 10 classes + 1 UNK
    
    return seq_padded, seq_categ

In [8]:
X_train = encode_words(train_dataset, idx)
Y, Y_train = encode_labels(dict_labels, idx)

In [9]:
print(X_train)

[[1489 1489  386 ...    0    0    0]
 [  50  101 2385 ...    0    0    0]
 [ 181   48 2056 ...    0    0    0]
 ...
 [  80    5   16 ...    0    0    0]
 [  57  573   41 ...    0    0    0]
 [  12  317    6 ...    0    0    0]]


In [10]:
print(Y[0])
print(Y_train[0])

[2 2 2 2 3 2 2 2 2 2 2 2 2 2 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [228]:
def build_network(idx):
    '''
    Task: Create network for the learner. 
    Input:
        idx: index dictionary with word/labels codes, plus maximum sentence length.
    Output: Returns a compiled Keras neural network with the specified layers
    '''
    
    #sizes
    n_words = len(idx['words'])
    n_labels = len(idx['labels'])+1
    max_len = idx['maxlen']
    
    # create network layers
    inp = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim = n_labels, input_length = max_len)(inp)
    model = Dropout(0.2)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_labels, activation="softmax"))(model)
    
    # create and compile model
    model = Model(inp, out)
    
    optimiz = Adam(lr=0.01, decay=1e-6)
    model.compile(optimizer=optimiz, loss="categorical_crossentropy", metrics=["accuracy"])
    
    return model

In [209]:
model = build_network(idx)
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 100, 11)           81136     
_________________________________________________________________
dropout_8 (Dropout)          (None, 100, 11)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 100, 200)          89600     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 100, 11)           2211      
Total params: 172,947
Trainable params: 172,947
Non-trainable params: 0
_________________________________________________________________


In [210]:
history = model.fit(X_train, np.array(Y_train),
                   batch_size=32,
                   epochs=2,
                   verbose=1)

Epoch 1/2
Epoch 2/2


In [None]:
def save_model_and_indexes(model, idx, filename):
    '''
    Task: Save given model and indexs to disk
    Input: 
        model: Keras model created by _build_network, and trained.
        idx: A dictionary produced by create_indexs, containing word and label indexes, 
             as well as the maximum sentence length. 
        filename: Saves the mode into filename.nn and the indexes into filename.idx
    '''
    
    # Use Keras.model.save and keras.models.load_model functions to save/load the model
    # Use your preferred method (pickel, plain text, etc) to save/load the index dict.

## `Learner()`

In [231]:
def learner(traindir, validationdir):#, modelname):
    '''
    Learns a NN model using traindir as training data, and validationdir as validation data.
    Saves learnt model in a file named modelname
    '''
    # load train and validation data in a suitable form
    train_dataset, dict_labels, train_sentences, train_ids, train_labels = load_data(traindir)
    val_dataset, dictv_labels, val_sentences, val_ids, val_labels = load_data(validationdir)
    
    # create indexes from trainindg data 
    max_len = 100
    idx = create_index(train_sentences, max_len)
    
    # build network 
    model = build_network(idx)
    
    # encode datasets
    Xtrain = encode_words(train_dataset, idx['words'], idx['maxlen'])
    Y, Ytrain = encode_labels(dict_labels, idx['labels'], idx['maxlen'])
    Xval = encode_words(val_dataset, idx['words'], idx['maxlen'])
    Yv, Yval = encode_labels(dictv_labels, idx['labels'], idx['maxlen'])
    
    # train model
    history = model.fit(Xtrain, np.array(Ytrain),
                        batch_size=32,
                        epochs=2,
                        verbose=1,
                        validation_data=(Xval, np.array(Yval)))
    
    # save model and indexs, for later use in prediction
    #save_model_and_indexs(model, idx, modelname)

In [232]:
path_train = "/Users/mponsclo/Documents/Master/labAHLT/data/train"
path_dev = "/Users/mponsclo/Documents/Master/labAHLT/data/devel"
learner(path_train, path_dev)

Epoch 1/2
Epoch 2/2


***
## Functions Classifier

In [None]:
def load_model_and_indexs(filename):
    '''
    Task: Load model and associate indexs from disk.
    Input:
        filename: filename to be loaded
    Output: Loads a model from filename.nn and its indexes from filename.idx
            Returns the loaded model and indexes. 
    '''
    
    # Use Keras.model.save and keras.models.load_model functions to save/load the model
    # Use your preferred method (pickel, plain text, etc) to save/load the index dict.

In [None]:
def output_entities(dataset, preds):
    '''
    Task: Output detected entities in the format expected by the evaluator
    Input: 
        dataset: A dataset produced by load_data.
        preds: For each sentence in dataset, a list with the labels for each sentence token, 
               as predicted by the model.
    Output: prints the detected entities to stdout in the format required by the evaluator. 
    '''

## `Classifier()`

In [None]:
def predict(modelname, datadir, outfile):
    '''
    Loads a NN model from a file 'modelname' and uses it to extract drugs in datadir. Saves
    results to 'outfile' in the appropriate format
    '''
    
    # load model and associated encoding data
    model, idx = load_model_and_indexs(modelname)
    
    # load data to annotate
    testdata = load_data(datadir)
    
    # encode dataset
    X = encode_words(testdata, idx)
    
    # tag sentences in dataset
    Y = model.predict(X)
    # get most likely tag for each word
    Y = [[idx['labels'][np.argmax(y)] for y in s] for s in Y]
    
    # extract entities and dump them to output file
    output_entities(testdata, Y, outfile)
    
    # evaluate using official evaluator
    evaluation(datadir, outfile)