<a href="https://colab.research.google.com/github/zoftar/ahlt/blob/main/3.%20NN/NER-NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Neural Network NER

In [58]:
import numpy as np
from os import listdir
import pickle
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# import tensorflow_addons as tfa
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from keras.models import Model, Input, Sequential
from keras.layers import *
from keras_contrib.layers import CRF


stopwords = set(stopwords.words("english"))
from evaluator import *

%matplotlib inline
%tensorflow_version 1.x

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
TensorFlow is already loaded. Please restart the runtime to change versions.


In [24]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-5c56zqtn
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-5c56zqtn
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp37-none-any.whl size=101078 sha256=afb2164b00c098d5d97d388f3ff05c8240b49edc381e4deccb6dc6c7d65cfc16
  Stored in directory: /tmp/pip-ephem-wheel-cache-chly0nxx/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Installing collected packages: keras-contrib
Successfully installed keras-contrib-2.0.8


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Functions Learner

In [14]:
def find_multi_token_entity(ents, token):
    keys = ents.keys()
    for k in keys:
        if token in k:
            return ents[k]

def load_data(datadir):
    '''
    Task: Load XML files in given directory, tokenize each sentence, and extract
    ground truth BIO labels for each token.
    
    Input: 
        datadir: A directory containing XML files
        
    Output: A directory containing the dataset. Dictionary key is sentence_id, and the 
            value is a list of token tuples (word, start, end, ground truth).
            
    Example: 
            >>> load_data('data/Train')
            {'DDI-DrugBank.d370.s0': [('as', 0, 1,'O'), ('differin', 3, 10,'B-brand'),
                     ('gel', 12, 14,'O'), ... , ('with', 343, 346, 'O'),
                     ('caution', 348, 354, 'O'), ('.', 355, 355, 'O')],
            'DDI-DrugBank.d370.s1': [('particular', 0, 9, 'O'), ('caution', 11, 17, 'O'),
                     ('should', 19, 24, 'O'), ... , ('differin', 130, 137, 'B-brand'),
                     ('gel', 139, 141, 'O'), ('.', 142, 142, 'O')], ... }
    '''
    
    dict_dataset = {}
    
    for f in listdir(datadir):
        tree = parse(datadir + "/" + f)
        
        sentences = tree.getElementsByTagName("sentence")
        for s in sentences: 
            sid = s.attributes["id"].value
            
            stext = s.attributes["text"].value
            #stext = stext.replace("-"," ") if used we lose beta-endorphin, if not use we lose calcium-rich tag
            
            ents = {}
            entities = s.getElementsByTagName("entity")
            for e in entities:
                offset = e.attributes["charOffset"].value
                offsets = offset.split(';')
                name = e.attributes["text"].value
                e_type = e.attributes["type"].value
                ents[name] = {"type": e_type, "offsets": []}
                for offset in offsets:
                    start = offset.split('-')[0]
                    end = offset.split('-')[1]
                    ents[name]["offsets"].append((start, end))
            
            punct = [",",";",":","?","!", "(", ")"] # removed "."
            tokens = word_tokenize(stext)
            tokens_cleaned = []
            for t in tokens:
                if t not in punct and t not in stopwords:
                    tokens_cleaned.append(t)
            
            tags = []
            tokens = []
            for t in tokens_cleaned:
                offsetFrom = stext.find(t)
                offsetTo = offsetFrom + len(t) - 1
                # 1-token entities
                if t in ents:
                    if (int(ents[t]["offsets"][0][0]) == offsetFrom):
                        tag = "B-"+ents[t]["type"] # TODO: ents after .?
                    else:
                        tag = "I-"+ents[t]["type"]                    
                else:
                    multi_token_ent = find_multi_token_entity(ents, t)
                    if multi_token_ent:
                        if (int(multi_token_ent["offsets"][0][0]) == offsetFrom):
                            tag = "B-"+multi_token_ent["type"] # TODO: ents after .?
                        else:
                            tag = "I-"+multi_token_ent["type"]
                    else:
                        tag = "O"
                tags.append(tag)
                tupl = (t, offsetFrom, offsetTo, tag)
                tokens.append(tupl)
            
            dict_dataset[sid] = tokens
        
    return dict_dataset

# -- TODO: handle multi-token entities # I think I've done it :D
# tricyclic antidepressants - 'O' should be Group
# chondroitin ABC lyase - 'O' should be drug_n
# heparinase III - 'O' should be drug_n
# hyaluronan lyase - 'O' should be drug_n
# Mercaptopurine/Azathioprine - 'O' should be drug
# muscle relaxants 

In [15]:
path_train = "../../labAHLT/data/train"
path_dev = "../../labAHLT/data/devel"
path_train = "drive/MyDrive/UPC/labAHLT/data/train"
path_dev = "drive/MyDrive/UPC/labAHLT/data/devel"


train_dataset = load_data(path_train)
devel_dataset = load_data(path_dev)

print(train_dataset['DDI-DrugBank.d661.s5'])

[('Interactions', 0, 11, 'O'), ('observed', 23, 30, 'O'), ('nondepolarizing', 43, 57, 'B-group'), ('muscle', 59, 64, 'I-group'), ('relaxants', 66, 74, 'I-group'), ('administered', 86, 97, 'O'), ('succession', 102, 111, 'O'), ('.', 112, 112, 'O')]


In [16]:
def create_index(dataset, max_length):
    '''
    Task: Create index dictionaries both for input (words) and output (labels) from given dataset
    Input: 
        dataset: dataset produced by load_data.
        max_length: maximum length of a sentence (longer sentences will be cut, shorter ones will be padded).
        
    Output: A dictionary where each key is an index name (e.g. "words", "labels"), and the value is a 
            dictionary mapping each word/label to a number. An entry with the value for maxlen is also stored
    Example: 
        >>> create_indx(traindata)
        {'words': {'<PAD>':0, '<UNK>':1, '11-day':2, 'murine':3, 'criteria':4,
                   'stroke':5, ... ,'levodopa':8511, 'terfenadine': 8512}
         'labels': {'<PAD>':0, 'B-group':1, 'B-drug_n':2, 'I-drug_n':3, 'O':4, 
                    'I-group':5, 'B-drug':6, 'I-drug':7, 'B-brand':8, 'I-brand':9}
         'maxlen': 100 }
    '''
    
    index_words = {'<PAD>':0, '<UNK>':1}
    i = 2
    
    index_labels = {'<PAD>':0}
    j = 1
    
    
    
    for key, item in dataset.items():
        for t in item:
            word = t[0].lower() # use lower case words? 
            tag = t[3]
            if word not in index_words:
                index_words[word] = i
                i += 1
            if tag not in index_labels:
                index_labels[tag] = j
                j += 1

    indexs = {'words': index_words, 'labels': index_labels, 'maxlen':max_length}
    
    return indexs    

In [32]:
idx = create_index(train_dataset, 20)
idx['labels']

{'<PAD>': 0,
 'B-brand': 3,
 'B-drug': 1,
 'B-drug_n': 9,
 'B-group': 5,
 'I-brand': 7,
 'I-drug': 4,
 'I-drug_n': 8,
 'I-group': 6,
 'O': 2}

In [18]:
def encode_words(dataset, idx):
    '''
    Task: Encode the words in a sentence dataset formed by lists of tokens into lists of indexes
          suitable for NN input.
    Input: 
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_indexs, containing word and label indexes, as well
             as the maximum sentence length.
             
    Output: The dataset encoded as a list of sentence, each of them is a list of word indices.
            If the word is not in the index, <UNK> code is used. If the sentence is shorter than
            max_len it is padded with <PAD> code.
    Example: 
        >>> encode_words(traindata, idx)
            [ [6882 1049 4911 ... 0 0 0 ]
            [  2290 7548 8069 ... 0 0 0 ]
               ...
            [  2002 6582 7518 ... 0 0 0 ] ]
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = str(t[0]).lower() # When using lower case words
            if w in idx['words']:
                i = idx['words'][w]
            else:
                i = idx['words']['<UNK>']
            aux.append(i)
        seq.append(aux)  
    
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post')
    
    return seq_padded

def encode_labels(dataset, idx):
    '''
    Task: Encode the ground truth labels in a dataset formed by lists of tokens into lists of indexes
        suitable for NN output.
    Input:
        dataset: A dataset produced by load_data.
        idx: A dictionary produced by create_index, containing word and label indexes, as well as the maximum length.
        
    Output: The dataset encoded as a list of sentence, each of them is a list of BIO label indices. If the sentence
            is shorter than max_len it is padded with <PAD> code. 
    
    Example :
     >>> encode_labels ( traindata , idx )
        [[ [4] [6] [4] [4] [4] [4] ... [0] [0] ]
        [  [4] [4] [8] [4] [6] [4] ... [0] [0] ]
          ...
        [
    '''
    max_length = idx['maxlen']
    seq = []
    for key, item in dataset.items():
        aux = []
        for t in item:
            w = t[3]
            i = idx['labels'][w]
            aux.append(i)
        seq.append(aux)
    seq_padded = pad_sequences(maxlen = max_length, sequences = seq, padding = 'post', truncating="post")
    seq_categ = [to_categorical(i, num_classes = 10) for i in seq_padded]  # 9 classes + 1 PAD
    
    return seq_padded, seq_categ

In [33]:
X_train = encode_words(train_dataset, idx)
Y, Y_train = encode_labels(train_dataset, idx)

X_dev = encode_words(devel_dataset, idx)
Ydev, Y_dev = encode_labels(devel_dataset, idx)

In [None]:
print(X_train)

[[   2    2    3 ...    0    0    0]
 [  12   13    6 ...    0    0    0]
 [  17   18   19 ...   33   13   11]
 ...
 [ 941  128   31 ...    0    0    0]
 [ 154  898  899 ...    0    0    0]
 [   7  514  147 ... 1025 8303   11]]


In [None]:
print(Y[0])
print(Y_train[0][0:10])

[1 1 1 1 1 1 1 1 1 2 1 0 0 0 0 0 0 0 0 0]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]


In [21]:
## GloVe
# http://nlp.stanford.edu/data/glove.6B.zip
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index['words']) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index['words']:
                idx = word_index['words'][word] 
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
glove_path = "../../labAHLT/data/glove.6B/glove.6B.300d.txt"
glove_path = "drive/MyDrive/UPC/labAHLT/data/glove.6B/glove.6B.300d.txt"

embedding_matrix = create_embedding_matrix(glove_path, idx, 256)

***

In [61]:
def build_network(idx):
    '''
    Task: Create network for the learner. 
    Input:
        idx: index dictionary with word/labels codes, plus maximum sentence length.
    Output: Returns a compiled Keras neural network with the specified layers
    '''
    
    #sizes
    n_words = len(idx['words'])
    n_labels = len(idx['labels'])#+1
    max_len = idx['maxlen']

    model = Sequential()
    
    # create network layers
    # inp = Input(shape=(max_len,))
    emb_layer = Embedding(input_dim=n_words+1, output_dim=256, weights=[embedding_matrix], input_length=max_len)
    model.add(emb_layer)
    #model = Embedding(input_dim=n_words + 1, output_dim = n_labels, input_length = max_len)(inp)
    bidir = Bidirectional(LSTM(units=512, return_sequences=True, recurrent_dropout=0.2, dropout=0.2))
    model.add(bidir)
    # model2 = Bidirectional(LSTM(units=512, return_sequences=True,recurrent_dropout=0.2, dropout=0.2))(emb_layer)
    # model = add([model, model2])
    #model = TimeDistributed(Dense(10, activation="relu"))(model)
    #crf = tfa.layers.CRF(n_labels)
    crf = Layer(CRF(n_labels))
    model.add(crf)
    dense = TimeDistributed(Dense(n_labels, activation="softmax"))
    model.add(dense)
    
    # create and compile model
    # model = Model(inp, out)
    
    optimiz = Adam(lr=0.005, decay=1e-6)
    model.compile(optimizer=optimiz, loss='mse', metrics=["accuracy"])
    
    return model

***

In [62]:
model = build_network(idx)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 20, 256)           2128896   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 20, 1024)          3149824   
_________________________________________________________________
layer_1 (Layer)              (None, 20, 1024)          0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 20, 10)            10250     
Total params: 5,288,970
Trainable params: 5,288,970
Non-trainable params: 0
_________________________________________________________________


  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [63]:
history = model.fit(X_train, np.array(Y_train),
                   batch_size=32,
                   epochs=1,
                   verbose=1, 
                   validation_data=(X_dev, np.array(Y_dev)))



In [None]:
def save_model_and_indexes(model, idx):#, filename):
    '''
    Task: Save given model and indexs to disk
    Input: 
        model: Keras model created by _build_network, and trained.
        idx: A dictionary produced by create_indexs, containing word and label indexes, 
             as well as the maximum sentence length. 
        filename: Saves the mode into filename.nn and the indexes into filename.idx
    '''
    model.save("ner-nn.nn")
    
    file = open("index_ner.pkl", "wb")
    pickle.dump(idx, file)
    file.close()

## `Learner()`

In [None]:
def learner(traindir, validationdir):#, modelname):
    '''
    Learns a NN model using traindir as training data, and validationdir as validation data.
    Saves learnt model in a file named modelname
    '''
    # load train and validation data in a suitable form
    train_dataset = load_data(traindir)
    val_dataset = load_data(validationdir)
    
    # create indexes from trainindg data 
    max_len = 20
    idx = create_index(train_dataset, max_len)
    
    # build network 
    model = build_network(idx)
    
    # encode datasets
    Xtrain = encode_words(train_dataset, idx)
    Y, Ytrain = encode_labels(train_dataset, idx)
    Xval = encode_words(val_dataset, idx)
    Yv, Yval = encode_labels(val_dataset, idx)
    
    # train model
    model.fit(Xtrain, np.array(Ytrain),
              batch_size=32,
              epochs=5,
              verbose=1,
              validation_data=(Xval, np.array(Yval)))
    
    # save model and indexs, for later use in prediction
    save_model_and_indexes(model, idx)#, path_dev.split('/')[-1]+"NER-learned")

In [None]:
path_train = "../../labAHLT/data/train"
path_dev = "../../labAHLT/data/devel"
learner(path_train, path_dev)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: ner-nn.nn/assets


***
## Functions Classifier

In [None]:
def load_model_and_indexs():
    '''
    Task: Load model and associate indexs from disk.
    Input:
        filename: filename to be loaded
    Output: Loads a model from filename.nn and its indexes from filename.idx
            Returns the loaded model and indexes. 
    '''
    model = load_model("ner-nn.nn")
    index = open("index_ner.pkl", "rb")
    idx = pickle.load(index)
    
    return model, idx

In [None]:
def output_entities(dataset, preds, outfile):
    '''
    Task: Output detected entities in the format expected by the evaluator
    Input: 
        dataset: A dataset produced by load_data.
        preds: For each sentence in dataset, a list with the labels for each sentence token, 
               as predicted by the model.
    Output: prints the detected entities to stdout in the format required by the evaluator. 
    '''
    outf = open(outfile, 'w')
    for sentence, pred in zip(dataset.items(), preds):
#         print(sentence, pred)
        sid = sentence[0]
        tokens = sentence[1]
        for i in range(min(len(tokens), len(pred))):
            token = tokens[i]
            label = pred[i]
            if label[0] == 'B':
                offset_from = str(token[1])
                offset_to = str(token[2])
                tag_name = label[2:]
                entity = token[0]
                j = i+1
                while j < len(tokens) and len(tokens[j]) >=3 and j>len(pred): ## I added j>len(pred) because an error of index range arised
                    token_next = tokens[j]
                    word_next = token_next[0]
                    offset_from_next = str(token_next[1])
                    offset_to_next = str(token_next[2])
                    tag_next = pred[j]
                    j += 1
                    if int(offset_from_next) - int(offset_to) != 2 or tag_next[0] != 'I':
                        break
                    if tag_next[2:] == tag_name:
                        entity = entity + ' ' + word_next
                        offset_to = offset_to_next
                outf.write(sid + "|" + offset_from + '-' + offset_to + "|" + entity + "|" + tag_name+'\n')
                print(sid + "|" + offset_from + '-' + offset_to + "|" + entity + "|" + tag_name)

In [None]:
Y_pred = model.predict(X_dev)
Y = np.argmax(Y_pred, axis=-1)
# for y in Y_pred:
#     for s in y:
#         print(s)
#         print(np.argmax(s))
key_list = list(idx['labels'].keys())
val_list = list(idx['labels'].values())
for y in Y:
    for t in y:
        if t in [3,4,5,6,7,8,9]: 
            print(key_list[t])
        break

## `Classifier()`

In [None]:
def predict(datadir, outfile):
    '''
    Loads a NN model from a file 'modelname' and uses it to extract drugs in datadir. Saves
    results to 'outfile' in the appropriate format
    '''
    
    # load model and associated encoding data
    model, idx = load_model_and_indexs()
    
    # load data to annotate
    testdata = load_data(datadir)
#     print(idx)
    
    # encode dataset
    X = encode_words(testdata, idx)
    
    # tag sentences in dataset
    Y = model.predict(X)
    Y = [[find_label(idx, np.argmax(y)) for y in s] for s in Y]
    
    # extract entities and dump them to output file
    output_entities(testdata, Y, outfile)
    
    # evaluate using official evaluator
    evaluate("NER", datadir, outfile)

In [None]:
def find_label(idx, predicted):
    for label, i in idx["labels"].items():
        if i == predicted:
            return label

In [None]:
path_test = "../../labAHLT/data/test"
predict(path_test, "NER-result2")

DDI-MedLine.d141.s0|24-27|zinc|drug
DDI-MedLine.d141.s3|38-41|zinc|drug
DDI-MedLine.d141.s4|24-27|zinc|drug
DDI-DrugBank.d149.s0|44-58|succinylcholine|drug
DDI-DrugBank.d149.s0|74-91|anticholinesterase|group
DDI-DrugBank.d526.s0|78-92|corticosteroids|group
DDI-DrugBank.d526.s1|0-11|Amphotericin|drug
DDI-DrugBank.d526.s1|38-46|diuretics|group
DDI-DrugBank.d526.s1|86-95|ethacrynic|drug
DDI-DrugBank.d526.s1|106-115|furosemide|drug
DDI-DrugBank.d526.s4|0-8|Digitalis|group
DDI-DrugBank.d526.s4|60-68|digitalis|group
DDI-DrugBank.d526.s7|5-18|anticoagulants|group
DDI-DrugBank.d526.s8|38-50|anticoagulant|group
DDI-DrugBank.d526.s9|0-11|Antidiabetic|group
DDI-DrugBank.d526.s9|36-42|insulin|drug
DDI-DrugBank.d526.s11|17-28|antidiabetic|group
DDI-DrugBank.d526.s12|0-6|Aspirin|brand
DDI-DrugBank.d526.s13|34-40|aspirin|brand
DDI-DrugBank.d526.s14|7-16|salicylate|group
DDI-DrugBank.d526.s14|65-72|steroids|group
DDI-DrugBank.d526.s14|101-107|aspirin|brand
DDI-DrugBank.d526.s15|62-68|aspirin|brand
DDI

Ignoring duplicated entity in system predictions file: DDI-DrugBank.d488.s0|38-44|Heparin|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d751.s1|42-48|heparin|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d751.s1|54-60|aspirin|brand
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d263.s6|78-87|lamivudine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d263.s25|0-10|Doxorubicin|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d258.s7|59-68|Loratadine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d495.s0|30-38|clonidine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d495.s9|23-31|clonidine|drug
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d634.s3|28-36|digitalis|group
Ignoring duplicated entity in system predictions file: DDI-DrugBank.d103.s6|0-10|-adrenergic|group
Ignoring duplicated entity i