## Neural Network DDI

In [25]:
import numpy as np
from os import listdir
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
#from keras_contrib.layers import CRF

%matplotlib inline

## Functions Learner

`load_data()` - Use the XML parsing and tokenization functions from previous exercises. Adding a PoS tagger or lemmatizer may be useful. \
Masking the target drugs as e.g. `<DRUG1>` and `<DRUG2>`, and the rest as `<DRUG_OTHER>`, helps the algorithm generalize: it keeps it from focusing on the drug names, which are not relevant for the DDI task, and makes the target entities easier to spot.

In [32]:
# Exploratory cell: build one classification example per entity pair for a
# single training file, masking drug mentions with placeholder tokens.
file = '/Users/mponsclo/Documents/Master/labAHLT/data/train/Balsalazide_ddi.xml'
dataset = []

tree = parse(file)
wordnet_lemmatizer = WordNetLemmatizer()

sentences = tree.getElementsByTagName("sentence")

for sentence in sentences:
    sent_id = sentence.attributes["id"].value
    # Hyphens are turned into spaces before masking and tokenizing.
    sent_text = sentence.attributes["text"].value.replace("-", " ")

    # Map entity id -> surface text, in document order.
    entity_text = {}
    for node in sentence.getElementsByTagName("entity"):
        attrs = node.attributes
        ent_id = attrs["id"].value
        char_offset = attrs["charOffset"].value  # read but not used by this cell
        entity_text[ent_id] = attrs["text"].value

    for pair in sentence.getElementsByTagName("pair"):
        first_id = pair.attributes['e1'].value
        second_id = pair.attributes['e2'].value
        label = pair.attributes['ddi'].value  # raw 'true' / 'false' flag

        # Replace every entity's surface text with the placeholder for its role
        # in this pair; replacements are applied in entity order.
        masked = sent_text
        for ent_id, surface in entity_text.items():
            if ent_id == first_id:
                placeholder = '<DRUG1>'
            elif ent_id == second_id:
                placeholder = '<DRUG2>'
            else:
                placeholder = '<DRUG_OTHER>'
            masked = masked.replace(surface, placeholder)

        # Each token is stored as (word, lemma).
        # TODO: Add PoS Tag and check lemmatizer
        token_pairs = [
            (tok, wordnet_lemmatizer.lemmatize(tok))
            for tok in word_tokenize(masked)
        ]
        dataset.append([sent_id, first_id, second_id, label, token_pairs])

[['DDI-DrugBank.d486.s0', 'DDI-DrugBank.d486.s0.e0', 'DDI-DrugBank.d486.s0.e1', 'false', [('No', 'No'), ('drug', 'drug'), ('interaction', 'interaction'), ('studies', 'study'), ('have', 'have'), ('been', 'been'), ('conducted', 'conducted'), ('for', 'for'), ('<', '<'), ('DRUG1', 'DRUG1'), ('>', '>'), (',', ','), ('however', 'however'), ('the', 'the'), ('use', 'use'), ('of', 'of'), ('orally', 'orally'), ('administered', 'administered'), ('<', '<'), ('DRUG2', 'DRUG2'), ('>', '>'), ('could', 'could'), (',', ','), ('theoretically', 'theoretically'), (',', ','), ('interfere', 'interfere'), ('with', 'with'), ('the', 'the'), ('release', 'release'), ('of', 'of'), ('<', '<'), ('DRUG_OTHER', 'DRUG_OTHER'), ('>', '>'), ('in', 'in'), ('the', 'the'), ('colon', 'colon'), ('.', '.')]], ['DDI-DrugBank.d486.s0', 'DDI-DrugBank.d486.s0.e0', 'DDI-DrugBank.d486.s0.e2', 'false', [('No', 'No'), ('drug', 'drug'), ('interaction', 'interaction'), ('studies', 'study'), ('have', 'have'), ('been', 'been'), ('conduct

In [69]:
def load_data(datadir):
    '''
    Task: Load XML files in given directory, tokenize each sentence, and extract
    learning examples (tokenized sentence + entity pair).

    Input:
        datadir: A directory containing XML files. Entries whose name does not
                 end in ".xml" are ignored.

    Output: A list of classification cases, one per <pair> element. Each case is a list
            [sentence_id, entity1_id, entity2_id, label, tokens] where label is the
            pair's "type" attribute when ddi="true" and 'none' otherwise, and tokens
            is a list of (word, lemma) tuples. In the tokenized sentence the two
            target entities are masked as DRUG1 / DRUG2 and every other entity
            as DRUGOTHER.

    Raises:
        KeyError: if an element lacks one of the attributes read here
                  (id, text, charOffset, e1, e2, ddi, or type when ddi="true").
    '''
    from os.path import join

    dataset = []
    wordnet_lemmatizer = WordNetLemmatizer()

    # sorted() makes the example order reproducible across runs and platforms;
    # non-XML entries (e.g. macOS ".DS_Store") would make parse() fail.
    for fname in sorted(listdir(datadir)):
        if not fname.lower().endswith('.xml'):
            continue

        tree = parse(join(datadir, fname))

        for s in tree.getElementsByTagName("sentence"):
            sid = s.attributes["id"].value
            # One-for-one character replacement, so the entities' character
            # offsets remain valid on the modified text.
            stext = s.attributes["text"].value.replace("-", " ")

            # entity id -> list of inclusive (start, end) character spans
            entity_spans = {}
            for e in s.getElementsByTagName("entity"):
                e_id = e.attributes["id"].value
                entity_spans[e_id] = _parse_char_offsets(e.attributes["charOffset"].value)

            for p in s.getElementsByTagName("pair"):
                e1_id = p.attributes['e1'].value
                e2_id = p.attributes['e2'].value
                if p.attributes['ddi'].value == 'true':
                    ddi = p.attributes['type'].value
                else:
                    ddi = 'none'

                pair_sent = _mask_entities(stext, entity_spans, e1_id, e2_id)

                # TODO: Add PoS Tag and check lemmatizer
                tokens = [(t, wordnet_lemmatizer.lemmatize(t))
                          for t in word_tokenize(pair_sent)]
                dataset.append([sid, e1_id, e2_id, ddi, tokens])

    return dataset


def _parse_char_offsets(char_offset):
    '''Parse a charOffset value such as "10-18" or "10-18;25-30" into (start, end) int tuples.'''
    # NOTE(review): assumes "start-end" fragments separated by ";" with an
    # inclusive end offset -- confirm against the corpus documentation.
    spans = []
    for fragment in char_offset.split(';'):
        start, end = fragment.split('-')
        spans.append((int(start), int(end)))
    return spans


def _mask_entities(text, entity_spans, e1_id, e2_id):
    '''Return text with each entity span replaced by DRUG1, DRUG2 or DRUGOTHER.'''
    claimed = []  # non-overlapping (start, end, mask) triples

    def claim(spans, mask):
        for start, end in spans:
            # Skip a span that overlaps one already claimed, so an earlier
            # (higher priority) mask is never partially overwritten.
            if all(end < s or start > e for s, e, _ in claimed):
                claimed.append((start, end, mask))

    # The target pair is claimed first so it wins over overlapping entities.
    claim(entity_spans.get(e1_id, ()), 'DRUG1')
    claim(entity_spans.get(e2_id, ()), 'DRUG2')
    for e_id, spans in entity_spans.items():
        if e_id != e1_id and e_id != e2_id:
            claim(spans, 'DRUGOTHER')

    # Replace right-to-left so earlier offsets are not shifted by replacements.
    for start, end, mask in sorted(claimed, reverse=True):
        text = text[:start] + mask + text[end + 1:]
    return text

In [70]:
# Load every training example from a local, absolute path (adjust per machine).
train_path = '/Users/mponsclo/Documents/Master/labAHLT/data/train'
dataset = load_data(train_path)
# Print a single example to inspect the structure returned by load_data.
print(dataset[65])

['DDI-DrugBank.d234.s2', 'DDI-DrugBank.d234.s2.e0', 'DDI-DrugBank.d234.s2.e1', 'effect', [('Particular', 'Particular'), ('caution', 'caution'), ('is', 'is'), ('necessary', 'necessary'), ('when', 'when'), ('using', 'using'), ('DRUG1', 'DRUG1'), ('in', 'in'), ('cases', 'case'), ('of', 'of'), ('mixed', 'mixed'), ('drug', 'drug'), ('overdosage', 'overdosage'), ('since', 'since'), ('the', 'the'), ('toxic', 'toxic'), ('effects', 'effect'), ('(', '('), ('such', 'such'), ('as', 'a'), ('convulsions', 'convulsion'), ('and', 'and'), ('cardiac', 'cardiac'), ('dysrhythmias', 'dysrhythmias'), (')', ')'), ('of', 'of'), ('other', 'other'), ('drugs', 'drug'), ('taken', 'taken'), ('in', 'in'), ('overdose', 'overdose'), ('(', '('), ('especially', 'especially'), ('DRUG2', 'DRUG2'), (')', ')'), ('may', 'may'), ('emerge', 'emerge'), ('with', 'with'), ('the', 'the'), ('reversal', 'reversal'), ('of', 'of'), ('the', 'the'), ('DRUGOTHER', 'DRUGOTHER'), ('effect', 'effect'), ('by', 'by'), ('DRUGOTHER', 'DRUGOTHE

In [None]:
def create_index(dataset, max_length):
    '''
    Task: Create index dictionaries both for input (words) and output (labels) from given dataset
    Input:
        dataset: dataset produced by load_data: a list of cases
                 [sentence_id, e1_id, e2_id, label, tokens], where each token is a
                 tuple whose first element is the word form.
        max_length: maximum length of a sentence (longer sentences will be cut, shorter ones will be padded).

    Output: A dictionary where each key is an index name ("words", "labels"), and the value is a
            dictionary mapping each word/label to a number. An entry "maxlen" holding max_length
            is also stored.
    Example:
        >>> create_index(traindata, 100)
        {'words': {'<PAD>':0, '<UNK>':1, 'No':2, 'drug':3, 'interaction':4, ... },
         'labels': {'none':0, 'effect':1, 'mechanism':2, ... },
         'maxlen': 100 }
    '''
    # '<PAD>' and '<UNK>' take the reserved codes 0 and 1 in the 'words' index.
    words = {'<PAD>': 0, '<UNK>': 1}
    labels = {}

    # Remaining codes are assigned in order of first appearance, which keeps
    # the result deterministic for a given dataset order.
    for case in dataset:
        labels.setdefault(case[3], len(labels))
        for token in case[4]:
            words.setdefault(token[0], len(words))

    # Entries for other token features (lemmas, PoS, etc.) can be added here.
    return {'words': words, 'labels': labels, 'maxlen': max_length}

In [None]:
def encode_words(dataset, idx):
    '''
    Task: Encode the words in a sentence dataset formed by lists of tokens into lists of indexes
          suitable for NN input.
    Input:
        dataset: A dataset produced by load_data (the token list is element 4 of each
                 case, and the word form is element 0 of each token).
        idx: A dictionary with a "words" index (containing '<PAD>' and '<UNK>' entries)
             and the maximum sentence length under "maxlen".

    Output: An integer numpy array of shape (len(dataset), maxlen) with one row of word
            codes per case. If the word is not in the index, <UNK> code is used. If the
            sentence is shorter than maxlen it is padded at the end with <PAD> code; if it
            is longer, it is cut to its first maxlen tokens.
    Example:
        >>> encode_words(traindata, idx)
            [ [6882 1049 4911 ... 0 0 0 ]
            [  2290 7548 8069 ... 0 0 0 ]
               ...
            [  2002 6582 7518 ... 0 0 0 ] ]
    '''
    words = idx['words']
    max_len = idx['maxlen']
    pad_code = words['<PAD>']
    unk_code = words['<UNK>']

    # Start from an all-padding matrix and overwrite the leading positions.
    encoded = np.full((len(dataset), max_len), pad_code, dtype='int32')
    for row, case in enumerate(dataset):
        codes = [words.get(token[0], unk_code) for token in case[4][:max_len]]
        if codes:
            encoded[row, :len(codes)] = codes
    return encoded
    
def encode_labels(dataset, idx):
    '''
    Task: Encode the ground truth labels in a dataset of classification examples (sentence + entity pair).
    Input:
        dataset: A dataset produced by load_data (the label is element 3 of each case).
        idx: A dictionary with a "labels" index mapping each label to its integer code.

    Output: An integer numpy array of shape (len(dataset), 1) holding one DDI label
            code per classification example.

    Raises:
        KeyError: if a case carries a label that is not present in idx['labels'].

    Example:
     >>> encode_labels(traindata, idx)
     [ [0] [0] [2] ... [4] [0] [0] [1] [0] ]
    '''
    label_codes = idx['labels']
    codes = [label_codes[case[3]] for case in dataset]
    # reshape keeps the (n, 1) layout even for an empty dataset.
    return np.array(codes, dtype='int32').reshape(-1, 1)

In [None]:
def build_network(idx):
    '''
    Task: Create network for the learner.
    Input:
        idx: index dictionary with word/labels codes, plus maximum sentence length.
    Output: Returns a compiled Keras neural network that maps a padded sequence of
            word codes (length idx['maxlen']) to a probability distribution over
            the labels in idx['labels'].

    The model is compiled with a sparse categorical cross-entropy loss, so it is
    trained with integer label codes (one per example), not one-hot vectors.
    '''

    # sizes
    n_words = len(idx['words'])
    n_labels = len(idx['labels'])
    max_len = idx['maxlen']

    # create network layers
    # NOTE(review): layer sizes and dropout rates are untuned starting values.
    inp = Input(shape=(max_len,))
    # mask_zero=True makes downstream layers skip code 0 (assumed to be <PAD>).
    x = Embedding(input_dim=n_words, output_dim=100, mask_zero=True)(inp)
    # No return_sequences: the whole sentence is summarised into one vector,
    # because a single label is predicted per example.
    x = Bidirectional(LSTM(units=64))(x)
    x = Dropout(0.3)(x)
    out = Dense(n_labels, activation='softmax')(x)

    # create and compile model
    model = Model(inp, out)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
def save_model_and_indexes(model, idx, filename):
    '''
    Task: Save given model and indexes to disk
    Input:
        model: trained Keras model (anything exposing a save(path) method).
        idx: A dictionary containing word and label indexes (string -> int), as well as
             the maximum sentence length.
        filename: Saves the model into filename.nn and the indexes into filename.idx
    '''
    import json

    # NOTE(review): some Keras releases only accept ".keras"/".h5" paths in
    # Model.save -- confirm the ".nn" suffix works with the installed version.
    model.save(filename + '.nn')

    # The index holds only strings and ints, so JSON is enough and, unlike
    # pickle, cannot execute code when the file is read back.
    with open(filename + '.idx', 'w', encoding='utf-8') as f:
        json.dump(idx, f, ensure_ascii=False)

## `Learner()`

In [None]:
def learner(traindir, validationdir, modelname):
    '''
    Learns a NN model using traindir as training data, and validationdir as validation data.
    Saves learnt model in a file named modelname

    Input:
        traindir: directory with the training XML files.
        validationdir: directory with the validation XML files.
        modelname: base name passed to save_model_and_indexes.
    '''
    # load train and validation data in a suitable form
    traindata = load_data(traindir)
    valdata = load_data(validationdir)

    # create indexes from training data only, so validation words that were
    # never seen in training are encoded as unknown
    max_len = 100
    idx = create_index(traindata, max_len)

    # build network
    model = build_network(idx)

    # encode datasets
    Xtrain = encode_words(traindata, idx)
    Ytrain = encode_labels(traindata, idx)
    Xval = encode_words(valdata, idx)
    Yval = encode_labels(valdata, idx)

    # train model
    model.fit(Xtrain, Ytrain, validation_data=(Xval, Yval))

    # save model and indexes, for later use in prediction
    save_model_and_indexes(model, idx, modelname)

***
## Classifier Functions

In [None]:
def load_model_and_indexs(filename):
    '''
    Task: Load model and associated indexes from disk.
    Input:
        filename: base filename to be loaded
    Output: Loads a model from filename.nn and its indexes from filename.idx
            (read as a JSON document). Returns the tuple (model, indexes).
    '''
    import json
    from keras.models import load_model

    # NOTE(review): some Keras releases only accept ".keras"/".h5" paths --
    # confirm the ".nn" suffix works with the installed version.
    model = load_model(filename + '.nn')

    with open(filename + '.idx', encoding='utf-8') as f:
        idx = json.load(f)

    return model, idx

In [None]:
def output_interactions(dataset, preds):
    '''
    Task: Output detected DDIs in the format expected by the evaluator.
    Input:
        dataset: A dataset produced by load_data (sentence id and the two entity ids
                 are the first three elements of each case).
        preds: For each case in dataset, a label for its DDI type. The labels 'null'
               and 'none' both mean that no DDI was detected.

    Output: prints the detected interactions to stdout, one per line, as
            sentence_id|entity1_id|entity2_id|type. Cases without a detected
            interaction are not printed.
    Example:
        >>> output_interactions(dataset, preds)
            DDI-DrugBank.d398.s0|DDI-DrugBank.d398.s0.e0|DDI-DrugBank.d398.s0.e1|effect
            DDI-DrugBank.d398.s0|DDI-DrugBank.d398.s0.e0|DDI-DrugBank.d398.s0.e2|effect
            DDI-DrugBank.d211.s2|DDI-DrugBank.d211.s2.e0|DDI-DrugBank.d211.s2.e5|mechanism
            ...
    '''
    for case, label in zip(dataset, preds):
        # 'none' is the no-interaction label assigned by load_data; 'null' is
        # the one named in this function's original contract.
        if label in ('null', 'none'):
            continue
        sid, e1_id, e2_id = case[:3]
        print(sid, e1_id, e2_id, label, sep='|')

## `Classifier()`

In [None]:
def predict(modelname, datadir, outfile):
    '''
    Loads a NN model from a file 'modelname' and uses it to classify the drug-drug
    interaction of every entity pair in datadir. Saves results to 'outfile' in the
    appropriate format.

    Input:
        modelname: base name given to load_model_and_indexs.
        datadir: directory with the XML files to annotate.
        outfile: path of the file the detected interactions are written to.
    '''
    from contextlib import redirect_stdout

    # load model and associated encoding data
    model, idx = load_model_and_indexs(modelname)

    # load data to annotate
    testdata = load_data(datadir)

    # encode dataset
    X = encode_words(testdata, idx)

    # classify every example
    # NOTE(review): assumes the model returns one score vector per example,
    # i.e. an array of shape (n_examples, n_labels) -- confirm against the model.
    Y = model.predict(X)

    # idx['labels'] maps label -> code; decoding needs the inverse mapping
    code_to_label = {code: label for label, code in idx['labels'].items()}
    # get most likely label for each example
    Y = [code_to_label[int(code)] for code in np.argmax(Y, axis=-1)]

    # output_interactions prints to stdout, so redirect it into the output file
    with open(outfile, 'w') as out, redirect_stdout(out):
        output_interactions(testdata, Y)

    # evaluate using official evaluator
    # NOTE(review): 'evaluation' is not defined in the visible part of this
    # file -- confirm it is imported or defined elsewhere.
    evaluation(datadir, outfile)