<a href="https://colab.research.google.com/github/oliviamomeu1/Momeu_Olivia_Lab10/blob/master/Copia_de_NERC_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Name Entity Recognition using Deep Learning

* Upload the lab_resources and NERC_nn files to you Drive Account:
  * Lab_resource: https://www.cs.upc.edu/~turmo/mud/lab/lab_resources.zip
  * NERC_nn code: https://www.cs.upc.edu/~turmo/mud/lab/07-DDI-nn.zip
* Before running the code, ensure that your Google Colab is set to use GPU:
  * Edit → Notebook Settings
* Mount your Drive disk unit:
  * Left-side menu → Files → Mount drive (the icon that looks like a folder with the Drive logo).


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Define the paths to the data and utils in your Drive unit:

##**RUBEN PATHS**

In [2]:
utilsdir='drive/MyDrive/MDS/Q2/MUD/06-NERC-nn'
evaluatordir='drive/MyDrive/MDS/Q2/MUD/lab_resources/DDI/util'
traindir='drive/MyDrive/MDS/Q2/MUD/lab_resources/DDI/data/train'
validationdir='drive/MyDrive/MDS/Q2/MUD/lab_resources/DDI/data/devel'
testdir='drive/MyDrive/MDS/Q2/MUD/lab_resources/DDI/data/test'
modelname ='model'
outfile ='out.txt'

##**OLIVIA PATHS**

In [7]:
utilsdir='drive/MyDrive/MDS/SEM2/MUD/06-NERC-nn/06-NERC-nn'
evaluatordir='drive/MyDrive/MDS/SEM2/MUD/06-NERC-nn/lab_resources/DDI/util'
traindir='drive/MyDrive/MDS/SEM2/MUD/06-NERC-nn/lab_resources/DDI/data/train'
validationdir='drive/MyDrive/MDS/SEM2/MUD/06-NERC-nn/lab_resources/DDI/data/devel'
testdir='drive/MyDrive/MDS/SEM2/MUD/06-NERC-nn/lab_resources/DDI/data/test'
modelname ='model'
outfile ='out.txt'

##**ADRIA PATHS**

In [4]:
utilsdir='drive/MyDrive/MUD/MUD_Project/06-NERC-nn'
evaluatordir='drive/MyDrive/MUD/MUD_Project/lab_resources/DDI/util'
traindir='drive/MyDrive/MUD/MUD_Project/lab_resources/DDI/data/train'
validationdir='drive/MyDrive/MUD/MUD_Project/lab_resources/DDI/data/devel'
testdir='drive/MyDrive/MUD/MUD_Project/lab_resources/DDI/data/test'
modelname ='model'
outfile ='out.txt'

In [13]:
!pip install tensorflow-addons
import sys
import nltk
nltk.download('averaged_perceptron_tagger')
sys.path.insert(1,utilsdir) # Path to the utils folder on your Google Drive disk
sys.path.insert(1,evaluatordir) # Path to the evaluator folder on your Google Drive disk



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:

from contextlib import redirect_stdout

from tensorflow.keras import Input, utils
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, concatenate, Softmax
from tensorflow_addons.text.crf_wrapper import CRFModelWrapper
from codemaps import *

import nltk
nltk.download('punkt')

utils.set_random_seed(812)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:


def build_network(codes) :

   # sizes
   n_words = codes.get_n_words()
   n_sufs = codes.get_n_sufs()
   n_prefs = codes.get_n_prefs()
   n_labels = codes.get_n_labels()
   n_lowers = codes.get_n_lowers()
   n_pos = codes.get_n_lowers()
   max_len = codes.maxlen


   ######################################
   inptW = Input(shape=(max_len,))
   inptS = Input(shape=(max_len,))
   inptP = Input(shape=(max_len,))
   inptL = Input(shape=(max_len,))
   inptPo = Input(shape=(max_len,))

   model1 = Embedding(input_dim=n_words, output_dim=150,
                      input_length=max_len, mask_zero=False)(inptW)  # word embeddings

   model2 = Embedding(input_dim=n_sufs, output_dim=50,
                      input_length=max_len, mask_zero=False)(inptS)  # suf embeddings

   model3 = Embedding(input_dim=n_prefs, output_dim=50,
                    input_length=max_len, mask_zero=False)(inptP) # pref embeddings

   model4 = Embedding(input_dim=n_lowers, output_dim=150,
                    input_length=max_len, mask_zero=False)(inptL) # lowers embeddings

   model5 = Embedding(input_dim=n_pos, output_dim=150,
                    input_length=max_len, mask_zero=False)(inptPo) # lowers embeddings

   model1 = Dropout(0.1)(model1)
   model2 = Dropout(0.1)(model2)
   model3 = Dropout(0.1)(model3)
   model4 = Dropout(0.1)(model4)
   model5 = Dropout(0.1)(model5)

   model = concatenate([model1,model2,model3,model4, model5])
   y = Bidirectional(LSTM(units=200, return_sequences=True))(model)  #  biLSTM
   out = TimeDistributed(Dense(n_labels, activation=Softmax()))(y)

   return Model(
        inputs=[inptW,inptS,inptP,inptL, inptPo], outputs=out
    )


In [16]:
# directory with files to process


# load train and validation data
traindata = Dataset(traindir)
valdata = Dataset(validationdir)

# create indexes from training data
max_len = 150
suf_len = 5
pref_len = 3
codes  = Codemaps(traindata, max_len, suf_len, pref_len)

# encode datasets
#[Xt,Xts,Xtp] = codes.encode_words(traindata)
Xt = codes.encode_words(traindata)
Yt = codes.encode_labels(traindata)
#[Xv,Xvs,Xvp] = codes.encode_words(valdata)
Xv = codes.encode_words(valdata)
Yv = codes.encode_labels(valdata)

n_tags = codes.get_n_labels()
max_len = codes.maxlen

In [17]:
model = build_network(codes)
model.compile(optimizer='adam' ,metrics=["accuracy"], loss="sparse_categorical_crossentropy")
model.build([(None,max_len),(None,max_len),(None,max_len)])

with redirect_stdout(sys.stderr) :
   model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 150)]                0         []                            
                                                                                              

In [18]:
## --------- MAIN PROGRAM -----------
## --
## -- Usage:  train.py ../data/Train ../data/Devel  modelname
## --

# train model
with redirect_stdout(sys.stderr) :
   #model.fit([Xt,Xts,Xtp], Yt, batch_size=32, epochs=10, validation_data=([Xv,Xvs,Xvp],Yv), verbose=1)
   model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv,Yv), verbose=1)

# save model and indexs
model.save(modelname)
codes.save(modelname)
#save_model_and_indexs(model, idx, modelname)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predict

In [19]:
#import sys
import evaluator

In [20]:
def output_entities(data, preds, outfile) :

   outf = open(outfile, 'w')
   for sid,tags in zip(data.sentence_ids(),preds) :
      inside = False
      for k in range(0,min(len(data.get_sentence(sid)),codes.maxlen)) :
         y = tags[k]
         token = data.get_sentence(sid)[k]

         if (y[0]=="B") :
             entity_form = token['form']
             entity_start = token['start']
             entity_end = token['end']
             entity_type = y[2:]
             inside = True
         elif (y[0]=="I" and inside) :
             entity_form += " "+token['form']
             entity_end = token['end']
         elif (y[0]=="O" and inside) :
             print(sid, str(entity_start)+"-"+str(entity_end), entity_form, entity_type, sep="|", file=outf)
             inside = False

      if inside : print(sid, str(entity_start)+"-"+str(entity_end), entity_form, entity_type, sep="|", file=outf)

   outf.close()

In [21]:
## --------- Evaluator -----------
def evaluation(datadir,outfile) :
   evaluator.evaluate("NER", datadir, outfile)


In [22]:
## --------- MAIN PROGRAM -----------
## --
## -- Usage:  baseline-NER.py target-dir
## --
## -- Extracts Drug NE from all XML files in target-dir
## --

#datadir = validationdir
datadir = testdir

testdata = Dataset(datadir)
[X,Xs,Xp,Xl,Xpo] = codes.encode_words(testdata)
Y = model.predict([X,Xs,Xp,Xl,Xpo])
Y = [[codes.idx2label(np.argmax(w)) for w in s] for s in Y]

# extract entities
output_entities(testdata, Y, outfile)

# evaluate
evaluation(datadir,outfile)


                   tp	  fp	  fn	#pred	#exp	P	R	F1
------------------------------------------------------------------------------
brand              79	   3	 195	  82	 274	96.3%	28.8%	44.4%
drug             1737	 104	 390	1841	2127	94.4%	81.7%	87.6%
drug_n              1	   9	  71	  10	  72	10.0%	1.4%	2.4%
group             563	 133	 130	 696	 693	80.9%	81.2%	81.1%
------------------------------------------------------------------------------
M.avg            -	-	-	-	-	70.4%	48.3%	53.9%
------------------------------------------------------------------------------
m.avg            2380	 249	 786	2629	3166	90.5%	75.2%	82.1%
m.avg(no class)  2459	 170	 707	2629	3166	93.5%	77.7%	84.9%


In [23]:
## --------- MAIN PROGRAM -----------
## --
## -- Usage:  baseline-NER.py target-dir
## --
## -- Extracts Drug NE from all XML files in target-dir
## --

datadir = validationdir
#datadir = testdir

testdata = Dataset(datadir)
[X,Xs,Xp,Xl,Xpo] = codes.encode_words(testdata)
Y = model.predict([X,Xs,Xp,Xl,Xpo])
Y = [[codes.idx2label(np.argmax(w)) for w in s] for s in Y]

# extract entities
output_entities(testdata, Y, outfile)

# evaluate
evaluation(datadir,outfile)

                   tp	  fp	  fn	#pred	#exp	P	R	F1
------------------------------------------------------------------------------
brand              55	   0	 319	  55	 374	100.0%	14.7%	25.6%
drug             1572	 122	 334	1694	1906	92.8%	82.5%	87.3%
drug_n              6	   1	  39	   7	  45	85.7%	13.3%	23.1%
group             542	  82	 145	 624	 687	86.9%	78.9%	82.7%
------------------------------------------------------------------------------
M.avg            -	-	-	-	-	91.3%	47.4%	54.7%
------------------------------------------------------------------------------
m.avg            2175	 205	 837	2380	3012	91.4%	72.2%	80.7%
m.avg(no class)  2233	 147	 779	2380	3012	93.8%	74.1%	82.8%
