<a href="https://colab.research.google.com/github/mon3/deepNN_NER/blob/master/masters_elmo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install PyDrive quietly (-q); it provides the `pydrive` imports used by the
# authentication cell further down.
!pip install -q pydrive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so files can be fetched by Drive file id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which the mount cell imported from
# google.colab, to a PyDrive GoogleDrive client. From here on `drive` is the
# PyDrive client, not the google.colab module.
drive = GoogleDrive(gauth)

In [0]:
# Download the Drive file with this id into the working directory as
# validation.py; `compute_f1` is imported from it in the imports cell.
validation_module = drive.CreateFile({'id': '1nEox1MVwcpP3Fu5RGJzhKi358x9jTA2h'})
validation_module.GetContentFile('validation.py')

In [0]:
# Download the preprocessing helpers (readfile, createMatrices, padding, ...)
# into the working directory as elmo_preprocess_sentences_nontrain.py so the
# imports cell can import them.
prepro_elmo_module3 = drive.CreateFile({'id':'1v9lb4SlclxAAKYjIRk0lrYp5oP3WEU3J'})
prepro_elmo_module3.GetContentFile('elmo_preprocess_sentences_nontrain.py')

In [6]:
"""Load packages"""

import matplotlib.pyplot as plt
import numpy as np
import os

from validation import compute_f1
from keras.models import Model, load_model
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding, Input, Dropout, LSTM, Bidirectional, MaxPooling1D, \
    Flatten, concatenate, Lambda
# from elmo_prod import createMatrices, createBatches, createBatchesNonTrain, iterate_minibatches, readfile, addCharInformation, padding

from elmo_preprocess_sentences_nontrain import createMatrices, createBatches, createBatchesNonTrain, iterate_minibatches, readfile, addCharInformation, padding
from keras.utils import plot_model
from keras.initializers import RandomUniform
from keras.optimizers import SGD, Nadam
import tensorflow_hub as hub
import tensorflow as tf


Using TensorFlow backend.
W0407 14:22:31.461174 140320475977600 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [25]:
"""Initialise class"""

class CNN_BLSTM(object):
    """Sequence tagger: ELMo word embeddings + character CNN + casing features -> BLSTM -> softmax.

    The class is a thin pipeline wrapper; the intended call order is
    ``loadData`` -> ``addCharInfo`` -> ``embed`` -> ``createBatches`` ->
    ``buildModel`` -> ``train`` -> ``writeToFile`` / ``saveResults``.

    It relies on names defined outside the class: the notebook-level constant
    ``LOCATION_POINTER`` (base directory for data and outputs) and the helper
    functions imported from ``elmo_preprocess_sentences_nontrain`` and
    ``validation``.
    """

    def __init__(self, EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER, CHAR_EMBEDDING, VERSION, EMBEDDING):
        """Store the hyper-parameters of one experiment.

        Args:
            EPOCHS: number of passes over the training batches in ``train``.
            DROPOUT: rate of the ``Dropout`` layers and of the LSTM input dropout.
            DROPOUT_RECURRENT: recurrent dropout rate of the LSTM.
            LSTM_STATE_SIZE: number of units per LSTM direction.
            CONV_SIZE: kernel size of the character-level ``Conv1D``.
            LEARNING_RATE: only recorded and written into the model file name;
                the optimizer is passed in already constructed.
            OPTIMIZER: optimizer instance handed to ``model.compile``.
            CHAR_EMBEDDING: output dimension of the character embedding.
            VERSION: run identifier used in output file names.
            EMBEDDING: embedding label used in output file names.
        """
        self.epochs = EPOCHS
        self.dropout = DROPOUT
        self.dropout_recurrent = DROPOUT_RECURRENT
        self.lstm_state_size = LSTM_STATE_SIZE
        self.conv_size = CONV_SIZE
        self.learning_rate = LEARNING_RATE
        self.optimizer = OPTIMIZER
        self.char_embedding_size = CHAR_EMBEDDING
        self.version = VERSION
        self.embedding = EMBEDDING

    def loadData(self):
        """Read the train/dev/test files found under ``LOCATION_POINTER``/data."""
        self.trainSentences = readfile(os.path.join(LOCATION_POINTER, "data/train.txt"))
        self.devSentences = readfile(os.path.join(LOCATION_POINTER, "data/dev.txt"))
        self.testSentences = readfile(os.path.join(LOCATION_POINTER, "data/test.txt"))

    def addCharInfo(self):
        """Run ``addCharInformation`` over the three sentence sets, in place."""
        # format: [['EU', ['E', 'U'], 'B-ORG\n'], ...]
        self.trainSentences = addCharInformation(self.trainSentences)
        self.devSentences = addCharInformation(self.devSentences)
        self.testSentences = addCharInformation(self.testSentences)

    def embed(self):
        """Build the label, casing and character vocabularies and index the data.

        Sets ``label2Idx``, ``idx2Label``, ``caseEmbeddings``, ``char2Idx`` and
        the padded ``train_set`` / ``dev_set`` / ``test_set``.
        """
        # Collect every label seen in any split; each sentence item unpacks as
        # (token, list of chars, label).
        labelSet = set()
        for dataset in [self.trainSentences, self.devSentences, self.testSentences]:
            for sentence in dataset:
                for _token, _chars, label in sentence:
                    labelSet.add(label)

        # Sort before numbering: iteration order of a set of strings changes
        # between interpreter sessions, which would give a different
        # label -> index mapping (and an incompatible output layer) on every
        # kernel restart.
        self.label2Idx = {label: idx for idx, label in enumerate(sorted(labelSet))}

        # mapping for token cases
        case2Idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5,
                    'contains_digit': 6, 'PADDING_TOKEN': 7}
        # one-hot (identity) vectors, one row per casing class
        self.caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

        # dictionary of all possible characters; indices 0 and 1 are reserved
        self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
            self.char2Idx[c] = len(self.char2Idx)  # 2, 3, 4 ...

        # format: [[wordindices], [caseindices], [padded word indices], [label indices]]
        self.train_set = padding(createMatrices(self.trainSentences, self.label2Idx, case2Idx, self.char2Idx))
        self.dev_set = padding(createMatrices(self.devSentences, self.label2Idx, case2Idx, self.char2Idx))
        self.test_set = padding(createMatrices(self.testSentences, self.label2Idx, case2Idx, self.char2Idx))

        # reverse mapping: index -> label
        self.idx2Label = {v: k for k, v in self.label2Idx.items()}

    def createBatches(self):
        """Group each indexed set into batches.

        Inside the method body ``createBatches`` resolves to the module-level
        helper of the same name, not to this method.
        """
        self.train_batch, self.train_batch_len = createBatches(self.train_set)
        self.dev_batch, self.dev_batch_len = createBatches(self.dev_set)
        self.test_batch, self.test_batch_len = createBatches(self.test_set)

    def tag_dataset(self, dataset, model):
        """Predict label indices for every item of a batched data set.

        Args:
            dataset: mapping whose values are iterables of 5-tuples
                ``(tokens, casing, chars, labels, _)``.
            model: object exposing a Keras-style ``predict``.

        Returns:
            ``(predLabels, correctLabels)``: two parallel lists holding, per
            item, the predicted label indices (list of ints) and the gold
            labels exactly as stored in ``dataset``.
        """
        correctLabels = []
        predLabels = []
        for data in dataset.values():
            for dt in data:
                t, c, ch, l, _ = dt
                # tokens get a trailing axis; the labels are left as they are
                # (the original note says the training data is prepared this way)
                t = np.expand_dims(t, -1)

                tokens_in = np.asarray([t])
                caseing_in = np.asarray([c])
                char_in = np.asarray([ch])

                pred = model.predict([tokens_in, caseing_in, char_in], verbose=False)[0]
                pred = pred.argmax(axis=-1)  # most probable class per position

                correctLabels.append(l)
                # converted to a list to keep the same format as the gold labels
                predLabels.append(pred.tolist())

        return predLabels, correctLabels

    def ELMoEmbedding(self, tokens_input):
        """Return the ELMo embedding of ``tokens_input`` using the tf-hub module.

        The input is cast to ``tf.string`` and squeezed on axis 1 before being
        passed to the module's ``default`` signature.
        """
        elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
        print("Elmo shape: {} - {}".format(tokens_input.shape[0], tokens_input.shape[1]))

        # as_dict=True returns all module outputs; 'elmo' selects the embedding
        # tensor instead of the module's default output
        return elmo_model(tf.squeeze(tf.cast(tokens_input, tf.string), axis=1), signature="default", as_dict=True)['elmo']

    def buildModel(self):
        """Assemble and compile the Keras model and store it in ``self.model``.

        Also snapshots the initial weights in ``self.init_weights`` and writes
        a diagram ``model_<version>.png`` under ``LOCATION_POINTER``.
        """
        # character input; the last axis has a fixed size of 52
        # NOTE(review): 52 is presumably the padded per-token character length
        # produced by `padding` - confirm before changing it
        character_input = Input(shape=(None, 52,), name="Character_input")
        embed_char_out = TimeDistributed(
            Embedding(len(self.char2Idx),self.char_embedding_size, embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name="Character_embedding")(
            character_input)

        dropout = Dropout(self.dropout)(embed_char_out)

        # CNN over the characters, max-pooled across the whole 52-wide axis
        conv1d_out = TimeDistributed(Conv1D(kernel_size=self.conv_size, filters=30, padding='same', activation='tanh', strides=1), name="Convolution")(dropout)
        maxpool_out = TimeDistributed(MaxPooling1D(52), name="Maxpool")(conv1d_out)  # pool_size=52 matches the input width
        char = TimeDistributed(Flatten(), name="Flatten")(maxpool_out)
        char = Dropout(self.dropout)(char)

        # word-level input: raw strings, embedded by ELMo inside a Lambda layer
        words_input = Input(shape=(None, ), dtype="string", name='words_input')
        words = Lambda(self.ELMoEmbedding, output_shape=(None, 1024))(words_input)

        # case-info input
        casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
        casing = Embedding(output_dim=self.caseEmbeddings.shape[1], input_dim=self.caseEmbeddings.shape[0], weights=[self.caseEmbeddings],
                           trainable=False)(casing_input)  # trainable=False since we provide case embeddings

        output = concatenate([words, casing, char])
        output = Bidirectional(LSTM(self.lstm_state_size,
                                    return_sequences=True,
                                    dropout=self.dropout,                        # on input to each LSTM block
                                    recurrent_dropout=self.dropout_recurrent     # on recurrent input signal
                                   ), name="BLSTM")(output)
        output = TimeDistributed(Dense(len(self.label2Idx), activation='softmax'),name="Softmax_layer")(output)

        # set up model
        self.model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])

        for layer in self.model.layers:
            print("Layer {}: {}".format(layer.name, layer.output_shape))

        self.model.compile(loss='sparse_categorical_crossentropy', optimizer=self.optimizer)

        # kept so that `train` can reset the model once it has been saved
        self.init_weights = self.model.get_weights()

        plot_model(self.model, to_file=os.path.join(LOCATION_POINTER, 'model_{}.png'.format(self.version)))

        print("Model built. Saved model.png\n")

    def train(self):
        """Train for ``self.epochs`` epochs, scoring test and dev after each one.

        Fills ``f1_test_history`` / ``f1_dev_history``, sets ``modelName``,
        saves the model as ``<modelName>.h5`` under ``LOCATION_POINTER`` and
        finally restores the initial weights.
        """
        self.f1_test_history = []
        self.f1_dev_history = []

        for epoch in range(self.epochs):
            print("Epoch {}/{}".format(epoch, self.epochs))
            print("Batch len: {}".format(self.train_batch_len))
            for batch in iterate_minibatches(self.train_batch, self.train_batch_len):
                labels, tokens, casing, char, _ = batch
                self.model.train_on_batch([tokens, casing, char], labels)

            # compute F1 scores
            predLabels, correctLabels = self.tag_dataset(self.test_batch, self.model)
            pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels, self.idx2Label)
            self.f1_test_history.append(f1_test)
            print("f1 test ", round(f1_test, 4))

            predLabels, correctLabels = self.tag_dataset(self.dev_batch, self.model)
            pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels, self.idx2Label)
            self.f1_dev_history.append(f1_dev)
            print("f1 dev ", round(f1_dev, 4), "\n")

        # With zero epochs there is no score to report; reading the loop
        # variable here would raise NameError.
        if self.f1_test_history:
            print("Final F1 test score: ", self.f1_test_history[-1])

        print("Training finished.")

        # save model
        self.modelName = "{}_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(self.epochs,
                                                        self.dropout,
                                                        self.dropout_recurrent,
                                                        self.lstm_state_size,
                                                        self.conv_size,
                                                        self.learning_rate,
                                                        self.char_embedding_size,
                                                        self.optimizer.__class__.__name__,
                                                        self.version,
                                                        self.embedding
                                                       )

        modelName = self.modelName + ".h5"
        self.model.save(os.path.join(LOCATION_POINTER, modelName))
        print("Model weights saved.")

        self.model.set_weights(self.init_weights)  # clear model
        print("Model weights cleared.")

    def writeToFile(self):
        """Write the epoch numbers and both F1 histories to ``<modelName>.txt``.

        The file has three lines (epochs, test F1, dev F1), each formatted
        with ``%.5f`` and space-separated. Requires ``train`` to have run.
        """
        # A plain 2-D array written in one call produces the same three lines
        # as the previous np.matrix row loop, without the discouraged matrix class.
        output = np.array([list(range(self.epochs)), self.f1_test_history, self.f1_dev_history], dtype=float)

        fileName = os.path.join(LOCATION_POINTER, self.modelName + ".txt")
        with open(fileName,'wb') as f:
            np.savetxt(f, output, fmt='%.5f')

        print("Model performance written to file.")

    def saveResults(self):
        """Plot this instance's F1 histories and save them as ``<modelName>.png``.

        Requires ``train`` to have run.
        """
        # Use self rather than the notebook-global `cnn_blstm`, so the method
        # works for any instance regardless of the variable name it is bound to.
        plt.plot(self.f1_test_history, label = "F1 test")
        plt.plot(self.f1_dev_history, label = "F1 dev")
        plt.xlabel("Epochs")
        plt.ylabel("F1 score")
        plt.legend()
        plt.savefig(os.path.join(LOCATION_POINTER, self.modelName + ".png"))

    print("Class initialised.")

Class initialised.


In [8]:
"""Set parameters"""

# Hyper-parameters and run configuration. All values except LOCATION_POINTER
# are passed to the CNN_BLSTM constructor; LOCATION_POINTER is read as a
# global by its methods when building data and output paths.
EPOCHS = 80               # paper: 80
DROPOUT = 0.68           # paper: 0.68
DROPOUT_RECURRENT = 0.0  # not specified in paper, 0.25 recommended; in other papers: 0.0
LSTM_STATE_SIZE = 275    # paper: 275
CONV_SIZE = 3             # paper: 3
# NOTE(review): LEARNING_RATE is not passed to OPTIMIZER below; CNN_BLSTM only
# stores it and writes it into the saved model's file name.
LEARNING_RATE = 0.0105    # paper 0.0105
OPTIMIZER = Nadam()       # paper uses SGD(lr=self.learning_rate), Nadam() recommended
LOCATION_POINTER = '/content/gdrive/My Drive/masters_thesis/'  # base directory on the mounted Drive
CHAR_EMBEDDING = 30   # paper: 25, previously: 30
VERSION = 26          # run identifier; appears in the model and plot file names
EMBEDDING = "elmo"    # label only; appears in the model file name

Instructions for updating:
Colocations handled automatically by placer.


W0407 14:22:44.622028 140320475977600 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [0]:
"""Construct and run model"""

# Full pipeline for one experiment. The order matters: each step reads
# attributes set by the previous one.
cnn_blstm = CNN_BLSTM(EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER, CHAR_EMBEDDING, VERSION, EMBEDDING)
cnn_blstm.loadData()        # read train/dev/test files
cnn_blstm.addCharInfo()     # add per-token character lists
cnn_blstm.embed()           # build vocabularies, index and pad the data
cnn_blstm.createBatches()   # group the indexed data into batches
cnn_blstm.buildModel()      # build and compile the Keras model
cnn_blstm.train()           # train, score per epoch, save the model
cnn_blstm.writeToFile()     # dump the F1 histories to a .txt file
cnn_blstm.saveResults()     # save the F1 curves as a .png file

Elmo shape: ? - ?
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0407 15:01:43.659723 140320475977600 saver.py:1483] Saver not created because there are no variables in the graph to restore


Layer Character_input: (None, None, 52)
Layer Character_embedding: (None, None, 52, 30)
Layer dropout_17: (None, None, 52, 30)
Layer Convolution: (None, None, 52, 30)
Layer Maxpool: (None, None, 1, 30)
Layer words_input: (None, None)
Layer casing_input: (None, None)
Layer Flatten: (None, None, 30)
Layer lambda_9: (None, None, 1024)
Layer embedding_18: (None, None, 8)
Layer dropout_18: (None, None, 30)
Layer concatenate_9: (None, None, 1062)
Layer BLSTM: (None, None, 550)
Layer Softmax_layer: (None, None, 9)
Model built. Saved model.png

Epoch 0/80
Batch len: dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '62', '67', '78', '113'])
f1 test  0.6582
f1 dev  0.6951 

Epoch 1/80
Bat