<a href="https://colab.research.google.com/github/mon3/deepNN_NER/blob/master/masters_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install PyDrive; later cells import pydrive.auth / pydrive.drive to fetch helper modules.
!pip install -q pydrive

In [0]:
# Mount Google Drive at /content/gdrive; LOCATION_POINTER (set in the parameter cell) points below this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the resulting default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds the name `drive`, which the mount cell imported from google.colab;
# from here on `drive` is the PyDrive client, so re-run the mount cell's import before calling drive.mount again.
drive = GoogleDrive(gauth)

In [0]:
# Fetch the Drive file with this id and store it locally as preprocess.py (imported in the "Load packages" cell).
your_module = drive.CreateFile({'id':'1lBy0_BsIQ_gzuCkM-P4MWbstIZFkHj8L'})
your_module.GetContentFile('preprocess.py')


In [0]:
# Fetch the Drive file with this id and store it locally as validation.py (provides compute_f1).
validation_module = drive.CreateFile({'id': '1nEox1MVwcpP3Fu5RGJzhKi358x9jTA2h'})
validation_module.GetContentFile('validation.py')

In [0]:
"""Load packages"""

import matplotlib.pyplot as plt
import numpy as np
import os

from validation import compute_f1
from keras.models import Model, load_model
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding, Input, Dropout, LSTM, Bidirectional, MaxPooling1D, \
    Flatten, concatenate
from preprocess import readfile, createBatches, createMatrices, iterate_minibatches, addCharInformation, padding
from keras.utils import plot_model
from keras.initializers import RandomUniform
from keras.optimizers import SGD, Nadam

In [0]:
"""Set parameters"""

# Hyper-parameters and run configuration passed to CNN_BLSTM in the final cell.
EPOCHS = 80               # paper: 80
DROPOUT = 0.68           # paper: 0.68
DROPOUT_RECURRENT = 0.25  # not specified in paper, 0.25 recommended; in other papers: 0.0
LSTM_STATE_SIZE = 275    # paper: 275
CONV_SIZE = 3             # paper: 3
LEARNING_RATE = 0.0105    # paper 0.0105; NOTE: not passed to OPTIMIZER below, it only ends up in the output file names
OPTIMIZER = Nadam()       # paper uses SGD(lr=self.learning_rate), Nadam() recommended
LOCATION_POINTER = '/content/gdrive/My Drive/masters_thesis/'  # root folder for data/, embeddings/ and all output files
CHAR_EMBEDDING = 30   # paper: 25, previously: 30
VERSION = 9               # run identifier used in output file names
EMBEDDING = "fastText"    # "fastText" selects the fastText vectors; any other value selects GloVe 6B 50d

In [0]:
"""Initialise class"""

class CNN_BLSTM(object):
    """Sequence tagger (BIO labels) built from a character-level CNN and a word-level BLSTM.

    Intended call order: ``loadData`` -> ``addCharInfo`` -> ``embed`` ->
    ``createBatches`` -> ``buildModel`` -> ``train`` -> ``writeToFile`` ->
    ``saveResults``.  All input and output files live below the module-level
    ``LOCATION_POINTER`` directory.
    """

    def __init__(self, EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER, CHAR_EMBEDDING, VERSION, EMBEDDING):
        """Store the hyper-parameters of one experiment.

        Args:
            EPOCHS: number of passes over the training batches in ``train``.
            DROPOUT: rate of the ``Dropout`` layers and of the LSTM input dropout.
            DROPOUT_RECURRENT: recurrent dropout rate of the LSTM.
            LSTM_STATE_SIZE: number of units of each LSTM direction.
            CONV_SIZE: kernel size of the character-level ``Conv1D``.
            LEARNING_RATE: only recorded in the output file names; the
                optimizer is used exactly as passed in.
            OPTIMIZER: optimizer instance handed to ``model.compile``.
            CHAR_EMBEDDING: output dimension of the character embedding.
            VERSION: run identifier used in output file names.
            EMBEDDING: ``"fastText"`` selects the fastText vectors; any other
                value selects the GloVe 6B 50-d vectors.
        """
        self.epochs = EPOCHS
        self.dropout = DROPOUT
        self.dropout_recurrent = DROPOUT_RECURRENT
        self.lstm_state_size = LSTM_STATE_SIZE
        self.conv_size = CONV_SIZE
        self.learning_rate = LEARNING_RATE
        self.optimizer = OPTIMIZER
        self.char_embedding_size = CHAR_EMBEDDING
        self.version = VERSION
        self.embedding = EMBEDDING

    def loadData(self):
        """Read the train, dev and test splits from ``LOCATION_POINTER/data``."""
        self.trainSentences = readfile(os.path.join(LOCATION_POINTER, "data/train.txt"))
        self.devSentences = readfile(os.path.join(LOCATION_POINTER, "data/dev.txt"))
        self.testSentences = readfile(os.path.join(LOCATION_POINTER, "data/test.txt"))

    def addCharInfo(self):
        """Add the per-token character lists to every split (in place on the attributes)."""
        # format: [['EU', ['E', 'U'], 'B-ORG\n'], ...]
        self.trainSentences = addCharInformation(self.trainSentences)
        self.devSentences = addCharInformation(self.devSentences)
        self.testSentences = addCharInformation(self.testSentences)

    def embed(self):
        """Create the index mappings, embedding matrices and padded index datasets.

        Sets ``label2Idx``/``idx2Label``, ``caseEmbeddings``, ``wordEmbeddings``,
        ``char2Idx`` and ``train_set``/``dev_set``/``test_set``.
        """
        labelSet = set()
        words = {}

        # unique (lower-cased) words and labels across all three splits
        for dataset in [self.trainSentences, self.devSentences, self.testSentences]:
            for sentence in dataset:
                for token, char, label in sentence:
                    # token ... token, char ... list of chars, label ... BIO labels
                    labelSet.add(label)
                    words[token.lower()] = True

        # mapping for labels; sorted so that the indices do not depend on set
        # iteration order (string hashing is randomised per interpreter run),
        # which keeps a saved model's outputs decodable in a later session
        self.label2Idx = {}
        for label in sorted(labelSet):
            self.label2Idx[label] = len(self.label2Idx)

        # mapping for token cases
        case2Idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5,
                    'contains_digit': 6, 'PADDING_TOKEN': 7}
        # one-hot (identity) embedding matrix for the token cases
        self.caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

        # read the pre-trained word embeddings (fastText or GloVe)
        word2Idx = {}
        self.wordEmbeddings = []

        # ToDO: test with 300-dim vectors (GloVE 42B, GloVE 84B)
        if self.embedding == "fastText":
            embeddingFile = "embeddings/wiki-news-300d-1M.vec"
        else:
            embeddingFile = "embeddings/glove.6B.50d.txt"

        # context manager so the (large) embeddings file is always closed
        with open(os.path.join(LOCATION_POINTER, embeddingFile), encoding="utf-8") as fEmbeddings:
            for i, line in enumerate(fEmbeddings):
                if i == 0 and self.embedding == "fastText":
                    continue  # skip the first line of the .vec file (header, not a word vector)

                split = line.strip().split(" ")  # [word, v1, v2, ...]

                if len(word2Idx) == 0:  # first vector line: add padding + unknown entries
                    word2Idx["PADDING_TOKEN"] = len(word2Idx)
                    vector = np.zeros(len(split) - 1)  # zero vector for the padding token
                    self.wordEmbeddings.append(vector)

                    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                    vector = np.random.uniform(-0.25, 0.25, len(split) - 1)  # random vector for unknown tokens
                    self.wordEmbeddings.append(vector)

                # keep only vectors whose lower-cased word occurs in the data
                if split[0].lower() in words:
                    vector = np.array([float(num) for num in split[1:]])
                    self.wordEmbeddings.append(vector)
                    word2Idx[split[0]] = len(word2Idx)  # next free row index

        self.wordEmbeddings = np.array(self.wordEmbeddings)

        # dictionary of all possible characters
        self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
            self.char2Idx[c] = len(self.char2Idx)  # 2,3,4 ...

        # format: [[wordindices], [caseindices], [padded word indices], [label indices]]
        self.train_set = padding(createMatrices(self.trainSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx))
        self.dev_set = padding(createMatrices(self.devSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx))
        self.test_set = padding(createMatrices(self.testSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx))

        self.idx2Label = {v: k for k, v in self.label2Idx.items()}  # inverse of label2Idx

    def createBatches(self):
        """Batch every padded split with the module-level ``createBatches`` helper.

        Stores both return values of the helper per split
        (``*_batch`` and ``*_batch_len``).
        """
        self.train_batch, self.train_batch_len = createBatches(self.train_set)
        self.dev_batch, self.dev_batch_len = createBatches(self.dev_set)
        self.test_batch, self.test_batch_len = createBatches(self.test_set)

    def tag_dataset(self, dataset, model):
        """Predict label indices for every item of ``dataset``.

        Args:
            dataset: iterable of ``(tokens, casing, char, labels)`` items.
            model: object whose ``predict`` accepts ``[tokens, casing, char]``
                with a leading batch axis of size one.

        Returns:
            ``(predLabels, correctLabels)``: the per-item arg-max predictions
            and the untouched gold ``labels``, in dataset order.
        """
        correctLabels = []
        predLabels = []
        for tokens, casing, char, labels in dataset:
            # the model works on batches, so wrap each item in a batch of one
            inputs = [np.asarray([tokens]), np.asarray([casing]), np.asarray([char])]
            pred = model.predict(inputs, verbose=False)[0]
            predLabels.append(pred.argmax(axis=-1))  # most probable class per position
            correctLabels.append(labels)
        return predLabels, correctLabels

    def buildModel(self):
        """Assemble and compile the Keras model and snapshot its initial weights.

        Sets ``self.model`` and ``self.init_weights`` and writes a diagram of
        the model to ``model_<version>.png`` below ``LOCATION_POINTER``.
        """
        # character input: a variable number of tokens, each given as 52 character indices
        # (presumably the padded word length produced by preprocess.padding -- TODO confirm)
        character_input = Input(shape=(None, 52,), name="Character_input")
        embed_char_out = TimeDistributed(
            Embedding(len(self.char2Idx), self.char_embedding_size, embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name="Character_embedding")(
            character_input)

        dropout = Dropout(self.dropout)(embed_char_out)

        # CNN over the characters of each token
        conv1d_out = TimeDistributed(Conv1D(kernel_size=self.conv_size, filters=30, padding='same', activation='tanh', strides=1), name="Convolution")(dropout)
        # pool_size=52 spans all character positions of a token, leaving one value per filter
        maxpool_out = TimeDistributed(MaxPooling1D(52), name="Maxpool")(conv1d_out)
        char = TimeDistributed(Flatten(), name="Flatten")(maxpool_out)
        char = Dropout(self.dropout)(char)

        # word-level input
        words_input = Input(shape=(None,), dtype='int32', name='words_input')
        words = Embedding(input_dim=self.wordEmbeddings.shape[0], output_dim=self.wordEmbeddings.shape[1], weights=[self.wordEmbeddings],
                          trainable=False)(words_input)  # trainable=False since we provide word embeddings

        # case-info input
        casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
        casing = Embedding(output_dim=self.caseEmbeddings.shape[1], input_dim=self.caseEmbeddings.shape[0], weights=[self.caseEmbeddings],
                           trainable=False)(casing_input)  # trainable=False since we provide case embeddings

        # concat & BLSTM
        output = concatenate([words, casing, char])
        output = Bidirectional(LSTM(self.lstm_state_size,
                                    return_sequences=True,
                                    dropout=self.dropout,                        # on input to each LSTM block
                                    recurrent_dropout=self.dropout_recurrent     # on recurrent input signal
                                   ), name="BLSTM")(output)
        output = TimeDistributed(Dense(len(self.label2Idx), activation='softmax'), name="Softmax_layer")(output)

        # set up model
        self.model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])

        self.model.compile(loss='sparse_categorical_crossentropy', optimizer=self.optimizer)

        # kept so that train() can reset the model after saving it
        self.init_weights = self.model.get_weights()

        plot_model(self.model, to_file=os.path.join(LOCATION_POINTER, 'model_{}.png'.format(self.version)))

        print("Model built. Saved model.png\n")

    def train(self):
        """Train for ``self.epochs`` epochs, tracking test and dev F1 after each one.

        Fills ``f1_test_history`` / ``f1_dev_history``, sets ``modelName``,
        saves the trained model as ``<modelName>.h5`` below
        ``LOCATION_POINTER`` and finally resets the in-memory model to the
        weights captured in ``buildModel``.
        """
        self.f1_test_history = []
        self.f1_dev_history = []

        for epoch in range(self.epochs):
            print("Epoch {}/{}".format(epoch, self.epochs))
            # each minibatch is unpacked as (labels, word indices, case indices, char indices)
            for batch in iterate_minibatches(self.train_batch, self.train_batch_len):
                labels, tokens, casing, char = batch
                self.model.train_on_batch([tokens, casing, char], labels)

            # compute F1 scores
            predLabels, correctLabels = self.tag_dataset(self.test_batch, self.model)
            _, _, f1_test = compute_f1(predLabels, correctLabels, self.idx2Label)
            self.f1_test_history.append(f1_test)
            print("f1 test ", round(f1_test, 4))

            predLabels, correctLabels = self.tag_dataset(self.dev_batch, self.model)
            _, _, f1_dev = compute_f1(predLabels, correctLabels, self.idx2Label)
            self.f1_dev_history.append(f1_dev)
            print("f1 dev ", round(f1_dev, 4), "\n")

        # read from the history so that a run with zero epochs does not fail
        if self.f1_test_history:
            print("Final F1 test score: ", self.f1_test_history[-1])

        print("Training finished.")

        # save model
        self.modelName = "{}_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(self.epochs,
                                                        self.dropout,
                                                        self.dropout_recurrent,
                                                        self.lstm_state_size,
                                                        self.conv_size,
                                                        self.learning_rate,
                                                        self.char_embedding_size,
                                                        self.optimizer.__class__.__name__,
                                                        self.version,
                                                        self.embedding
                                                       )

        modelName = self.modelName + ".h5"
        self.model.save(os.path.join(LOCATION_POINTER, modelName))
        print("Model weights saved.")

        self.model.set_weights(self.init_weights)  # clear model
        print("Model weights cleared.")

    def writeToFile(self):
        """Write the F1 histories of the last ``train`` run to ``<modelName>.txt``.

        The file has three rows (epoch index, test F1, dev F1) with one
        column per evaluated epoch, each value formatted as ``%.5f``.
        """
        epochIdx = list(range(len(self.f1_test_history)))
        # plain 2-D array instead of the discouraged np.matrix; savetxt writes one row per line
        output = np.array([epochIdx, self.f1_test_history, self.f1_dev_history], dtype='float64')

        fileName = os.path.join(LOCATION_POINTER, self.modelName + ".txt")
        with open(fileName, 'wb') as f:
            np.savetxt(f, output, fmt='%.5f')

        print("Model performance written to file.")

    def saveResults(self):
        """Plot the test and dev F1 histories and save the figure as ``<modelName>.png``."""
        # use this instance's histories rather than a notebook-level global
        plt.plot(self.f1_test_history, label="F1 test")
        plt.plot(self.f1_dev_history, label="F1 dev")
        plt.xlabel("Epochs")
        plt.ylabel("F1 score")
        plt.legend()
        plt.savefig(os.path.join(LOCATION_POINTER, self.modelName + ".png"))

    print("Class initialised.")

Class initialised.


In [0]:
"""Construct and run model"""

# Run the full pipeline; each step relies on attributes set by the previous ones, so the order matters.
cnn_blstm = CNN_BLSTM(EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER, CHAR_EMBEDDING, VERSION, EMBEDDING)
cnn_blstm.loadData()       # read train/dev/test sentences
cnn_blstm.addCharInfo()    # add per-token character lists
cnn_blstm.embed()          # index mappings, embedding matrices, padded datasets
cnn_blstm.createBatches()  # batch the padded datasets
cnn_blstm.buildModel()     # build + compile the Keras model
cnn_blstm.train()          # train, evaluate per epoch, save the model, reset weights
cnn_blstm.writeToFile()    # write F1 histories to <modelName>.txt
cnn_blstm.saveResults()    # save F1 plot to <modelName>.png

Model built. Saved model.png

Epoch 0/80
f1 test  0.4167
f1 dev  0.436 

Epoch 1/80
f1 test  0.6461
f1 dev  0.6847 

Epoch 2/80
f1 test  0.647
f1 dev  0.6837 

Epoch 3/80
f1 test  0.742
f1 dev  0.7425 

Epoch 4/80
f1 test  0.7685
f1 dev  0.7948 

Epoch 5/80
f1 test  0.782
f1 dev  0.8153 

Epoch 6/80
f1 test  0.7652
f1 dev  0.7728 

Epoch 7/80
f1 test  0.821
f1 dev  0.8297 

Epoch 8/80
f1 test  0.8205
f1 dev  0.8286 

Epoch 9/80
f1 test  0.8213
f1 dev  0.8282 

Epoch 10/80
f1 test  0.8152
f1 dev  0.8334 

Epoch 11/80
f1 test  0.8416
f1 dev  0.8592 

Epoch 12/80
f1 test  0.8283
f1 dev  0.8496 

Epoch 13/80
f1 test  0.8515
f1 dev  0.874 

Epoch 14/80
f1 test  0.8478
f1 dev  0.8667 

Epoch 15/80
f1 test  0.861
f1 dev  0.886 

Epoch 16/80
f1 test  0.828
f1 dev  0.8456 

Epoch 17/80
f1 test  0.865
f1 dev  0.8932 

Epoch 18/80
f1 test  0.8659
f1 dev  0.8814 

Epoch 19/80
f1 test  0.8709
f1 dev  0.8996 

Epoch 20/80
f1 test  0.8686
f1 dev  0.8935 

Epoch 21/80
f1 test  0.8684
f1 dev  0.8923 

