In [1]:
"""Load packages"""
import gc
import numpy as np
from validation import compute_f1
from keras.models import Model, load_model
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding, Input, Dropout, LSTM, GRU, Bidirectional, MaxPooling1D, \
    Flatten, concatenate
from prepro import readfile, createBatches, createMatrices, iterate_minibatches, addCharInformation, padding, getCasing
from keras.utils import plot_model
from keras.initializers import RandomUniform
from keras.optimizers import SGD, Nadam, Adam, RMSprop
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
class CNN_BLSTM(object):
    """Sequence tagger: character CNN + word/casing embeddings + stacked
    bidirectional GRUs + CRF output layer.

    ``__init__`` only stores hyperparameters. Before ``buildModel`` is called
    the caller must assign ``char2Idx``, ``label2Idx``, ``wordEmbeddings`` and
    ``caseEmbeddings`` on the instance, because ``buildModel`` reads them.
    """

    def __init__(self, EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER):
        """Store the hyperparameters on the instance.

        Args:
            EPOCHS: stored as ``self.epochs``; not used by any method in this class.
            DROPOUT: rate for the ``Dropout`` layers and the GRU input dropout.
            DROPOUT_RECURRENT: rate for the GRU recurrent dropout.
            LSTM_STATE_SIZE: number of units in each GRU direction.
            CONV_SIZE: kernel size of the character-level ``Conv1D``.
            LEARNING_RATE: stored as ``self.learning_rate``; ``buildModel`` does
                not apply it, the optimizer object is used exactly as given.
            OPTIMIZER: optimizer passed unchanged to ``model.compile``.
        """
        self.epochs = EPOCHS
        self.dropout = DROPOUT
        self.dropout_recurrent = DROPOUT_RECURRENT
        self.lstm_state_size = LSTM_STATE_SIZE
        self.conv_size = CONV_SIZE
        self.learning_rate = LEARNING_RATE
        self.optimizer = OPTIMIZER

    def convertIOB(self, sentence):
        """Turn a raw sentence into a one-sentence dataset with placeholder labels.

        The sentence is split on runs of whitespace, so repeated, leading or
        trailing blanks never produce empty tokens. Each token is paired with
        the dummy label "O" followed by a newline, and the result is stored in
        ``self.Sentences`` as a list holding that single sentence. An empty or
        whitespace-only string yields one sentence with no tokens.
        """
        self.Sentences = [[[word, "O\n"] for word in sentence.split()]]

    def addCharInfo(self):
        """Replace ``self.Sentences`` with the output of ``addCharInformation``."""
        # format: [['EU', ['E', 'U'], 'B-ORG\n'], ...]
        self.Sentences = addCharInformation(self.Sentences)

    def createBatches(self):
        """Create batches from ``self.set`` into ``self.batch`` / ``self.batch_len``.

        Nothing in this class assigns ``self.set``; the caller has to set it
        before invoking this method.
        """
        self.batch, self.batch_len = createBatches(self.set)

    def tag_dataset(self, dataset, model):
        """Predict a label index for every token of every sentence in ``dataset``.

        Args:
            dataset: iterable of ``(tokens, casing, char, labels)`` entries.
            model: object whose ``predict`` accepts ``[tokens, casing, char]``.

        Returns:
            ``(predLabels, correctLabels)``: per-sentence predicted label
            indices and the ``labels`` entries taken unchanged from ``dataset``.
        """
        correctLabels = []
        predLabels = []
        for tokens, casing, char, labels in dataset:
            # Wrap every feature in an outer list: one sentence == a batch of size 1.
            batch_inputs = [np.asarray([tokens]), np.asarray([casing]), np.asarray([char])]
            scores = model.predict(batch_inputs, verbose=False)[0]
            predLabels.append(scores.argmax(axis=-1))  # highest-scoring class per token
            correctLabels.append(labels)
        return predLabels, correctLabels

    def buildModel(self):
        """Build and compile the network; store it in ``self.model``.

        Reads ``self.char2Idx``, ``self.label2Idx``, ``self.wordEmbeddings`` and
        ``self.caseEmbeddings``. Also snapshots the freshly built weights in
        ``self.init_weights``.
        """
        # Width of the per-word character axis. It doubles as the max-pool window,
        # so pooling collapses each word's characters to a single vector.
        char_size = 52

        # Character branch: embed -> dropout -> convolve -> pool over the whole word.
        character_input = Input(shape=(None, char_size,), name="Character_input")
        embed_char_out = TimeDistributed(
            Embedding(len(self.char2Idx), 30,
                      embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)),
            name="Character_embedding")(character_input)
        dropout = Dropout(self.dropout)(embed_char_out)

        conv1d_out = TimeDistributed(
            Conv1D(kernel_size=self.conv_size, filters=30, padding='same', activation='tanh', strides=1),
            name="Convolution")(dropout)
        maxpool_out = TimeDistributed(MaxPooling1D(char_size), name="Maxpool")(conv1d_out)
        char = TimeDistributed(Flatten(), name="Flatten")(maxpool_out)
        char = Dropout(self.dropout)(char)

        # Word branch: frozen embedding initialised from self.wordEmbeddings.
        words_input = Input(shape=(None,), dtype='int32', name='words_input')
        words = Embedding(input_dim=self.wordEmbeddings.shape[0],
                          output_dim=self.wordEmbeddings.shape[1],
                          weights=[self.wordEmbeddings],
                          trainable=False)(words_input)

        # Casing branch: frozen embedding initialised from self.caseEmbeddings.
        casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
        casing = Embedding(output_dim=self.caseEmbeddings.shape[1],
                           input_dim=self.caseEmbeddings.shape[0],
                           weights=[self.caseEmbeddings],
                           trainable=False)(casing_input)

        # Concatenate the three feature streams and run two stacked bidirectional
        # GRU layers. The layer names still say "BLSTM" although the cells are
        # GRUs; they are left untouched so that anything keyed on layer names
        # keeps matching.
        output = concatenate([words, char, casing])
        for layer_name in ("BLSTM", "BLSTM2"):
            output = Bidirectional(GRU(self.lstm_state_size,
                                       return_sequences=True,
                                       dropout=self.dropout,                     # on the input to each GRU cell
                                       recurrent_dropout=self.dropout_recurrent  # on the recurrent signal
                                       ), name=layer_name)(output)

        output = TimeDistributed(Dense(50, activation="relu"), name="Time_Dense_Layer")(output)  # a dense layer as suggested by neuralNer

        # CRF output layer, one state per entry of label2Idx.
        crf = CRF(len(self.label2Idx), sparse_target=True)
        out = crf(output)

        self.model = Model(inputs=[words_input, casing_input, character_input], outputs=[out])
        self.model.compile(optimizer=self.optimizer, loss=crf_loss, metrics=[crf.accuracy, crf_viterbi_accuracy])

        # Keep a copy of the freshly built weights.
        self.init_weights = self.model.get_weights()

In [3]:
# Hyperparameters handed to CNN_BLSTM. The checkpoint loaded later in this
# notebook ('1_0.3_0.25_275_3_0.0105_Nadam_...h5') appears to encode these same
# values in its filename, so the architecture-related ones presumably have to
# stay in sync with it -- TODO confirm.
EPOCHS = 1               # paper: 80 (only stored on the instance; nothing is trained in this notebook)
DROPOUT = 0.3             # paper: 0.68
DROPOUT_RECURRENT = 0.25  # not specified in paper, 0.25 recommended
LSTM_STATE_SIZE = 275     # paper: 275 (units per direction of each recurrent layer)
CONV_SIZE = 3             # paper: 3 (kernel size of the character convolution)
LEARNING_RATE = 0.0105    # paper: 0.0105 (NOTE: not passed to Nadam() below, which is built with its defaults)
OPTIMIZER = Nadam()       # paper uses SGD(lr=self.learning_rate), Nadam() recommended

Instructions for updating:
Colocations handled automatically by placer.


In [4]:
cnn_blstm = CNN_BLSTM(EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER)

import pickle

# (attribute on the tagger, pickle file that holds its value), restored in this order.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles you produced yourself.
pickled_attributes = (
    ('wordEmbeddings', 'cnn_blstm_wordEmbeddings.pickle'),
    ('caseEmbeddings', 'cnn_blstm_caseEmbeddings.pickle'),
    ('word2Idx', 'cnn_blstm_word2Idx.pickle'),
    ('label2Idx', 'cnn_blstm_label2Idx.pickle'),
    ('case2Idx', 'cnn_blstm_case2Idx.pickle'),
    ('char2Idx', 'cnn_blstm_char2Idx.pickle'),
)

for attribute_name, pickle_path in pickled_attributes:
    with open(pickle_path, 'rb') as f:
        setattr(cnn_blstm, attribute_name, pickle.load(f))

In [5]:
# Push one raw sentence through the tagger's preprocessing steps. The two calls
# mutate cnn_blstm.Sentences in place and must run in this order.
example = "Ali ata bak"
cnn_blstm.convertIOB(example)  # tokenises and attaches placeholder "O" labels
cnn_blstm.addCharInfo()        # adds the per-word character lists
cnn_blstm.Sentences[0]         # display the single preprocessed sentence

[['Ali', ['A', 'l', 'i'], 'O\n'],
 ['ata', ['a', 't', 'a'], 'O\n'],
 ['bak', ['b', 'a', 'k'], 'O\n']]

In [6]:
# Encode the example sentence with the lookup tables loaded above, then run it
# through prepro.padding (presumably pads the character axis to the width the
# model expects -- TODO confirm against prepro).
prediction_matrices = createMatrices(
    cnn_blstm.Sentences,
    cnn_blstm.word2Idx,
    cnn_blstm.label2Idx,
    cnn_blstm.case2Idx,
    cnn_blstm.char2Idx,
)
prediction_set = padding(prediction_matrices)
prediction_batch, prediction_batch_len = createBatches(prediction_set)

# Rebuild the network, then restore the trained weights from the checkpoint.
cnn_blstm.buildModel()
cnn_blstm.model.load_weights('1_0.3_0.25_275_3_0.0105_Nadam_0.8086624203821656.h5')

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [7]:
# Tag the example sentence. tag_dataset returns (predicted labels, gold labels);
# only the predictions are kept, since the gold labels here stem from the
# placeholder labels attached by convertIOB. The tuple is indexed rather than
# unpacked into `_`, because rebinding `_` at notebook top level stops IPython
# from updating its last-output variable for the rest of the session.
predLabels = cnn_blstm.tag_dataset(prediction_batch, cnn_blstm.model)[0]