In [25]:
import numpy as np

from keras import models
from keras import callbacks
from keras import optimizers
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import InputLayer
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Input
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.layers import concatenate
from keras_contrib.layers import CRF

from importlib import reload

from minuet import loader, encoder, datastream, minuet
from minuet import preprocessing as p
from minuet import minuet

# Length limits handed to the encoder helpers when samples are converted to indices.
MAX_SENT_LEN = 10
MAX_WORD_LEN = 7

# Locations of the training split, the development split and the embeddings file.
TRAIN_PATH = './data/pos/train.txt'
DEVEL_PATH = './data/pos/dev.txt'
GLOVE_PATH = './embeddings/wglove.6B.300d.bin'

#### Loading data

In [2]:
# Read the training and development splits (samples X, labels Y).
X_t, Y_t = loader.load_dataset(TRAIN_PATH)
X_v, Y_v = loader.load_dataset(DEVEL_PATH)

# Normalisers composed with p.assemble: the word-level one chains p.lower and
# p.replace_numbers; the character-level one uses p.replace_numbers only.
pre_word = p.assemble(p.lower, p.replace_numbers)
pre_char = p.assemble(p.replace_numbers)

# Both vocabularies are computed from the training split only.
Vw = loader.get_vocabulary(X_t, pre_word)
char2index = loader.get_characters_vocabulary(X_t, pre_char)

# Vw is handed to the embedding loader, presumably to restrict the loaded
# vectors to known words -- confirm in minuet.loader.
word2index, E = loader.load_embeddings(GLOVE_PATH, Vw)

#### Generating indices, encoding samples

In [3]:
# Re-import minuet.encoder so edits made to the module are picked up by this cell.
reload(encoder)

# Word-level indices for the train and dev splits (train is encoded first).
E_sent_t, E_sent_v = (
    encoder.sentence_to_index(split, word2index, pre_word, MAX_SENT_LEN)
    for split in (X_t, X_v)
)

# Character-level indices for the same two splits.
E_word_t, E_word_v = (
    encoder.sentence_to_characters(split, char2index, MAX_WORD_LEN, MAX_SENT_LEN, f=pre_char)
    for split in (X_t, X_v)
)

# Label indices; the encoder returned for the training labels is reused for dev.
Y_train, label_encoder = encoder.encode_labels(Y_t, MAX_SENT_LEN)
Y_valid, _ = encoder.encode_labels(Y_v, MAX_SENT_LEN, label_encoder)

# Only the padded (fixed sentence length) path is implemented.
if not MAX_SENT_LEN:
    raise NotImplementedError(':)')

print('Using padded samples')
# Append a trailing axis of size 1 to the label arrays.
Y_train = np.expand_dims(Y_train, axis=-1)
Y_valid = np.expand_dims(Y_valid, axis=-1)

# Show one dev sample: raw tokens, raw tags, encoded tags.
for sample in (X_v[0], Y_v[0], Y_valid[0]):
    print(sample)

Using padded samples
['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
['NN', ':', 'NNP', 'VB', 'NNP', 'NNP', ',', 'NNP', 'IN', 'DT', 'NN', '.']
[[19]
 [ 7]
 [20]
 [35]
 [20]
 [20]
 [ 5]
 [20]
 [13]
 [10]]


#### Building the model

In [28]:
CHAR_EMBEDDING = 25   # size of each character embedding vector
LSTM_CHAR_SIZE = 16   # hidden units (per direction) of the character-level LSTM
LSTM_CHAR_DROP = 0.5  # input and recurrent dropout rate of the character-level LSTM

def build_character_embedding():
    """Build the character-level branch of the model.

    Every word is given as a sequence of character ids; the ids are embedded and
    each word's character sequence is summarised by a bidirectional LSTM that is
    applied word by word through TimeDistributed.

    Reads the module-level names char2index, CHAR_EMBEDDING, LSTM_CHAR_SIZE and
    LSTM_CHAR_DROP.

    Returns:
        (chars_input, char_embedding): the Keras input tensor for character ids
        and the per-word character representation tensor.
    """
    # shape_in: (batch_size, sentence_words_maxlen, chars_maxlen [ids])
    chars_input = Input(shape=(None, None), name='char_input')
    # NOTE(review): mask_zero=True reserves index 0 for padding; confirm that
    # char2index already accounts for that slot when sizing the embedding.
    char_embedding = Embedding(
        len(char2index), CHAR_EMBEDDING, embeddings_initializer='glorot_uniform',
        mask_zero=True, name='char_embedding'
    )(chars_input)
    # shape_out: (batch_size, sentence_maxlen, chars_maxlen, char_embedding_dim)

    # The LSTM width is LSTM_CHAR_SIZE. It used to be CHAR_EMBEDDING, which tied
    # the hidden size to the embedding dimension and left LSTM_CHAR_SIZE unused.
    char_embedding = TimeDistributed(
        Bidirectional(
            LSTM(
                LSTM_CHAR_SIZE, dropout=LSTM_CHAR_DROP,
                recurrent_dropout=LSTM_CHAR_DROP, name='char_LSTM'
            )
        ),
        name='char_BiLSTM'
    )(char_embedding)
    # shape_out: (batch_size, sentence_words_maxlen, 2 * LSTM_CHAR_SIZE)
    # (Bidirectional concatenates the forward and backward states by default)

    return chars_input, char_embedding

def build_word_embedding(E):
    """Build the word-level branch: a word-id input plus a frozen embedding lookup.

    Args:
        E: 2-D embedding matrix; E.shape[0] is used as the number of rows of the
           lookup table and E.shape[1] as the embedding size.

    Returns:
        (words_input, word_embedding): the Keras input tensor for word ids and
        the embedded sequence tensor.
    """
    n_rows = E.shape[0]
    n_dims = E.shape[1]

    # shape_in: (batch_size, sentence_words_maxlen)
    words_input = Input(shape=(None,), name='sent_input')

    # Initialised from E and not trainable; mask_zero=True masks id 0.
    lookup = Embedding(
        n_rows,
        n_dims,
        weights=[E],
        trainable=False,
        mask_zero=True,
        name='word_embedding',
    )
    word_embedding = lookup(words_input)
    # shape_out: (batch_size, sentence_words_maxlen, word_embedding_dim)

    return words_input, word_embedding

def build_sentence_lstm(word_embedding, char_embedding, lstm_size, dropout_proba, bidirectional):
    """Run a (bi)LSTM over the per-word representations of a sentence.

    Args:
        word_embedding: word-level representation tensor, or None to use only
            the character-level representation.
        char_embedding: character-level representation tensor, or None to use
            only the word-level representation.
        lstm_size: number of hidden units of the sentence LSTM (per direction).
        dropout_proba: rate used for both input and recurrent dropout.
        bidirectional: when truthy, wrap the LSTM in Bidirectional.

    Returns:
        The sequence of LSTM outputs, one vector per word.

    Raises:
        ValueError: if both word_embedding and char_embedding are None.
    """
    # Fail early with a clear message instead of passing None into the LSTM.
    if word_embedding is None and char_embedding is None:
        raise ValueError(
            'At least one of word_embedding or char_embedding must be provided'
        )

    if char_embedding is None:
        word_representations = word_embedding
    elif word_embedding is None:
        word_representations = char_embedding
    else:
        # Both available: join them along the feature axis.
        word_representations = concatenate([word_embedding, char_embedding], axis=-1)

    # shape_in: (batch_size, sentence_words_maxlen, [word_embedding_dim + char_embedding_dim] = d)
    lstm = LSTM(
        lstm_size, dropout=dropout_proba, recurrent_dropout=dropout_proba,
        return_sequences=True, name='sent_LSTM'
    )

    if bidirectional:
        lstm = Bidirectional(lstm, name='sent_BiLSTM')
    sentence_representations = lstm(word_representations)
    # shape_out: (batch_size, sentence_words_maxlen, lstm_hidden_size*(1 or 2))

    return sentence_representations

def build_softmax_output(sentence_representations, n_labels):
    """Attach a softmax classification head to the sentence representations.

    Args:
        sentence_representations: tensor produced by the sentence LSTM.
        n_labels: number of output classes.

    Returns:
        (out, loss, metrics): the output tensor, the name of the loss to compile
        with, and the list of metric names.
    """
    # shape_in: (batch_size, sentence_words_maxlen, lstm_hidden_size*(1 or 2))
    # Dense operates on the last axis, so the same classifier is applied to
    # every word position.
    classifier = Dense(n_labels, activation='softmax', name='softmax')
    out = classifier(sentence_representations)
    # shape_out: (batch_size, sentence_words_maxlen, n_classes)

    loss_name = 'sparse_categorical_crossentropy'
    metric_names = ['sparse_categorical_accuracy']
    return out, loss_name, metric_names

def build_crf_output(sentence_representations, n_labels):
    """Attach a CRF head (keras_contrib) to the sentence representations.

    Args:
        sentence_representations: tensor produced by the sentence LSTM.
        n_labels: number of output classes.

    Returns:
        (out, loss, metrics): the CRF output tensor, the layer's own loss
        function and a one-element list holding the layer's accuracy metric.
    """
    # sparse_target=True: the layer is configured for integer label targets.
    crf_layer = CRF(n_labels, sparse_target=True)
    decoded = crf_layer(sentence_representations)

    # Loss and metric are read from the layer after it has been applied.
    return decoded, crf_layer.loss_function, [crf_layer.accuracy]


# Assemble the tagger: word branch + character branch -> sentence BiLSTM -> softmax.
words_input, word_embedding = build_word_embedding(E)
chars_input, char_embedding = build_character_embedding()
sentence_vectors = build_sentence_lstm(
    word_embedding,
    char_embedding,
    lstm_size=30,
    dropout_proba=0.5,
    bidirectional=True,
)
out, loss, acc = build_softmax_output(
    sentence_vectors, n_labels=len(label_encoder.classes_)
)
# Alternative CRF head:
# out, loss, acc = build_crf_output(sentence_vectors, len(label_encoder.classes_))

model = models.Model(inputs=[words_input, chars_input], outputs=[out])

# Adam with gradient-norm clipping at 5.0.
opt = optimizers.Adam(clipnorm=5.0)
model.compile(optimizer=opt, loss=loss, metrics=acc)
model.summary()


# Quick check on a single training sample: print the prediction's shape.
w = model.predict([E_sent_t[:1], E_word_t[:1]])
print(w.shape)

# Train, evaluating on the development split after each epoch.
model.fit(
    x=[E_sent_t, E_word_t],
    y=Y_train,
    batch_size=32,
    epochs=5,
    validation_data=([E_sent_v, E_word_v], Y_valid),
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, None)   0                                            
__________________________________________________________________________________________________
sent_input (InputLayer)         (None, None)         0                                            
__________________________________________________________________________________________________
char_embedding (Embedding)      (None, None, None, 2 1925        char_input[0][0]                 
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, None, 300)    4793100     sent_input[0][0]                 
__________________________________________________________________________________________________
char_BiLST

<keras.callbacks.History at 0x7fd700f903c8>