In [316]:
from __future__ import print_function

import codecs
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

import numpy as np


In [317]:
#CoNLL-2000 shared task

def load_data(path):
    """Read a CoNLL-2000 style file into a list of (token, pos, chunk) triples.

    Every non-blank line is expected to hold exactly three
    whitespace-separated fields. Blank lines and lines that do not split
    into exactly three fields are skipped.

    Parameters
    ----------
    path : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    list of tuple
        One ``(token, pos, chunk)`` tuple per well-formed line, in file order.
    """
    data = []
    # The context manager closes the handle even if reading fails part-way.
    with codecs.open(path, 'r', 'utf8') as handle:
        for line in handle:
            fields = line.split()
            # Explicit length check instead of a bare `except: pass`: only
            # malformed/blank lines are skipped, real errors (I/O, decoding,
            # KeyboardInterrupt) still surface.
            if len(fields) != 3:
                continue
            data.append(tuple(fields))
    return data
        
# Load both splits up front: a later cell unpacks `test_data`, which was
# previously never defined (the notebook only ran thanks to hidden kernel state).
train_data = load_data('data/seq/train.txt')
# NOTE(review): test split assumed to live next to train.txt — confirm the path.
test_data = load_data('data/seq/test.txt')

# Sanity check: corpus size and the first few (token, pos, chunk) rows.
print(len(train_data))
for i in train_data[:10]:
    print(' - '.join(i))

211727
Confidence - NN - B-NP
in - IN - B-PP
the - DT - B-NP
pound - NN - I-NP
is - VBZ - B-VP
widely - RB - I-VP
expected - VBN - I-VP
to - TO - I-VP
take - VB - I-VP
another - DT - B-NP


In [318]:
# Transpose the lists of (token, pos, chunk) triples into three parallel
# tuples per split.
train_tokens, train_pos, train_chunk = zip(*train_data)
test_tokens, test_pos, test_chunk = zip(*test_data)

In [319]:
# Map POS tag strings to integer class ids; the encoder is fitted on the
# training tags only.
tag_encoder = LabelEncoder()
tag_encoder.fit(train_pos)
print('Total nb tags:', len(tag_encoder.classes_))

# Integer ids -> one-hot rows, as required by categorical_crossentropy.
y_train_pos = tag_encoder.transform(train_pos)

Y_train_pos = np_utils.to_categorical(y_train_pos,
                                  nb_classes=len(tag_encoder.classes_))

# Encode the held-out tags with the *training* encoder so class ids line up.
# These are the validation targets used by the fit() calls further down,
# which previously referenced an undefined `Y_test_pos`.
# transform() raises ValueError if the test split contains a tag that was
# never seen in training.
y_test_pos = tag_encoder.transform(test_pos)

Y_test_pos = np_utils.to_categorical(y_test_pos,
                                 nb_classes=len(tag_encoder.classes_))

Total nb tags: 44


In [320]:
from collections import Counter

# Token frequencies over the training split.
vocab = Counter(train_tokens)

# Index 0 is reserved: vectorisation maps any token missing from `indexer`
# to 0, so 0 doubles as the unknown/padding id.
indexer = {'unk': 0}

# Assign ids 1..N in descending frequency order.
# The membership guard matters: without it, a corpus token equal to the
# reserved key would be re-assigned to len(indexer) *without* growing the
# dict, and the next new token would then receive that same id.
for token, _ in vocab.most_common():
    if token not in indexer:
        indexer[token] = len(indexer)

In [321]:
# Context window size: tokens kept to the left / right of the focus token.
nb_left = 2
nb_right = 1

def vectorize(tokens, index=None, left=None, right=None):
    """Turn a token sequence into fixed-width windows of vocabulary ids.

    For every position the window consists of ``left`` preceding tokens, the
    token itself and ``right`` following tokens. Positions that fall outside
    the sequence are padded with ``'<unk>'``; any token missing from
    ``index`` (including the padding token) is mapped to 0.

    Parameters
    ----------
    tokens : sequence of str
        The token stream (list or tuple).
    index : dict, optional
        Token -> integer id mapping. Defaults to the module-level ``indexer``.
    left, right : int, optional
        Window size on each side. Default to the module-level ``nb_left``
        and ``nb_right``.

    Returns
    -------
    numpy.ndarray
        int32 array of shape ``(len(tokens), left + 1 + right)``.
    """
    if index is None:
        index = indexer
    if left is None:
        left = nb_left
    if right is None:
        right = nb_right

    # Slicing and list concatenation below need a real list (zip() yields tuples).
    tokens = list(tokens)

    sequences = []
    for curr_idx, token in enumerate(tokens):
        # Clamp the start at 0: a negative start would wrap around to the end
        # of the list and silently drop the real left context at position 1.
        left_context = tokens[max(0, curr_idx - left):curr_idx]
        left_context = ['<unk>'] * (left - len(left_context)) + left_context

        right_context = tokens[curr_idx + 1:curr_idx + 1 + right]
        right_context = right_context + ['<unk>'] * (right - len(right_context))

        window = left_context + [token] + right_context
        sequences.append([index.get(t, 0) for t in window])

    # int32, not int8: vocabulary ids go far beyond 127 and would overflow.
    # reshape keeps a 2-D (0, width) result for empty input.
    return np.array(sequences, dtype='int32').reshape(-1, left + 1 + right)

# Build the context-window index matrices for both splits; the token tuples
# are converted to lists before being passed to vectorize().
X_train = vectorize(list(train_tokens))
X_test = vectorize(list(test_tokens))

In [322]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Activation, Flatten, Dropout

from keras.layers.recurrent import LSTM

# Window of token ids -> embeddings -> LSTM summary vector -> hidden layer
# -> softmax over the POS tags. The layers are handed to the constructor as
# a list, which adds them in order.
model = Sequential([
    Embedding(input_dim=len(indexer), output_dim=150,
              input_length=nb_left + 1 + nb_right),
    # recurrent
    LSTM(100, return_sequences=False, activation='tanh'),
    Dense(300),
    Activation('relu'),
    Dense(len(tag_encoder.classes_)),
    Activation('softmax'),
])

In [323]:
# Configure training: RMSprop optimizer, categorical cross-entropy on the
# one-hot targets, and accuracy reported as an extra metric.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [324]:
# Train for 10 epochs with mini-batches of 10 and validate on the held-out
# split. The validation pair uses the same names as the later fit() call
# (X_test / Y_test_pos); `test_X` and `Y_test` were never defined.
model.fit(X_train, Y_train_pos, batch_size=10, nb_epoch=10,
          shuffle=True, validation_data=(X_test, Y_test_pos))

Train on 211727 samples, validate on 47377 samples
Epoch 1/10
  2000/211727 [..............................] - ETA: 566s - loss: 2.9672 - acc: 0.1780

KeyboardInterrupt: 

### DIY
Load the test data and evaluate

## The functional API

In [325]:
from keras.layers import Input
# Symbolic input for the context window: nb_left + 1 + nb_right int32 token
# ids per sample.
context_input = Input(shape=(nb_left + 1 + nb_right,), dtype='int32')

In [326]:
# Embed every id in the window as a 150-dimensional vector; the embedding
# table has one row per entry in `indexer`.
embedding = Embedding(input_dim=len(indexer), output_dim=150)(context_input)

In [327]:
# Two 100-unit LSTMs applied to the same embedded window. Both return only
# their final output (return_sequences=False); the second is configured with
# go_backwards=True.
left_to_right = LSTM(100, return_sequences=False, activation='tanh')(embedding)
right_to_left = LSTM(100, return_sequences=False, activation='tanh', go_backwards=True)(embedding)

In [328]:
from keras.layers import merge
# Combine the outputs of the two LSTMs using merge mode 'sum'.
merged = merge([left_to_right, right_to_left], mode='sum')

In [329]:
# Softmax classification layer with one unit per class known to tag_encoder.
output = Dense(len(tag_encoder.classes_), activation='softmax')(merged)

In [330]:
from keras.models import Model
# Wrap the graph from context_input to output in a Model.
# Note: this rebinds `model`, replacing the Sequential model built earlier.
model = Model(input=context_input, output=output)

In [331]:
# Print the layer-by-layer overview of the functional model.
model.summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
input_29 (InputLayer)              (None, 4)           0                                            
____________________________________________________________________________________________________
embedding_54 (Embedding)           (None, 4, 150)      2868450     input_29[0][0]                   
____________________________________________________________________________________________________
lstm_59 (LSTM)                     (None, 100)         100400      embedding_54[0][0]               
____________________________________________________________________________________________________
lstm_60 (LSTM)                     (None, 100)         100400      embedding_54[0][0]               
___________________________________________________________________________________________

In [332]:
# Same training configuration as for the Sequential model: RMSprop,
# categorical cross-entropy, accuracy as reported metric.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [333]:
# Train for 10 epochs with mini-batches of 10, validating on
# (X_test, Y_test_pos).
model.fit(X_train, Y_train_pos, batch_size=10, nb_epoch=10,
          shuffle=True, validation_data=(X_test, Y_test_pos))

Train on 211727 samples, validate on 47377 samples
Epoch 1/10
  4310/211727 [..............................] - ETA: 564s - loss: 2.6950 - acc: 0.2631

KeyboardInterrupt: 

## Multiple inputs and outputs

### Multiple inputs

In [336]:
# Vocabulary id of each training token on its own (the "focus" word);
# tokens missing from `indexer` fall back to id 0.
X_train_focus = [indexer.get(focus, 0) for focus in train_tokens]
print(len(X_train_focus))

211727


In [338]:
# Convert the focus ids to an int32 array (rebinds X_train_focus from a
# list to an ndarray) and show its shape.
X_train_focus = np.array(X_train_focus, dtype='int32')
print(X_train_focus.shape)

(211727,)


In [339]:
# Named inputs so that fit() can be fed a dict keyed by 'context' / 'focus'.
# The context width is derived from the window constants (as in the
# single-input model) instead of a hard-coded 4, so the two stay in sync.
context_input = Input(shape=(nb_left + 1 + nb_right,), dtype='int32', name='context')
# A single token id per sample.
focus_input = Input(shape=(1,), dtype='int32', name='focus')

In [340]:
# Two separate Embedding layer instances (one for the context window, one
# for the focus token), each mapping ids to 150-dimensional vectors.
context_embedding = Embedding(input_dim=len(indexer), output_dim=150)(context_input)
focus_embedding = Embedding(input_dim=len(indexer), output_dim=150)(focus_input)

In [341]:
# Two 100-unit LSTMs over the embedded context window (the second with
# go_backwards=True), each returning only its final output; the two outputs
# are combined with merge mode 'sum'.
left_to_right = LSTM(100, return_sequences=False, activation='tanh')(context_embedding)
right_to_left = LSTM(100, return_sequences=False, activation='tanh', go_backwards=True)(context_embedding)
merged1 = merge([left_to_right, right_to_left], mode='sum')

In [342]:
# Flatten the focus-token embedding.
# NOTE(review): despite its name, `flat_context` holds the flattened *focus*
# embedding, not the context representation.
flat_context = Flatten()(focus_embedding)

In [343]:
# Join the context representation and the flattened focus embedding using
# merge mode 'concat'.
merged2 = merge([merged1, flat_context], mode='concat')

In [344]:
# Softmax layer with one unit per class known to tag_encoder; the name 'pos'
# is the key used for its targets in the dict passed to fit().
pos_output = Dense(len(tag_encoder.classes_), activation='softmax', name='pos')(merged2)

In [345]:
from keras.models import Model
# Two-input, single-output model; rebinds `model` again.
model = Model(input=[context_input, focus_input], output=pos_output)

In [346]:
# Show the shapes of the two input arrays and the target array.
print(X_train.shape)
print(X_train_focus.shape)
print(Y_train_pos.shape)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# Inputs and targets are passed as dicts keyed by the `name=` given to the
# Input layers ('context', 'focus') and to the output layer ('pos').
model.fit({'context':X_train,
           'focus': X_train_focus},
          {'pos': Y_train_pos},
          batch_size=10, nb_epoch=10, shuffle=True)

(211727, 4)
(211727,)
(211727, 44)
Epoch 1/10
  3310/211727 [..............................] - ETA: 1042s - loss: 2.4935 - acc: 0.3906

KeyboardInterrupt: 

### Multiple outputs

In [347]:
# Second label set: map chunk label strings to integer class ids (encoder
# fitted on the training labels), mirroring the POS-tag encoding.
chunk_encoder = LabelEncoder()
chunk_encoder.fit(train_chunk)
print('Total nb chunk labels:', len(chunk_encoder.classes_))

y_train_chunk = chunk_encoder.transform(train_chunk)

# Integer ids -> one-hot rows for the categorical cross-entropy loss.
Y_train_chunk = np_utils.to_categorical(y_train_chunk,
                                  nb_classes=len(chunk_encoder.classes_))

Total nb chunk labels: 22


In [348]:
# --- inputs -----------------------------------------------------------------
# Context width is derived from the window constants rather than a
# hard-coded 4, so it stays in sync with the vectorised data.
context_input = Input(shape=(nb_left + 1 + nb_right,), dtype='int32', name='context')
focus_input = Input(shape=(1,), dtype='int32', name='focus')

# --- context branch: embedding -> two LSTMs (one with go_backwards=True),
# combined with merge mode 'sum' ----------------------------------------------
context_embedding = Embedding(input_dim=len(indexer), output_dim=150)(context_input)
left_to_right = LSTM(100, return_sequences=False, activation='tanh')(context_embedding)
right_to_left = LSTM(100, return_sequences=False, activation='tanh', go_backwards=True)(context_embedding)
merged1 = merge([left_to_right, right_to_left], mode='sum')

# --- focus branch: embedding of the single focus token, flattened ------------
focus_embedding = Embedding(input_dim=len(indexer), output_dim=150)(focus_input)
# NOTE(review): `flat_context` actually holds the flattened *focus* embedding;
# the name is kept so any later cell referring to it keeps working.
flat_context = Flatten()(focus_embedding)

# --- shared representation feeding both output heads -------------------------
merged2 = merge([merged1, flat_context], mode='concat')

# One softmax head per task; the names are the keys of the target dict
# passed to fit().
pos_output = Dense(len(tag_encoder.classes_), activation='softmax', name='pos')(merged2)
chunk_output = Dense(len(chunk_encoder.classes_), activation='softmax', name='chunk')(merged2)

model = Model(input=[context_input, focus_input], output=[pos_output, chunk_output])

In [349]:
# Show the shapes of the two input arrays and the two target arrays.
print(X_train.shape)
print(X_train_focus.shape)
print(Y_train_pos.shape)
print(Y_train_chunk.shape)


# A single loss string is given although the model has two outputs.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# Inputs and targets are dicts keyed by layer name; note batch_size=100 here
# versus 10 in the earlier fit() calls.
model.fit({'context': X_train,
           'focus': X_train_focus},
          {'pos': Y_train_pos,
           'chunk': Y_train_chunk},
          batch_size=100, nb_epoch=10, shuffle=True)

(211727, 4)
(211727,)
(211727, 44)
(211727, 22)
Epoch 1/10
  4290/211727 [..............................] - ETA: 948s - loss: 3.8640 - pos_loss: 2.4032 - chunk_loss: 1.4609 - pos_acc: 0.3786 - chunk_acc: 0.5343

KeyboardInterrupt: 

## Sequence to sequence learning