In [39]:
import re

import dask.dataframe as dd
import numpy as np
import pandas as pd

from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU

raw = pd.read_table('/Users/minhmai/Downloads/train.txt', delimiter=' ', header=None)
raw.columns = ['word', 'pos', 'wsj']


def refactor_pos(x):
    # helps deal with imbalances between classes
    if x == '.':
        return 'stopper'
    elif x.startswith('V') or x.startswith('RB') or x == 'JJ':
        return 'verb/adverb/adjective'
    elif x in ['NS', 'NNS', 'NN']:
        return 'noun'
    elif x in ['IN', 'TO', 'DT']:
        return 'preposition'
    else:
        return 'other'

def preprocess_dataframe(data):
    regex = re.compile(r'[^.a-z0-9]')
    data['is_symbol'] = data.pos.apply(lambda x: True if regex.match(x) else False)
    data = data[((data.is_symbol == True) | (data.pos == '.'))]

    # refactor pos tagging
    data['pos'] = data.pos.apply(lambda x: refactor_pos(x))
    data = data[['word', 'pos']]
    data['word'] = data.word.apply(lambda x: x.lower())
    corpus = {k: v for k, v in zip(data['word'].unique(), range(data['word'].nunique()))}
    pos_corpus = {k: v for k, v in zip(data['pos'].unique(), range(1, data['pos'].nunique())) if v != 'stopper'}
    pos_corpus['stopper'] = 9
    data['word'] = data.word.apply(lambda x: corpus[x])
    data['pos'] = data.pos.apply(lambda x: pos_corpus[x])
    del pos_corpus['stopper']
    return corpus, pos_corpus, data

def determine_max_length(data, idx):
    array = [0] + idx + [data.shape[0]]
    start = 0
    max_num = 0
    for i in array:
        val = i - start
        if val > max_num:
            max_num = val
        start = i
    return max_num

def create_sentence_vectors(data):
    idx = data.loc[data['pos'] == 9, :].index.values.tolist()
    words = np.empty([122, ])
    pos = np.empty([122, ])
    start = 0
    max_length = determine_max_length(data, idx)
    for i in idx:
        _words = data.loc[start:(i-1), 'word'].values
        _pos = data.loc[start:(i-1), 'pos'].values
        start = i + 1
        words = np.vstack((words, np.hstack((_words, np.zeros(max_length - len(_words))))))
        pos = np.vstack((pos, np.hstack((_pos, np.zeros(max_length - len(_pos))))))
    return words, pos, idx

In [40]:
corpus, pos_corpus, data = preprocess_dataframe(raw)
words, pos, idx = create_sentence_vectors(data)
max_length = determine_max_length(data, idx)
num_outputs = len(pos_corpus)
num_inputs = len(corpus)


encoder_input_data = np.zeros(
    (len(words), max_length, num_inputs) + 1),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(pos), max_length, num_outputs + 1),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(pos), max_length, num_outputs + 1),
    dtype='float32')

try:
    for i, (word, tag) in enumerate(zip(words, pos)):
        for t, char in enumerate(word):
            encoder_input_data[i, t, int(char)] = 1.
        for t, char in enumerate(tag):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, int(char)] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, int(char)] = 1.
except:
    import pdb; pdb.post_mortem()
    
    

In [None]:
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.

encoder_inputs = Input(shape=(max_length, num_inputs + 1))
encoder = GRU(latent_dim, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)

decoder_inputs = Input(shape=(max_length, num_outputs + 1))
decoder_gru = GRU(latent_dim, return_sequences=True)
decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
decoder_dense = Dense(num_outputs + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)




Train on 7062 samples, validate on 1766 samples
Epoch 1/10

In [48]:
model.summary()




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 17258)  0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 4)      0                                            
__________________________________________________________________________________________________
gru_3 (GRU)                     [(None, 256), (None, 13451520    input_5[0][0]                    
__________________________________________________________________________________________________
gru_4 (GRU)                     (None, None, 256)    200448      input_6[0][0]                    
                                                                 gru_3[0][1]                      
__________

In [34]:
len(corpus)




17258

In [55]:
num_tokens

122

In [66]:
batch_size = 64  # Batch size for training.
epochs = 5  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
max_length = determine_max_length(data)

encoder_inputs = Input(shape=(None, num_tokens))
encoder = GRU(latent_dim, return_state=True)
encoder_outputs, state_h = encoder(pos)

decoder_inputs = Input(shape=(None, num_tokens))
decoder_gru = GRU(latent_dim, return_sequences=True)
decoder_outputs = decoder_gru(pos, initial_state=state_h)
decoder_dense = Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(pos)
model = Model([words, pos], pos)

# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# model.fit(
#     words,
#     pos,
#     batch_size=batch_size,
#     epochs=epochs,
#     validation_split=0.2
# )

ValueError: Layer gru_12 was called with an input that isn't a symbolic tensor. Received type: <class 'numpy.ndarray'>. Full input: [array([[1., 2., 2., ..., 0., 0., 0.],
       [1., 2., 2., ..., 0., 0., 0.],
       [4., 2., 2., ..., 0., 0., 0.],
       ...,
       [4., 4., 4., ..., 0., 0., 0.],
       [2., 4., 4., ..., 0., 0., 0.],
       [4., 3., 3., ..., 0., 0., 0.]])]. All inputs to the layer should be tensors.

In [71]:
import tensorflow as tf

encoder(tf.convert_to_tensor(words))

ValueError: Input 0 is incompatible with layer gru_12: expected ndim=3, found ndim=2

In [46]:
pos[3]

array([4., 1., 3., 3., 1., 2., 1., 3., 3., 3., 2., 2., 1., 4., 1., 2., 3.,
       2., 3., 1., 1., 2., 4., 4., 4., 1., 3., 4., 9., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])