In [7]:
cd /Users/nbeshouri/Documents/Projects/Metis/Project\ 4

/Users/nbeshouri/Documents/Projects/Metis/Project 4


In [8]:
from hwmf import utils, models

In [32]:
from keras.layers import GRU, Input, Dense, TimeDistributed, RepeatVector, Bidirectional
from keras.models import Model
from keras.layers import Activation
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.layers.embeddings import Embedding

In [5]:
def get_seq2seq_models(english_vocab_size, french_vocab_size, cell_state_size=128, 
                       embedding_size = 64, num_layers=2, use_encoder_embeddings=True, 
                       use_decoder_embeddings=True):
    """
    Build and return three models used for sequence-to-sequence translation.
    
    These models roughly implement the system described by
    [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215).
    
    Args:
        english_vocab_size (int): The number of unique English words in the dataset
            including any metatokens like "<PAD>".
        french_vocab_size (int): The number of unique French words in the dataset 
            including any metatokens like "<PAD>".
        cell_state_size (int): The number of dimensions in each cell's state.
        embedding_size (int): The output size the embedding layers.
        num_layers (Optional[int]): The number of layers of GRU cells used by the
            encoder and decoder. Defaults to 2.
        use_encoder_embeddings (Optional[bool]): Whether or not use an embedding layer
            on the model's encoder. Defaults to True.
        use_decoder_embeddings (Optional[bool]): Whether or not use an embedding layer
            on the model's decoder. This makes the model train more slowly but
            dramatically increases validation accuracy.
    
    Returns: 
        A tuple of three Keras models which share parameters: 
            
            1. train_model: This model is used to train all the shared parameters. 
               It has two inputs:
               
                   1. x_encoder_train: This is just the training data, and 
                      should have shape `(num_sequences, input_sequence_len)` 
                      when using embeddings.
                   2. x_decoder_train: This is a version of `y_train` that has
                      been shifted one time-step to the right, and had the
                      "<START>" token inserted at the front. It should have
                      the same shape as `x_encoder_train`.
                      
               This model's output is only used by the optimizer—use the other
               two models for actual prediction.
            
            2. pred_encoder_model: This model is used during prediction to
               encode an English sentence for decoding. It takes a single
               input with shape `(num_sequences, input_sequence_length)`  
               assuming embeddings are used and returns a list of outputs, 
               one for each GRU layer, each with the shape `(num_sequences, 
               cell_state_size)`. 
            
            3. pred_decoder_model: This model is essentially a language model
               which, given the previously translated word and the current state 
               for each of its GRU layers, outputs the probabilities for the
               next word in the translation. Used this way, the first element in
               its input list should have shape `(num_sequences, 1)`, and should 
               be followed `num_layers` cell states, each with shape `(num_sequences, 
               cell_state_size)`. You could also feed in a whole translation and
               use the generated probabilities to calculate its total likelihood.     
    
    """
    # Setup encoder input.
    if use_encoder_embeddings:
        encoder_input = Input(shape=(None,), name='encoder_input')
        encoder_embedding_layer = Embedding(input_dim=english_vocab_size, 
            output_dim=embedding_size, mask_zero=True, name='encoder_embedding')
        embedded_encoder_input = encoder_embedding_layer(encoder_input)
    else:
        encoder_input = Input(shape=(None, english_vocab_size), name='encoder_input')
    encoder_gru_layers = [GRU(cell_state_size, return_sequences=True, return_state=True, 
        name=f'encoder_layer_{layer_num + 1}') for layer_num in range(num_layers - 1)]
    encoder_gru_layers.append(GRU(cell_state_size, return_state=True, 
                                  name=f'encoder_layer_{num_layers}'))
        
    # Setup decoder input.
    if use_decoder_embeddings:
        decoder_input = Input(shape=(None,), name='decoder_input')
        decoder_embedding_layer = Embedding(input_dim=french_vocab_size, 
            output_dim=embedding_size, mask_zero=True, name='decoder_embedding')
        embedded_decoder_input = decoder_embedding_layer(decoder_input)
    else:
        decoder_input = Input(shape=(None, french_vocab_size), name='decoder_input')
    
    decoder_gru_layers = [GRU(cell_state_size, return_sequences=True, return_state=True, 
        name=f'decoder_layer_{layer_num + 1}') for layer_num in range(num_layers)]
    
    # Setup final dense layer.
    dense = Dense(french_vocab_size, activation='softmax')
    dense_distrib = TimeDistributed(dense)
    
    # Connect encoder and decoder layers.
    temp_encoder_input = embedded_encoder_input if use_encoder_embeddings else encoder_input
    temp_decoder_input = embedded_decoder_input if use_decoder_embeddings else decoder_input
    encoder_states = []
    for encoder_layer, decoder_layer in zip(encoder_gru_layers, decoder_gru_layers):
        temp_encoder_output, temp_encoder_state = encoder_layer(temp_encoder_input)
        temp_encoder_input = temp_encoder_output
        temp_decoder_output, temp_decoder_state = decoder_layer(temp_decoder_input, 
            initial_state=temp_encoder_state)
        temp_decoder_input = temp_decoder_output
        encoder_states.append(temp_encoder_state)
    
    # Build the training model.
    output = dense_distrib(temp_decoder_output)
    train_model = Model([encoder_input, decoder_input], output)
    train_model.compile(loss=sparse_categorical_crossentropy,
                        optimizer='adam', metrics=['accuracy'])
    
    # Build prediction encoder model.
    pred_encoder_model = Model(encoder_input, encoder_states)
    
    
    # Build prediction decoder model.
    pred_decoder_state_inputs = [Input(shape=(cell_state_size,), 
        name=f'decoder_layer_{layer_num + 1}_state_input') for layer_num in range(num_layers)]
    
    pred_decoder_states = []
    temp_pred_decoder_input = embedded_decoder_input if use_decoder_embeddings else decoder_input   
    for decoder_layer, decoder_state_input in zip(decoder_gru_layers, pred_decoder_state_inputs):
        temp_pred_decoder_output, temp_pred_decoder_state = decoder_layer(
            temp_pred_decoder_input, initial_state=decoder_state_input)
        temp_pred_decoder_input = temp_pred_decoder_output
        pred_decoder_states.append(temp_pred_decoder_state)
    
    pred_word_probs = dense(temp_pred_decoder_output)

    pred_decoder_model = Model([decoder_input, *pred_decoder_state_inputs], 
                               [pred_word_probs, *pred_decoder_states])
                               
    return train_model, pred_encoder_model, pred_decoder_model

In [6]:
train_model, pred_encoder_model, pred_decoder_model = get_seq2seq_models(200, 200)

In [10]:
big_string = """Due to his banishment, he is hot-tempered, impatient and at times also depressive. He is shown to be caring and thoughtful, though his judgement was heavily clouded by his jealousy towards Azula. He has a close bond with his mother Ursa and his Uncle Iroh. He strives to regain honor and attention in his father's eyes by trying to capture the Avatar in his name. But after returning to the Fire Nation from his years of banishment, he realized his father's ill feelings towards him couldn't be fixed even by capturing the Avatar. Zuko decided that trying to regain honour, as a substitute for love, was worthless and a waste of time. He displayed aggression towards Aang when training him in fire-bending, because of the fact that it was imperitive Aang learn it before Sozin's Comet arrived at the end of the summer, though he was still being caring. He is shown to have a sarcastic side on several occasions. Due to his scar, he also has an inferiority complex about his appearance, stating angrily that: "Normal teenagers worry about bad skin! I don't have that luxury! My father decided to teach me a permanent lesson -- on my face!"""

In [12]:
texts = [big_string]
word_to_vec, word_to_id, embedding_matrix = models.get_embeddings(texts)

In [15]:
X = models.get_nn_X(texts, word_to_id)
y = X

In [19]:
train_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 64)     6400        encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, None, 64)     6400        decoder_input[0][0]              
__________________________________________________________________________________________________
encoder_la

In [23]:
X

array([[ 14,  53,  91,  32,  21,  17,  36,  10,  85,  90,  82, 100,  23,
         97,  17,  72,  53,  61, 121,  85,   1,  67,  91,  30,  15, 116,
         75,  59,  91,  87]])

In [25]:
X2 = np.array([[0, 14,  53,  91,  32,  21,  17,  36,  10,  85,  90,  82, 100,  23,
         97,  17,  72,  53,  61, 121,  85,   1,  67,  91,  30,  15, 116,
         75,  59,  91]])

In [27]:
train_model.fit([X, X2], X, batch_size=1, epochs=100)

ValueError: Error when checking target: expected time_distributed_1 to have 3 dimensions, but got array with shape (1, 30)

In [36]:
max_sequence_length = 30
recur_size = 128

In [37]:
inputs = Input(shape=(max_sequence_length,))
embeddings = Embedding(embedding_matrix.shape[0],
                       embedding_matrix.shape[1],
                       weights=[embedding_matrix],
                       trainable=True)(inputs)

encoded = Bidirectional(GRU(recur_size))(embeddings)

RepeatVector(max_sequence_length)(encoded)

decoded = Bidirectional(GRU(recur_size, return_sequences=True))(embeddings)

outputs = TimeDistributed(Dense(128))(decoded)

# outputs = Dense(num_classes, activation='softmax')(dense)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 30, 200)           25800     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 30, 256)           252672    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 30, 128)           32896     
Total params: 311,368
Trainable params: 311,368
Non-trainable params: 0
_________________________________________________________________
None
