In [None]:
import numpy as np
import tensorflow as tf
from preprocess import play_midi, parse_file,get_data_from_dir


Special tokens that are registered first in the dictionary: padding (`<PAD>`), start of sequence (`<GO>`) and end of sequence (`<END>`).

In [None]:
# Reserved tokens. Seq2Seq_Dictionary registers them before any corpus word,
# so their positions in this list are also their dictionary indices.
special_words = [
    "<PAD>",  # index 0
    "<GO>",   # index 1
    "<END>",  # index 2
]

## Class For Dictionary
The class contains the data and the helper functions of the dictionary used to train the seq2seq model.
Its most important functions map a list of words (a sentence) to the list of corresponding integers (the indices of the words).

In [None]:
class Seq2Seq_Dictionary:
    """Vocabulary that maps words to integer indices (and back) for a seq2seq model.

    Attributes:
        word2index_map: dict mapping each known word to its integer index.
        index2word_map: list whose i-th entry is the word stored at index i.
        max_length: number of words in the longest sentence registered so far.
    """

    def __init__(self, sentences, special_tokens=None):
        """Build the vocabulary from `sentences`.

        Args:
            sentences: iterable of sentences; each sentence is a sized iterable
                of hashable words. One-shot iterators are fine because the
                corpus is traversed only once.
            special_tokens: optional sequence of tokens registered before any
                corpus word. Defaults to the module-level `special_words`.
        """
        self.word2index_map = dict()
        self.index2word_map = list()
        self.max_length = 0
        self.init_register(sentences, special_tokens)

    def _register(self, word):
        """Give `word` the next free index unless it is already known."""
        if word not in self.word2index_map:
            # The next free index is always the current vocabulary size, which
            # keeps both maps consistent even across repeated registrations.
            self.word2index_map[word] = len(self.index2word_map)
            self.index2word_map.append(word)

    def init_register(self, sentences, special_tokens=None):
        """Register the special tokens and every word of `sentences`.

        Also records the length of the longest sentence in `self.max_length`.
        The corpus is walked exactly once, so generators are not exhausted
        before the vocabulary is built, and an empty corpus yields a
        `max_length` of 0 instead of raising. Calling the method again extends
        the existing vocabulary rather than restarting the numbering.

        Args:
            sentences: iterable of sentences (sized iterables of words).
            special_tokens: tokens to register first; defaults to the
                module-level `special_words`.
        """
        if special_tokens is None:
            special_tokens = special_words

        # Special tokens come first so that they receive the lowest indices.
        for token in special_tokens:
            self._register(token)

        longest = 0
        for sentence in sentences:
            longest = max(longest, len(sentence))
            for word in sentence:
                self._register(word)

        self.max_length = max(self.max_length, longest)

    def get_index(self, word):
        """Return the index of `word`.

        Raises:
            KeyError: if `word` was never registered.
        """
        return self.word2index_map[word]

    def map_sentence(self, sentence):
        """Map a sentence (iterable of words) to the list of their indices.

        Raises:
            KeyError: if any word was never registered.
        """
        return [self.get_index(word) for word in sentence]

    def get_word(self, index):
        """Return the word stored at `index`.

        Raises:
            IndexError: if `index` is outside the vocabulary.
        """
        return self.index2word_map[index]

    def decode_indeces(self, sentence_with_index):
        """Inverse of `map_sentence`: map a list of indices back to words."""
        return [self.get_word(index) for index in sentence_with_index]

#### Create the dictionary

In [None]:
# Placeholder corpus: one sentence of three words, two of them identical.
data_dict = Seq2Seq_Dictionary(
    [
        ["asdad", "asdad", "asdada"],
    ]
)

## Model Creation
Tensorflow initialization

In [None]:
# Close a session left over from an earlier run of this notebook, if there is one.
# On a fresh kernel `sess` has not been created yet, and an unguarded
# sess.close() would raise NameError and stop "Restart & Run All" at this cell.
try:
    sess.close()
except NameError:
    pass

# Drop every op and variable of the previous default graph.
tf.reset_default_graph()
#sess = tf.InteractiveSession()


### General Model Variables 

In [None]:
# Indices of the special tokens; they follow the order of `special_words`.
PAD, START, END = 0, 1, 2

# Sizes taken from the dictionary built above.
vocab_size = len(data_dict.index2word_map)
max_seq_length = data_dict.max_length

# Network and training hyper-parameters.
embedding_size = 256
hidden_units = 128
keep_prob = 0.5  # keep probability handed to the dropout wrappers
batch_size = 64
learning_rate = 5e-4

#### Encoder Inputs

In [None]:
# Word indices of the source sentences, one padded row per batch item.
_encoder_inputs = tf.placeholder(shape=(batch_size, max_seq_length),
                                 dtype=tf.int32, name='encoder_inputs')
# True (unpadded) length of every source sentence. The shape is written as a
# 1-tuple: `(batch_size)` without the trailing comma is just an int.
_encoder_seq_len = tf.placeholder(shape=(batch_size,),
                                  dtype=tf.int32, name='encoder_seq_lens')

Encoder part is created here. In the architecture, a bidirectional GRU cell is used after the embedding.

In [None]:
# Resets the default graph, which discards every op created before this point
# (including the placeholders of the previous cell), and then re-creates the
# encoder placeholders in the new graph.
tf.reset_default_graph()
_encoder_inputs = tf.placeholder(shape=(batch_size, max_seq_length),
                                 dtype=tf.int32, name='encoder_inputs')
# Shape written as a 1-tuple; `(batch_size)` without the comma is a bare int.
_encoder_seq_len = tf.placeholder(shape=(batch_size,),
                                  dtype=tf.int32, name='encoder_seq_lens')
### remove before here
with tf.variable_scope("encoder") as encoder_sc:
    ## Embedding matrix of shape (vocab_size, embedding_size), drawn uniformly
    ## from [-1, 1).
    enc_embed_var = tf.Variable(
        tf.random_uniform([vocab_size, embedding_size],
                          minval=-1.0, maxval=1.0),
        name='embedding')

    ## (batch_size, max_seq_length) indices
    ##   -> (batch_size, max_seq_length, embedding_size) vectors
    enc_embed = tf.nn.embedding_lookup(enc_embed_var, _encoder_inputs)

    # Forward direction cell
    enc_gru_fw = tf.nn.rnn_cell.GRUCell(hidden_units)
    # Backward direction cell
    enc_gru_bw = tf.nn.rnn_cell.GRUCell(hidden_units)

    ## NOTE(review): keep_prob is a plain Python constant, so dropout is applied
    ## every time this graph runs, not only during training.
    enc_dropout_fw = tf.contrib.rnn.DropoutWrapper(enc_gru_fw,
                                                   input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    enc_dropout_bw = tf.contrib.rnn.DropoutWrapper(enc_gru_bw,
                                                   input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    ## enc_rnn_outputs holds the per-step outputs of both directions;
    ## enc_rnn_state holds only the final state of each direction.
    enc_rnn_outputs, enc_rnn_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=enc_dropout_fw,
        cell_bw=enc_dropout_bw,
        inputs=enc_embed,
        sequence_length=_encoder_seq_len,
        dtype=tf.float32)

    ## Split the (forward, backward) pairs.
    enc_rnn_outputs_fw, enc_rnn_outputs_bw = enc_rnn_outputs
    enc_rnn_fw_state, enc_rnn_bw_state = enc_rnn_state

    ## Concatenate backward and forward parts along the feature axis:
    ## enc_last_state -> (batch_size, 2*hidden_units)
    ## enc_output     -> (batch_size, max_seq_length, 2*hidden_units)
    enc_last_state = tf.concat([enc_rnn_bw_state, enc_rnn_fw_state], axis=1)
    enc_output = tf.concat([enc_rnn_outputs_bw, enc_rnn_outputs_fw], axis=2)

The decoder part is created here. Because a bidirectional GRU is used in the encoder, the concatenated state vector is twice the size of the state of a single GRU cell with the same number of hidden units. So, after concatenating the last states of the two GRUs, the number of hidden units of the decoder GRU has to be doubled.

In [None]:
# Static shapes of the concatenated encoder state and of the encoder outputs.
print(enc_last_state.shape)
print(enc_output.shape)


### Decoder Inputs

In [None]:
# Concatenated final encoder state, fed to the decoder from outside the graph.
_enc_last_state = tf.placeholder(shape=(batch_size, 2*hidden_units),
                                 dtype=tf.float32, name='decoder_input_enc_last_state')
# Per-step encoder outputs. This placeholder used to reuse the name of
# _enc_last_state (copy-paste); it now has a name of its own.
_enc_output = tf.placeholder(shape=(batch_size, max_seq_length, 2*hidden_units),
                             dtype=tf.float32, name='decoder_input_enc_output')
# One target-side word index per batch item (a single decoding step).
# Shape written as a 1-tuple; `(batch_size)` without the comma is a bare int.
_decoder_inputs = tf.placeholder(shape=(batch_size,),
                                 dtype=tf.int32, name='decoder_inputs')

In [None]:
### NOTE(review): this reset discards the graph that holds the encoder built
### earlier in the notebook; from here on only the decoder ops exist in the
### default graph, and the encoder results have to be fed through the
### placeholders below. Confirm that this separation is intended.
tf.reset_default_graph()
_enc_last_state = tf.placeholder(shape=(batch_size, 2*hidden_units),
                                 dtype=tf.float32, name='decoder_input_enc_last_state')
### This placeholder used to reuse the name of _enc_last_state (copy-paste).
_enc_output = tf.placeholder(shape=(batch_size, max_seq_length, 2*hidden_units),
                             dtype=tf.float32, name='decoder_input_enc_output')
### Shape written as a 1-tuple; `(batch_size)` without the comma is a bare int.
_decoder_inputs = tf.placeholder(shape=(batch_size,),
                                 dtype=tf.int32, name='decoder_inputs')
with tf.variable_scope("decoder") as decoder_sc:
    ## Luong's multiplicative score --> score = hidden_state.T * W * _enc_output

    ### First the W*_enc_output part is handled with a dense layer. Its output
    ### size is 2*hidden_units because the encoder is bidirectional, so the
    ### result has shape (batch_size, max_seq_length, 2*hidden_units). It is
    ### later multiplied with the hidden state to obtain the score.
    w_times_enc_output = tf.layers.dense(_enc_output, hidden_units*2)
    print("shape of w_times_enc_output:",w_times_enc_output.get_shape())

    ### _enc_last_state has shape (batch_size, 2*hidden_units): one hidden state
    ### vector per batch item. Expanding it in the last dimension to
    ### (batch_size, 2*hidden_units, 1) gives the column vector that the
    ### formula multiplies with.
    enc_last_state_tr = tf.expand_dims(_enc_last_state,2)
    print("shape of enc_last_state_tr:",enc_last_state_tr.get_shape())

    ### w_times_enc_output = (batch_size, max_seq_length, 2*hidden_units)
    ### enc_last_state_tr  = (batch_size, 2*hidden_units, 1)
    ### resulting score    = (batch_size, max_seq_length, 1)
    score =  tf.matmul(w_times_enc_output,enc_last_state_tr)
    print("shape of score:",score.get_shape())

    ### There is now one score per input word of every batch item. The softmax
    ### normalises them over the words of a sentence, i.e. along axis 1 (axis 0
    ### is the batch). attention_w keeps the shape (batch_size, max_seq_length, 1).
    attention_w = tf.nn.softmax(score,1)

    ### attention_w (batch_size, max_seq_length, 1) is broadcast against
    ### _enc_output (batch_size, max_seq_length, 2*hidden_units): every encoder
    ### output is scaled by the attention weight of its word.
    context_vec = attention_w * _enc_output

    ### Summing over the word axis leaves one context vector per batch item,
    ### shape (batch_size, 2*hidden_units).
    context_vec = tf.reduce_sum(context_vec, axis=1)
    print("shape of context_vec:",context_vec.get_shape())

    ### The decoder input (target words) gets its own embedding matrix.
    dec_embed_var = tf.Variable(
        tf.random_uniform([vocab_size,
                           embedding_size],
                          -1.0, 1.0), name='decoder_embedding')

    ### Size of the embedded input -> (batch_size, 1, embedding_size)
    dec_embed = tf.nn.embedding_lookup(dec_embed_var, tf.expand_dims(_decoder_inputs,1))

    print("shape of the decoder embedding:",dec_embed.get_shape())

    ### Give the context vector the same time axis as the embedded input:
    ### (batch_size, 1, 2*hidden_units)
    context_vec = tf.expand_dims(context_vec, 1)

    ### Concatenate along the feature axis, so the resulting size is
    ### (batch_size, 1, 2*hidden_units + embedding_size)
    dec_before_gru = tf.concat([context_vec, dec_embed], axis=2)

    ### The decoder is fed one word at a time, so the sequence length of a
    ### batch item is 1 when its current input is a real word and 0 when it is
    ### <PAD>.
    ### The comparison must be tf.equal: in TensorFlow 1.x `tensor == value` is
    ### a Python identity check that yields a single bool, not an elementwise
    ### mask, so <PAD> inputs were never detected.
    all_pads = [data_dict.get_index("<PAD>")]*batch_size
    ones = np.ones((batch_size))
    zeros = np.zeros((batch_size))
    dec_seq_len = tf.cast(tf.where(tf.equal(_decoder_inputs, all_pads), zeros, ones),dtype=tf.float32)

    ### Now the input is ready for the GRU. It has 2*hidden_units units so that
    ### the concatenated encoder state can be used as its initial state.
    dec_gru = tf.nn.rnn_cell.GRUCell(2*hidden_units)

    dec_dropout = tf.contrib.rnn.DropoutWrapper(dec_gru, input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    ### dec_rnn_outputs has shape (batch_size, 1, 2*hidden_units)
    ### dec_rnn_state has shape (batch_size, 2*hidden_units)
    ### dec_seq_len stays float32 because it is reused as a loss mask; the RNN
    ### gets an explicit integer copy.
    dec_rnn_outputs,dec_rnn_state=tf.nn.dynamic_rnn(cell=dec_dropout, inputs=dec_before_gru,
                                                    initial_state=_enc_last_state,
                                                    sequence_length=tf.cast(dec_seq_len, tf.int32))
    ### Drop the length-1 time axis to get (batch_size, 2*hidden_units) for the
    ### prediction layer. The axis is named explicitly so that the batch axis
    ### survives even when batch_size is 1.
    dec_rnn_outputs = tf.squeeze(dec_rnn_outputs, axis=1)

In [None]:
with tf.variable_scope("pred_layer") as pred_layer_sc:
    ### Dense projection of every decoder output onto the vocabulary. preds has
    ### one row per batch item and one column per vocabulary entry, i.e. one
    ### unnormalised score per candidate next word.
    preds = tf.layers.dense(inputs=dec_rnn_outputs, units=vocab_size)

In [None]:
# Static shape of the decoder output that feeds the prediction layer.
print(dec_rnn_outputs.shape)

### Optimizer and the Loss function

In [None]:
### Target distribution over the vocabulary, one row per batch item.
_targets = tf.placeholder(tf.int32, shape=(batch_size, vocab_size), name='Targets')

### Cross entropy per batch item, scaled by dec_seq_len, which is meant to be 0
### for items whose current input is <PAD> so that they add nothing to the loss.
cross_ent = tf.nn.softmax_cross_entropy_with_logits_v2(labels=_targets, logits=preds) * dec_seq_len

### The batch loss is the mean of the per-item cross entropies.
loss = tf.reduce_mean(cross_ent)

optimizer = tf.train.AdamOptimizer(learning_rate)

