In [9]:
import numpy as np
import tensorflow as tf
from preprocess import play_midi, parse_file,get_data_from_dir


Contains special words for the dictionary

In [10]:
special_words = ["<PAD>", "<GO>", "<END>"]

## Class For Dictionary
The class contains the data and the helper functions for the dictionary to train seq2seq model.
Most important functions are to mapping a list of words(a sentence) to a list of corresponding integer(indeces of words). 

In [11]:
class Seq2Seq_Dictionary:
    def __init__(self, sentences ):
        self.word2index_map = dict()
        self.index2word_map = dict()
        self.vocab_size = 0
        self.init_register(sentences)
        
    # Initiates word2index_map and index2word_map
    # Also extracts the max number of words in the sentences and saves it 
    def init_register(self,sentences):
        global special_words
        current_index = 0
        ## save the maximum length among the sentences. 
        self.max_length = max([len(sentence) for sentence in sentences])+2
        ### map special words, initially the mappings are empty.
        for word in special_words:
            self.word2index_map[word] = current_index
            self.index2word_map[current_index] = word
            current_index+=1
        
        s = set([item for sublist in sentences for item in sublist])
        self.word2index_map.update({e:i+current_index for i,e in enumerate(s)})
        self.index2word_map.update({v:k for k,v in self.word2index_map.items()})
        self.vocab_size = len(self.index2word_map)
    
    ## Returns the index of the word in the dictionary. It is assumed that the word
    ## will be always in dictionary.
    def get_index(self, word):
        return self.word2index_map[word]
    
    ## Maps a sentence, which is a list of words, to the corresponding list of integers.
    ## Each word is looked up from the map of the dictionary, and as in get_index method,
    ## it is assumed that the word will always be found in the dictionary
    def map_sentence(self, sentence):
        return [self.get_index(i) for i in sentence if i in self.word2index_map]
    
    ## Returns the word by its index in dictionary.
    def get_word(self, index):
        return self.index2word_map[word]
    
    ## Pads the list of words to <PAD> at the end of the list of words in sentence
    def pad_sentence(self, sentence):
        return ["<PAD>"] * (self.max_length - len(sentence)-2)+sentence 
    
    ## Adds <GO> and <END> to the start and end of the sentence
    def add_start_end_tokens(self, sentence):
        return ["<GO>"] + sentence + ["<END>"]
    
    ## Transforms the sentence in a format suitable for the Neural Network
    def transform_sentence(self, sentence):
        s = self.pad_sentence(sentence)
        s = self.add_start_end_tokens(s)
        s = self.map_sentence(s)
        return s
    
    ### Reverse operation of map_sentence. Gets the sentence, list of integers, as input and
    ### maps each entry from index to word
    #def decode_indeces(self,sentence_with_index):
    #    sentence_list = list()

#### Create the dictionary

In [12]:
data, targets = get_data_from_dir("test_midi_small/")
data_dict = Seq2Seq_Dictionary(data)
target_dict = Seq2Seq_Dictionary(targets)

For file test_midi_small/All That She Wants.mid extracted:
Text: [b'She ', b'leads ', b'a ', b'lone', b'ly ', b'life ', b'\r', b'Ooh ', b'She ', b'leads ', b'a ', b'lone', b'ly ', b'life ', b'\r', b'When ', b'she ', b'woke ', b'up ', b'late ', b'in ', b'the ', b'mor', b'ning ', b'\r', b'Light ', b'and ', b'the ', b'day ', b'had ', b'just ', b'be', b'gu', b'n ', b'\r', b'She ', b'o', b'pened ', b'up ', b'her ', b'eyes ', b'and_thought ', b'\r', b"O' ", b'what ', b'a ', b'morn', b'ing ', b'\r', b"It's ", b'not ', b'a ', b'day ', b'for ', b'wor', b'k ', b'\r', b"It's ", b'a ', b'day ', b'for ', b'cat', b'ching ', b'tan ', b'\r', b'Just ', b'la', b'ying ', b'on ', b'the ', b'beach ', b'and ', b'ha', b'ving ', b'fun ', b'\r', b"She's ", b'go', b'ing ', b'to ', b'get_you ', b'\r', b'All ', b'that ', b'she ', b'wants ', b'is ', b'a', b'no', b'ther ', b'ba', b'by ', b'\r', b"She's ", b'gone ', b'to', b'mo', b'rrow ', b'boy ', b'\r', b'All ', b'that ', b'she ', b'wants ', b'is ', b'a', b'no', b

Text: [b'I ', b'try ', b'and ', b'feel ', b'the ', b'sun', b'shine ', b'\r', b'You ', b'bring ', b'the ', b'rain ', b'\r', b'You ', b'try ', b'and ', b'hold ', b'me ', b'down ', b'\r', b'With ', b'your ', b'com', b'plaints ', b'\r', b'You ', b'cry ', b'and ', b'moan ', b'and ', b'com', b'plain ', b'\r', b'You ', b'whine ', b'an ', b'tear ', b'\r', b'Up ', b'to ', b'my ', b'neck ', b'in ', b'sor', b'row ', b'\r', b'The ', b'touch ', b'you ', b'bring ', b'\r', b'You ', b'just ', b"don't ", b'step ', b'in', b'side ', b'to ', b'\r', b'(to) ', b'Four', b'teen ', b'ye', b'ars ', b'\r', b'So ', b'hard ', b'to ', b'keep ', b'my ', b'o', b'wn ', b'head... ', b'\r', b"That's ", b'what ', b'I ', b'say ', b'\r', b'You ', b'kno', b'w... ', b"I've ", b'been ', b'the ', b'beg', b'gar... ', b'\r', b"I've ", b'played ', b'the ', b'thief ', b'\r', b'I ', b'was ', b'the ', b'dog... ', b'they ', b'all ', b'tried ', b'to ', b'beat ', b'\r', b'But ', b"it's ", b'bee', b'n ', b'\r', b'Four', b'teen ', b'year

In [13]:
## Transform the data and the targets
transformed_data = [data_dict.transform_sentence(i) for i in data]
transformed_targets = [target_dict.transform_sentence(i) for i in targets]

## Sanity check for the lengths of the data and the targets
assert np.mean([len(x) for x in transformed_data])==data_dict.max_length
assert np.mean([len(x) for x in transformed_targets])==target_dict.max_length
assert len(transformed_data) == len(transformed_targets)

len(data_dict.index2word_map)

274

In [14]:
print(f"Size of data vocabulary: {data_dict.vocab_size}")
print(f"Size of targets vocabulary: {target_dict.vocab_size}")

print(f"Max. Length of the data: {data_dict.max_length}")
print(f"Max. Length of the target: {target_dict.max_length}")
print(f"Sample data: {data[2]}")
print(f"Corresponding targets: {targets[5]}")
print(f"Sample transformed data: {transformed_data[2]}")
print(f"Corresponding transformed targets: {transformed_targets[2]}")

Size of data vocabulary: 274
Size of targets vocabulary: 107
Max. Length of the data: 403
Max. Length of the target: 2925


IndexError: list index out of range

## Model Creation
Tensorflow initialization

In [None]:
sess.close()
tf.reset_default_graph()
#sess = tf.InteractiveSession()


### General Model Variables 

In [None]:
PAD = 0
START = 1
END = 2

embedding_size= 256
hidden_units = 128
keep_prob=0.5 # Dropout parameter
batch_size = 64
max_seq_length = data_dict.max_length
vocab_size = len(data_dict.index2word_map)
learning_rate = 5e-4

#### Encoder Inputs

In [None]:
_encoder_inputs = tf.placeholder(shape=(batch_size, max_seq_length),
                                 dtype=tf.int32, name='encoder_inputs')
_encoder_seq_len = tf.placeholder(shape=(batch_size),
                                 dtype=tf.int32, name='encoder_seq_lens')

Encoder part is created here. In the architecture, a bidirectional GRU cell is used after the embedding.

In [None]:
tf.reset_default_graph()
_encoder_inputs = tf.placeholder(shape=(batch_size, max_seq_length),
                                 dtype=tf.int32, name='encoder_inputs')
_encoder_seq_len = tf.placeholder(shape=(batch_size),
                                 dtype=tf.int32, name='encoder_seq_lens')
_decoder_inputs = tf.placeholder(shape=(batch_size,max_seq_length),
                                 dtype=tf.int32, name='decoder_inputs')
### remove before here
with tf.variable_scope("encoder") as encoder_sc:
    ## embeddings
    enc_embed_var = tf.Variable(
        tf.random_uniform([vocab_size,
                           embedding_size],
                          -1.0, 1.0), name='embedding')
    
    enc_embed = tf.nn.embedding_lookup(enc_embed_var, _encoder_inputs)
    
    # Forward direction cell
    enc_gru_fw = tf.nn.rnn_cell.GRUCell(hidden_units)
    # Backward direction cell
    enc_gru_bw = tf.nn.rnn_cell.GRUCell(hidden_units)
    
    enc_dropout_fw = tf.contrib.rnn.DropoutWrapper(enc_gru_fw, input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    enc_dropout_bw = tf.contrib.rnn.DropoutWrapper(enc_gru_bw, input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    
    ## here the state variable contains only the last state information of the cells
    enc_rnn_outputs,enc_rnn_state=tf.nn.bidirectional_dynamic_rnn(enc_dropout_fw,
                                                          enc_dropout_bw, 
                                                          enc_embed,
                                                          sequence_length=_encoder_seq_len,
                                                          dtype=tf.float32)
    ## Get forward and backward last states and outputs of the GRU
    enc_rnn_outputs_fw,enc_rnn_outputs_bw  = enc_rnn_outputs
    enc_rnn_fw_state,enc_rnn_bw_state  = enc_rnn_state
    
    ## concat states and outputs
    _enc_last_state = tf.concat((enc_rnn_bw_state, enc_rnn_fw_state),1)
    _enc_output = tf.concat((enc_rnn_outputs_bw,enc_rnn_outputs_fw),2)

In [None]:
print(_enc_last_state.get_shape())
print(_enc_output.get_shape())


Decoder part is created here. Because bidirectional GRU  is used in the encoder part the state vector is twice size of an GRU cell with same number of hidden units. So, after concatanating the last states of GRUs, here the hidden units of GRU should be doubled.

### Decoder  With While Loop
We are using a while loop structure because each resulting hidden state of the GRU in the decoder, will be an input to the network to calculate scores of the next word in the same sentence.

Following is the condition for the while loop. From the first word at each sentence, iteration should go until the last word.

In [None]:

def decoder_condition(t, *args):
    return t<max_seq_length

Decoder as a function to be called from the body of the while_loop. Note that, in order to reuse the network after each word, first we need to initialize it then set the reuse to True.

In [None]:
#tf.reset_default_graph()
#_enc_last_state = tf.placeholder(shape=(batch_size, 2*hidden_units),
#                                 dtype=tf.float32, name='decoder_input_enc_last_state')
#_enc_output = tf.placeholder(shape=(batch_size,max_seq_length ,2*hidden_units),
#                                 dtype=tf.float32, name='decoder_input_enc_last_state')
#_decoder_inputs = tf.placeholder(shape=(batch_size),
#                                 dtype=tf.int32, name='decoder_inputs')

def decoder(_decoder_inputs,_hidden_state,reuse=None):
    with tf.variable_scope("decoder",reuse=reuse) as decoder_sc:
        ## Luong's multiplicative score --> score = _hidden_state.T * W * _enc_output

        ### First the W*_enc_output part is handled. It is straightforward with a dense layer, 
        ### and its output size should be hidden_size*2, because we have a bidirectional rnn 
        ### in the encoder. Output shape should be (batch_size, max_len, 2*hidden_size)
        ### because later it will be multiplied with (batch_size,2*hidden_size) (which could be thought
        ### as batch_size, 2*hidden_size, 1) to get the score.
        w_times_enc_output = tf.layers.dense(_enc_output, hidden_units*2)
        print("shape of w_times_enc_output:",w_times_enc_output.get_shape())

        ### First hidden state is taken from the encoder's GRUs last hidden state. So the
        ### shape of it is (batch_size, 2*hidden_size). For each input sentence, there is one
        ### hidden state.
        ### _hidden_state's size is (batch_size, 2*hidden_size) one can think of it as 
        ### (batch_size, 1,2*hidden_size). Semantically, there is only one hidden state vector
        ### for each batch item(iteration).To transpose it, as the formula of Luong's suggests,
        ### we can just expand (batch_size, 2*hidden_size) to (batch_size, 2*hidden_size,1), 
        ### expanding in the 2.nd dimension.
        hidden_state_tr = tf.expand_dims(_hidden_state,2)
        print("shape of enc_last_state_tr:",hidden_state_tr.get_shape())

        ### w_times_enc_output = (batch_size, max_len, 2*hidden_size)
        ### enc_last_state_tr = (batch_size, 2*hidden_size,1)
        ### resulting score = (batch_size, max_len,1)
        score =  tf.matmul(w_times_enc_output,hidden_state_tr)
        print("shape of score:",score.get_shape())

        ### Now the shape of score (batch_size, max_len,1). We have a score for each of the 
        ### input word in a bacth. To normalize it, now they are put in a softmax, and 
        ### the normalization should be within a batch, so the axis to apply softmax is
        ### 1.st one, since 0 is used for batches.
        ### Attention weights(attention_w) has same shape with score, which is (batch_size, max_len,1)
        attention_w = tf.nn.softmax(score,1)

        ### attention_w (batch_size, max_len,1),   _enc_output (batch_size, max_len,2*hidden_size).
        ### Multiplication operator supports broadcasting, so that this multiplication does not produce
        ### an error. attention_w is broadcasted to be multiplied with each hidden unit of _enc_output,.
        ### which means multiplying each output of the hidden units with the attention weight of the
        ### associated word.
        ### Resulting context_vec is in shape of (batch_size, max_len, 2*hidden_size)
        context_vec = attention_w * _enc_output

        ### To create a context vector for each sentence in the batch, now we are summing
        ### up along the dimension of the max_len(along words in a sentence) 
        ### so that we are left with size (batch_size, 2*hidden_size).
        context_vec = tf.reduce_sum(context_vec, axis=1)
        print("shape of context_vec:",context_vec.get_shape())

        ### Input to the decoder is also put through a embedding layer, since they are
        ### target sentences.
        embed_var = lambda: tf.random_uniform([vocab_size,embedding_size],-1.0, 1.0)
        dec_embed_var = tf.Variable(embed_var ,name='decoder_embedding')

        ### Size of the embedded input-> (batch_size, 1, embedding_size)
        dec_embed = tf.nn.embedding_lookup(dec_embed_var, tf.expand_dims(_decoder_inputs,1))

        print("shape of the decoder embedding:",dec_embed.get_shape())

        ### To make the 1.st dimension matching with the embedded input, now the context vector 
        ### is expanded in the 1.st dimension. resulting size is (batch_size, 1, 2*hidden_size)
        context_vec = tf.expand_dims(context_vec, 1)

        ### Concatanate along the second dimension, so the resulting size is
        ### (batch_size, 1, 2*hidden_size + hidden_dim)
        dec_before_gru = tf.concat([context_vec, dec_embed], axis=2)

        ### Since we will be feeding the decoder one input at a time, the sequence length
        ### would be either 0 or 1 depending on the current input of each sentence.
        ### So if the current input is not <PAD>, then the seq len is 1, if it is <PAD> then 
        ### it is just a padding, the seq len is 0.
        all_pads = [data_dict.get_index("<PAD>")]*batch_size
        ones = np.ones((batch_size))
        zeros = np.zeros((batch_size))
        dec_seq_len = tf.cast(tf.where(_decoder_inputs == all_pads, zeros, ones),
                              dtype=tf.float32)

        ### Now the input is ready for the GRU.
        dec_gru = tf.nn.rnn_cell.GRUCell(2*hidden_units)

        dec_dropout = tf.contrib.rnn.DropoutWrapper(dec_gru, input_keep_prob=keep_prob,
                                                       output_keep_prob=keep_prob)

        ### dec_rnn_outputs has shape (batch_size, 1, 2*hidden_size)
        ### dec_rnn_state has shape (batch_size, 2*hidden_size)
        dec_rnn_outputs,dec_rnn_state=tf.nn.dynamic_rnn(cell=dec_dropout, inputs=dec_before_gru, 
                                                        initial_state=_hidden_state,
                                                        sequence_length=dec_seq_len)
        ### To make predictions based on the output of the rnn, now we are reshaping the 
        ### the output to the shape of (batch_size, 2*hidden_size)
        dec_rnn_outputs = tf.squeeze(dec_rnn_outputs)

        ### predictions has the shape of (batch_size, vocab_size). This means we are predicting
        ### only the next word for each sentence. For each sentence, there is a vector of
        ### shape vocab_size which contains the likelihood of the corresponding vocabulary
        ### element for the next word in the sentence.
        preds = tf.layers.dense(dec_rnn_outputs, vocab_size)
    
    return preds, dec_rnn_state,dec_seq_len  #,dec_embed_var.read_value()

In [None]:
#with tf.variable_scope("pred_layer") as pred_layer_sc:  

### Prepare the decoder inputs. 
To reduce the memory usage during the backpropagation, we are putting the each word in each sentences in a batch into a TensorArray.

The size of the decoder inputs is the (batch_size, max_notes_len), aproximately the target notes of the current input batch. First we are transposing it to word major order, so that the [i,] indexing will return the i.th note of the all note sequences in the batch. Then we are going to unstack it, hence we will get a tensorarray of size max_notes_len, each item containing batch_size notes.

In [None]:
decoder_inputs_tr = tf.transpose(_decoder_inputs)
decoder_input_arr = tf.TensorArray(dtype=tf.int32, size=max_seq_length)
decoder_input_arr = decoder_input_arr.unstack(decoder_inputs_tr)

## Run Decoder

General variables to run the decoder loop. 

Iteration starts from 1, because we are going to call the decoder to initialize it at the beginning. 

init_outputs stores the output predictions of each iteration.

In [None]:
init_i = tf.constant(1, dtype=tf.int32)
init_outputs = tf.TensorArray(dtype=tf.float32,size=max_seq_length)
init_seq_len = tf.TensorArray(dtype=tf.init32,size=max_seq_length)
#init_embed_vals = tf.random_uniform([vocab_size,embedding_size],-1.0, 1.0)

Initilize the decoder to be able to "reuse" it, note that reuse is None as default. Initial hidden state is from the encoder and first decoder input is the 0.th element of the decoder_input_arr. 

In [None]:
init_preds,init_hidden_state,temp_seq_len = decoder(decoder_input_arr.read(0), _enc_last_state)
                                       #init_embed_vals)
init_outputs = init_outputs.write(0, init_preds)
init_seq_len = init_seq_len.write(0, temp_seq_len)

Run the decoder with the while loop. We need 

In [None]:
def decoder_body(iteration,outputs,body_hidden_state,seq_len):
    temp_preds,temp_hid_state,temp_seq_len = decoder(
                                    _decoder_inputs=decoder_input_arr.read(iteration), 
                                    _hidden_state=body_hidden_state,
                                    #_embedding_var = embed_val,
                                    reuse=True)
    outputs = outputs.write(iteration, temp_preds)
    seq_len = seq_len.write(iteration, temp_seq_len)
    return iteration+1, outputs, temp_hid_state       ##,temp_embed_vals

Finally run the while loop. We dont need the latest hidden state and the iteration count, the only thing needed is the predictions of the decoder.

In [None]:
_, dec_preds, _ = tf.while_loop(decoder_condition, decoder_body, 
                                [init_i, init_outputs, init_hidden_state,init_seq_len])

Now, dec_preds contains the predictions for each word. So each item in the tensorarray contains (batch_size, vocab_size) shaped tensors. 

Now we are going to create a one tensor whose shape is (batch, max_len,vocab_size). To achieve it, we are going to stack the dec_preds, which results in (max_notes_len, batch_size, vocab_size). Then transpose to get (batch_size,max_notes_len,vocab_size). And the same is done for the seq_len.

In [None]:
preds = tf.transpose(dec_preds.stack(), [1,0,2])
all_seq_len = tf.transpose(seq_len.stack(), [1,0,2])

### Optimizer and the Loss function

Define the mask for the backpropagation. If the target is <PAD> then don't backpropagate. To do it, first get all <PAD> strings as 1, and then subtract from 1 to make them all 0 s.

In [None]:
def batch_loss(targets, preds):
    mask=1-np.equal(targets, data_dict.get_index("<PAD>"))
    ### If the input word is <PAD>, then there is no need for optimization for that input.
    cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=preds, labels=_targets) * mask

    ### mean of the cross entropy is the loss of this batch
    loss = tf.reduce_mean(cross_ent)
    return loss

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(cross_entropy)