In [None]:
# Environment check: print interpreter and library versions before building the model.
# NOTE: later cells call tf.InteractiveSession and tf.train.AdamOptimizer, which look like
# TensorFlow 1.x-style APIs -- confirm against the version printed here.
import sys
print(sys.version_info)
import os
# Choose the Keras backend via the environment; done before `import keras` so it is
# already in place when Keras is loaded.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
print('Keras version: {}'.format(keras.__version__))
import tensorflow as tf
print('TensorFlow version: {}'.format(tf.__version__))
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Problem: generate names

* Struggling to find a name for a variable? Let's see how you'll come up with a name for your son/daughter. Surely no human has expertise in what makes a good child name, so let us train a neural network instead;
* The dataset contains ~8k earthling names from different cultures, all in Latin transcription;
* Objective: learn a generative model over names.


In [None]:
import os
start_token = " "

# Read one name per line and prepend the start token to each of them.
# Blank lines (including the empty piece produced by a trailing newline) are
# skipped explicitly; unconditionally chopping the last character of the file
# would truncate the final name whenever the file does not end with a newline.
with open("names") as f:
    names = [start_token + name for name in f.read().split('\n') if name]
    

In [None]:
# Report the dataset size, then show every 1000th name as a quick sanity check.
print('n samples = {}'.format(len(names)))
for sample_idx in range(0, len(names), 1000):
    print(names[sample_idx])

# Text processing

In [None]:
#all unique characters go here
# NOTE: exercise stub -- the assignment below has no right-hand side yet,
# so this cell will not run until it is filled in.
tokens = #<all unique characters in the dataset>

# Convert to a list so that every token has a position (index) in it.
tokens = list(tokens)

# Vocabulary size, reused by later cells.
n_tokens = len(tokens)
print('n_tokens = ',n_tokens)


In [None]:
# Histogram of name lengths (each length counts the prepended start token).
name_lengths = [len(name) for name in names]
plt.title('Sequence length distribution')
plt.hist(name_lengths, bins=25)

### Cast everything from symbols into identifiers

Tensorflow string manipulation is a bit tricky, so we'll work around it. 
We'll feed our recurrent neural network with ids of characters from our dictionary.

To create such a dictionary, let's assign each token an integer identifier: its index in the `tokens` list.

In [None]:
# NOTE: exercise stub -- fill in the right-hand side. The mapping must satisfy
# token_to_id[tokens[i]] == i for every i, which the next cell asserts.
token_to_id = #<dictionary of symbol -> its identifier (index in tokens list)>



In [None]:
# Sanity checks: the mapping covers the whole vocabulary and inverts `tokens`.
assert len(token_to_id) == len(tokens), "dictionaries must have same size"

for position in range(n_tokens):
    token = tokens[position]
    assert token_to_id[token] == position, "token identifier must be it's position in tokens list"

print("Seems alright!")

In [None]:
def to_matrix(names, max_len=None, pad=token_to_id[' '], dtype='int32'):
    """Cast a list of names into an RNN-digestible matrix of token ids.

    Every name is converted character by character through ``token_to_id``.
    Names shorter than ``max_len`` are right-padded with ``pad``; names longer
    than ``max_len`` are truncated instead of raising a shape error.

    Args:
        names: sequence of strings to encode.
        max_len: number of time steps in the result; when None (or 0) the
            length of the longest name is used.
        pad: token id written into positions past the end of a name.
        dtype: dtype of the returned array.

    Returns:
        Array of shape ``[max_len, len(names)]``: one row per time step and
        one column per name (the per-name matrix is transposed on return).

    Raises:
        KeyError: if a name contains a character that is not in ``token_to_id``.
    """
    # `default=0` keeps an empty list of names from crashing max().
    max_len = max_len or max(map(len, names), default=0)
    # np.full guarantees the requested dtype regardless of the type of `pad`.
    names_ix = np.full([len(names), max_len], pad, dtype=dtype)

    for i, name in enumerate(names):
        # Clip to max_len so an overlong name cannot overflow its row; direct
        # indexing reports an unknown character by name instead of passing
        # None on to the array assignment.
        name_ix = [token_to_id[char] for char in name[:max_len]]
        names_ix[i, :len(name_ix)] = name_ix

    return names_ix.T

In [None]:
# Example: take every 2000th name and show it next to its id matrix.
# to_matrix returns one row per time step, so transpose back to one row per
# name; positions past the end of a name hold the pad id (the id of ' ').
example_names = names[::2000]
print('\n'.join(example_names))
print(to_matrix(example_names).T)

In [None]:
# Raw output of to_matrix (no extra transpose): one row per time step, one column per name.
to_matrix(names[::2000])

# Recurrent neural network

We can rewrite recurrent neural network as a consecutive application of dense layer to input $x_t$ and previous rnn state $h_t$. This is exactly what we're gonna do now.
<img src="./rnn.png" width=480>

Since we're training a language model, there should also be:
* An embedding layer that converts character id x_t to a vector.
* An output layer that predicts the probabilities of the next character.

In [None]:
from keras.layers import Concatenate, Dense, Embedding

# Width of the hidden state h_t (the initial state is created with this many columns).
rnn_num_units = 64
# Presumably the dimensionality of the character embeddings -- confirm when filling in embed_x.
embedding_size = 16

#Let's create layers for our recurrent network
# NOTE: exercise stubs -- the three assignments below have no right-hand side yet,
# so this cell will not run until they are filled in.
embed_x = #<an embedding layer that converts character ids into embeddings>
get_h_next = #<a dense layer that maps [x_t,h_t]->h_t+1>
get_probas = #<a dense layer that maps [h_t+1]->P(x_t+1|h_t+1)>

#Note: please set the correct activation to Dense layer

In [None]:
def rnn_one_step(x_t, h_t):
    """
    Recurrent neural network step that produces next state and output
    given prev input and previous state.
    We'll call this method repeatedly to produce the whole sequence.
    
    Follow inline instructions to complete the function.

    Args:
        x_t: token ids for the current step (callers pass int32 tensors:
            one row of the input placeholder, or a 1-D placeholder).
        h_t: previous hidden state (callers pass tensors whose last
            dimension is rnn_num_units).

    Returns:
        Tuple (output_probas, h_next).
    """
    
    # NOTE: exercise stubs -- the assignments below have no right-hand side yet.
    # Pay attention to the shapes, it may be messy!
    x_t_emb = #<convert character id into embedding>
    x_and_h = #<concatenate x embedding and previous h state>
    
    h_next =  #<compute next state given x_and_h>
    
    output_probas = #<get probabilities for language model P(x_next|h_next)>
    
    return output_probas,h_next

### RNN loop

Once rnn_one_step is ready, let's apply it in a loop over name characters to get predictions.

Let's assume that all names are at most length-16 for now, so we can simply iterate over them in a for loop.


In [None]:
MAX_LENGTH = 16

# Batch of token ids with one row per time step; the batch size stays dynamic.
input_sequence = keras.backend.placeholder(shape=(MAX_LENGTH, None), dtype='int32')
batch_size = keras.backend.shape(input_sequence)[1]

# Unroll the recurrence step by step, starting from an all-zero hidden state
# and carrying the state forward from each step to the next.
h_prev = keras.backend.zeros([batch_size, rnn_num_units])
predicted_probas = []
for t in range(MAX_LENGTH):
    probas_next, h_prev = rnn_one_step(input_sequence[t], h_prev)
    predicted_probas.append(probas_next)

# Stack the per-step outputs into a single tensor (step index first).
predicted_probas = keras.backend.stack(predicted_probas)

## RNN: loss and gradients

Let's gather a matrix of predictions for $P(x_{next}|h)$ and the corresponding correct answers.

Our network can then be trained by minimizing crossentropy between predicted probabilities and those answers.

In [None]:
# The prediction made at step t is scored against the input at step t+1:
# drop the last prediction and the first input so the two line up.
probas_except_last = predicted_probas[:-1]
predictions_matrix = keras.backend.reshape(probas_except_last, [-1, len(tokens)])

next_token_ids = keras.backend.flatten(input_sequence[1:])
answers_matrix = keras.backend.one_hot(next_token_ids, n_tokens)

To optimize the loss we need a bit of TensorFlow, because we've gone too deep into the backend. 

In [None]:
# NOTE: exercise stub -- `loss` has no right-hand side yet; it should compare
# predictions_matrix with answers_matrix.
loss = #<define loss as categorical crossentropy>
optimize = tf.train.AdamOptimizer().minimize(loss) # It's the way Adam optimizer is called in TensorFlow

### The training loop
Docs about tf.Session: https://www.tensorflow.org/api_docs/python/tf/Session
Actually, our network is built in TensorFlow, but using high-level keras API. So we need some tf stuff.

Calling tf.Session (or tf.InteractiveSession) creates a session, which is TensorFlow's "runtime". All the variables are created within this session and are available in it.

In [None]:
from IPython.display import clear_output
from random import sample

# Create the session that runs the graph and hand the same session to Keras.
sess = tf.InteractiveSession()
keras.backend.set_session(sess) 
# Run the initializer for the variables defined so far (this cell comes after
# the optimizer has been created).
sess.run(tf.global_variables_initializer())
# Per-iteration loss values; the training loop appends to this list.
history = []

In [None]:
for i in range(2000):
    # Draw 32 random names and encode them as an id matrix of MAX_LENGTH steps.
    batch = to_matrix(sample(names, 32), max_len=MAX_LENGTH)
    # One optimisation step on that batch; keep the loss value for plotting.
    loss_i, _ = sess.run([loss, optimize], feed_dict={input_sequence: batch})
    history.append(loss_i.mean())

    # Redraw the loss curve only on every 100th iteration.
    if (i + 1) % 100 != 0:
        continue
    clear_output(True)
    plt.plot(history, label='loss')
    plt.legend()
    plt.show()


### RNN: sampling
Once we've trained our network a bit, let's get to actually generating stuff. All we need is the `rnn_one_step` function you have written above.

In [None]:
# Single-step graph used for sampling: a placeholder for the current token id
# and a hidden state stored in a variable of shape [1, rnn_num_units].
x_t = keras.backend.placeholder(shape=(None,), dtype='int32')
initial_state = np.zeros((1, rnn_num_units), dtype='float32')
h_t = keras.backend.variable(initial_state)

next_probs, next_h = rnn_one_step(x_t, h_t)

In [None]:
# Build the hidden-state update ops once. Calling tf.assign inside
# generate_sample would add new nodes to the graph on every step of every call.
reset_h_t = tf.assign(h_t, h_t.initial_value)
advance_h_t = tf.assign(h_t, next_h)


def generate_sample(seed_phrase=' ', max_length=MAX_LENGTH, sess=sess):
    '''
    Generate a string character by character, starting from seed_phrase.

    parameters:
        seed_phrase: non-empty string the result starts with; every character
            must be a key of token_to_id.
        max_length: total length of the returned string, seed included. If the
            seed is already that long, it is returned unchanged.
        sess: session used to run the sampling graph.

    returns:
        The seed phrase followed by the sampled characters.
    '''
    x_sequence = [token_to_id[token] for token in seed_phrase]
    # Start every sample from the initial hidden state.
    sess.run(reset_h_t)

    # Feed the seed phrase (all but its last character) to update the state.
    for ix in x_sequence[:-1]:
        sess.run(advance_h_t, {x_t: [ix]})

    # Start generating: feed the latest character, store the new state and
    # sample the next character from the predicted distribution.
    for _ in range(max_length - len(seed_phrase)):
        x_probs = sess.run([next_probs, advance_h_t], {x_t: [x_sequence[-1]]})[0]
        x_sequence.append(np.random.choice(n_tokens, p=x_probs[0]))

    return ''.join([tokens[ix] for ix in x_sequence])

In [None]:
# Print ten samples generated from the default seed (a single space).
for _ in range(10):
    print(generate_sample())

In [None]:
# Print fifty samples that all start with the seed ' Murz'.
for _ in range(50):
    print(generate_sample(' Murz'))

### Try it out!
You've just implemented a recurrent language model that can be tasked with generating any kind of sequence, so there's plenty of data you can try it on:

* Novels/poems/songs of your favorite author
* News titles/clickbait titles
* Source code of Linux or Tensorflow
* Molecules in [smiles](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) format
* Melody in notes/chords format
* Ikea catalog titles
* Pokemon names
* Cards from Magic, the Gathering / Hearthstone

If you're willing to give it a try, here's what you wanna look at:
* Current data format is a sequence of lines, so a novel can be formatted as a list of sentences. Alternatively, you can change data preprocessing altogether.
* While some datasets are readily available, others can only be scraped from the web. Try `Selenium` or `Scrapy` for that.
* Make sure MAX_LENGTH is adjusted for longer datasets.
* More complex tasks require larger RNN architecture, try more neurons or several layers. It would also require more training iterations.
* Long-term dependencies in music, novels or molecules are better handled with LSTM or GRU

__Good hunting!__

Please leave your feedback at http://bit.ly/feedback_ml_dl_cnn!