## Preparing the Data
We will download the [data](https://download.pytorch.org/tutorial/data.zip) and extract it into the data directory configured below.
Included in the ``data/names`` directory are 18 text files named as
"[Language].txt". Each file contains a bunch of names, one name per
line, mostly romanized (but we still need to convert from Unicode to
ASCII).

We'll end up with a dictionary of lists of names per language,
``{language: [names ...]}``. The generic variables "category" and "line"
(for language and name in our case) are used for later extensibility.


In [37]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import zipfile
import unicodedata
import string
import random
import time
import math

import mxnet as mx
from mxnet.gluon import nn
from mxnet.gluon import rnn
from mxnet import gluon, autograd

### Change data location below

In [38]:
# Directory that receives the downloaded archive and its extracted contents;
# the data-preparation cell later reads 'data.txt' from this directory.
data_dir_name='./char-rnn-data'

In [39]:
def download_data(data_dir_name):
    """Download the names dataset archive and unpack it into ``data_dir_name``.

    The archive is fetched with ``mx.test_utils.download`` (called with
    ``overwrite=False``) and every member is then extracted into the same
    directory.

    Args:
        data_dir_name: Directory that receives both the downloaded zip file
            and its extracted contents.
    """
    fname = mx.test_utils.download(
        url='https://download.pytorch.org/tutorial/data.zip',
        dirname=data_dir_name,
        overwrite=False)
    # The context manager closes the archive even if extraction raises, and
    # the name ``archive`` avoids shadowing the builtin ``zip``.
    with zipfile.ZipFile(fname) as archive:
        archive.extractall(data_dir_name)


download_data(data_dir_name)

In [40]:
# Read the raw records: one "name,category" pair per line.
with open(data_dir_name + '/data.txt', 'r', encoding='utf-8') as d:
    raw_lines = [l.strip() for l in d]

# With a batch size of 1 the samples are consumed one by one, so shuffling the
# whole list once is enough to mix the categories.
random.shuffle(raw_lines)

# Keep only well-formed records. Name and category are validated together so
# the two derived lists can never get out of step, and real lists (not lazy
# ``filter`` objects) are built so they can be indexed and sliced on Python 3.
all_lines = []
for raw in raw_lines:
    fields = raw.split(',')
    if len(fields) < 2:
        continue  # blank line or a line without a category
    if not fields[0].strip() or not fields[1].strip():
        continue  # empty name or empty category
    all_lines.append(fields)

names_all = [fields[0].strip() for fields in all_lines]
categories_all = [fields[1].strip() for fields in all_lines]

# Sorted so that the category -> index mapping is identical on every run.
categories = sorted(set(categories_all))
n_categories = len(categories)

# Hold out the last 10% of the shuffled data for evaluation; no separate test
# split is created here.
num_eval = int(0.10 * len(names_all))
# Slice by an explicit index: ``[0:-num_eval]`` would return an empty training
# set when num_eval is 0.
split_at = len(names_all) - num_eval

train_data = names_all[:split_at]
eval_data = names_all[split_at:]

num_train = len(train_data)

train_label = categories_all[:split_at]
eval_label = categories_all[split_at:]

# Characters that get their own one-hot slot.
vocab = string.ascii_letters + " .,;'"
vocab_size = len(vocab)

print('num_data:', len(names_all))
print('num_train:', len(train_data))
print('num_eval:', len(eval_data))
print('num_train_label:', len(train_label))
print('num_eval_label:', len(eval_label))

print('vocab_size:', vocab_size)
print('num_labels:', n_categories)

num_data: 20074
num_train: 18067
num_eval: 2007
num_train_label: 18067
num_eval_label: 2007
vocab_size: 57
num_labels: 18


In [41]:
def _one_hot_name(name):
    """Encode ``name`` as one one-hot row of width ``vocab_size`` per character.

    Each character is mapped to its position in ``vocab`` with ``str.find``,
    which yields -1 for characters that are not in ``vocab``.
    """
    char_ids = mx.nd.array([vocab.find(ch) for ch in name])
    return mx.nd.one_hot(char_ids, vocab_size)


def _one_hot_label(label):
    """Encode ``label`` as a single one-hot row of width ``n_categories``."""
    class_id = mx.nd.array([categories.index(label)])
    return mx.nd.one_hot(class_id, n_categories)


# Training split.
train_data_onehot = [_one_hot_name(name) for name in train_data]
train_label_onehot = [_one_hot_label(label) for label in train_label]

# Evaluation split.
eval_data_onehot = [_one_hot_name(name) for name in eval_data]
eval_label_onehot = [_one_hot_label(label) for label in eval_label]

In [42]:
# Sanity check: report the size of each encoded split and the shape of its
# first sample.
print('train_data.len', len(train_data_onehot))
_first_train = train_data_onehot[0]
print('train_data.shape', _first_train.shape)
print('num_train:', num_train)

print('eval_data.len', len(eval_data_onehot), 'eval_label.len', len(eval_label_onehot))
_first_eval = eval_data_onehot[0]
_first_eval_label = eval_label_onehot[0]
print('eval_data.shape', _first_eval.shape, 'eval_label.shape', _first_eval_label.shape)
print('num_eval:', num_eval)

train_data.len 18067
train_data.shape (7L, 57L)
num_train: 18067
eval_data.len 2007 eval_label.len 2007
eval_data.shape (6L, 57L) eval_label.shape (1L, 18L)
num_eval: 2007


In [43]:
# Model and training hyperparameters.
# Hidden size of every LSTM cell (also the input width of the final dense layer).
num_hidden = 100
# Number of LSTM cells stacked in the model.
num_layers = 2
# Samples are processed one at a time; used for the initial RNN state and
# passed to trainer.step().
batch_size = 1
# Upper bound of the epoch loop in train().
num_epochs = 20

In [44]:
# Prefer the first GPU, but fall back to the CPU when no usable GPU is
# available so the notebook also runs on CPU-only machines.
try:
    # Allocate a tiny array on the GPU and wait for it, so that a missing
    # device or a CPU-only MXNet build surfaces here rather than mid-training.
    mx.nd.zeros((1,), ctx=mx.gpu(0)).wait_to_read()
    ctx = mx.gpu(0)
except mx.base.MXNetError:
    ctx = mx.cpu()

In [45]:
def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: Start timestamp in seconds, as returned by ``time.time()``.

    Returns:
        A string holding the whole minutes and the remaining whole seconds
        that have passed between ``since`` and now.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [46]:
class RNN(gluon.Block):
    """Stacked-LSTM sequence classifier.

    The input sequence is unrolled through ``num_layers`` LSTM cells, dropout
    is applied to the per-step outputs, the outputs are averaged over the time
    axis, and a dense layer maps that average to ``num_output_class`` scores.

    Args:
        num_layers: Number of LSTM cells stacked on top of each other.
        num_hidden: Hidden size of every LSTM cell and input width of the
            dense layer.
        num_output_class: Number of output scores produced per sequence.
        dropout_prob: Dropout rate applied to the LSTM outputs.
        **kwargs: Forwarded to ``gluon.Block.__init__``.
    """

    def __init__(self, num_layers, num_hidden, num_output_class, dropout_prob=0.5, **kwargs):
        super(RNN, self).__init__(**kwargs)
        self.num_layers = num_layers
        self.dropout_prob = dropout_prob

        with self.name_scope():
            self.drop = nn.Dropout(dropout_prob)
            self.stack = rnn.SequentialRNNCell()
            for n in range(num_layers):
                self.stack.add(rnn.LSTMCell(hidden_size=num_hidden, prefix='lstm_l%d_' % n))
            self.fc = nn.Dense(num_output_class, in_units=num_hidden)

    def forward(self, inputs, seq_length=None):
        """Compute class scores for a batch of sequences.

        Args:
            inputs: Input array in 'NTC' layout (batch, time, channels).
            seq_length: Number of time steps to unroll. Defaults to
                ``inputs.shape[1]``, the time axis of the 'NTC' layout.

        Returns:
            The dense layer's output for the time-averaged LSTM outputs.
        """
        if seq_length is None:
            seq_length = inputs.shape[1]

        # The final recurrent states are not needed, only the merged
        # per-step outputs.
        lstm_output, _ = self.stack.unroll(seq_length, inputs, layout='NTC', merge_outputs=True)
        # Call the block itself rather than ``.forward`` so the regular
        # Block call path is used.
        drop_output = self.drop(lstm_output)

        # Average the per-step outputs over the time axis.
        fc_input = mx.nd.sum(drop_output, axis=1)
        fc_input = mx.nd.divide(fc_input, seq_length)
        return self.fc(fc_input)

In [47]:
# Build the classifier, initialise its parameters on the chosen device, and
# attach a plain SGD trainer to those same parameters.
model = RNN(num_layers=num_layers, num_hidden=num_hidden, num_output_class=n_categories)
model_params = model.collect_params()
model_params.initialize(mx.init.Xavier(), ctx=ctx)
trainer = gluon.Trainer(model_params, 'sgd', {'learning_rate': 0.001})

In [48]:
def eval():
    """Return the mean softmax cross-entropy of ``model`` on the evaluation set.

    Every evaluation sample is fed through the model as a batch of one and its
    loss is accumulated.

    Returns:
        The summed loss divided by ``num_eval``, or ``nan`` when the
        evaluation set is empty.
    """
    if num_eval == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return float('nan')

    total_loss = 0.0
    for index in range(num_eval):
        data = eval_data_onehot[index].as_in_context(ctx)
        # Add a leading batch axis of size 1: (time, vocab) -> (1, time, vocab).
        data = data.reshape((1,) + data.shape)

        # softmax_cross_entropy expects class indices of shape (batch,), not
        # one-hot rows, so recover the index from the (1, n_categories) row.
        label = mx.nd.argmax(eval_label_onehot[index].as_in_context(ctx), axis=1)

        # Scores keep their (batch, n_categories) shape, as the loss requires.
        output = model(data, data.shape[1])
        L = mx.nd.softmax_cross_entropy(output, label)

        total_loss += mx.nd.sum(L).asscalar()

    return total_loss / num_eval

In [49]:
# Summed training loss of every completed epoch, appended by train().
losses = []
def train(print_every=500):
    """Train ``model`` for ``num_epochs`` epochs, one sample per update step.

    After each epoch the summed training loss is appended to ``losses`` and
    the mean training loss, validation loss and elapsed time are printed.

    Args:
        print_every: Accepted for backward compatibility; currently unused.
    """
    start = time.time()
    # Epochs are numbered from 1, so the bound must be inclusive to really
    # run num_epochs passes.
    for epoch in range(1, num_epochs + 1):
        total_loss = 0.0
        for index in range(num_train):
            data = train_data_onehot[index].as_in_context(ctx)
            # Add a leading batch axis of size 1: (time, vocab) -> (1, time, vocab).
            data = data.reshape((1,) + data.shape)

            # softmax_cross_entropy expects class indices of shape (batch,),
            # not one-hot rows, so recover the index from the one-hot label.
            label = mx.nd.argmax(train_label_onehot[index].as_in_context(ctx), axis=1)

            with autograd.record():
                # Scores keep their (batch, n_categories) shape, as the loss
                # requires.
                output = model(data, data.shape[1])
                L = mx.nd.softmax_cross_entropy(data=output, label=label)
            L.backward()

            trainer.step(batch_size)

            total_loss += L.asscalar()

        val_loss = eval()

        losses.append(total_loss)
        # max(..., 1) keeps the log line printable for an empty training set.
        print('[Epoch %d] Training loss=%f, Val loss=%f, time=%s'%(epoch, total_loss/max(num_train, 1), val_loss, timeSince(start)))

In [None]:
# Run the full training loop; per-epoch losses are collected in `losses`.
train()