## Preparing the Data

In [11]:
import glob

In [39]:
path = "./data/data-for-char-rnn/names/*.txt"
file_names = glob.glob(path) # find file names: ./[Language].txt
print(file_names)
all_letters = string.ascii_letters + " .,;'"
num_letters = len(all_letters)
print(all_letters)

['./data/data-for-char-rnn/names/Scottish.txt', './data/data-for-char-rnn/names/German.txt', './data/data-for-char-rnn/names/Italian.txt', './data/data-for-char-rnn/names/Dutch.txt', './data/data-for-char-rnn/names/English.txt', './data/data-for-char-rnn/names/Portuguese.txt', './data/data-for-char-rnn/names/Chinese.txt', './data/data-for-char-rnn/names/Japanese.txt', './data/data-for-char-rnn/names/Greek.txt', './data/data-for-char-rnn/names/Vietnamese.txt', './data/data-for-char-rnn/names/Korean.txt', './data/data-for-char-rnn/names/Polish.txt', './data/data-for-char-rnn/names/Czech.txt', './data/data-for-char-rnn/names/Arabic.txt', './data/data-for-char-rnn/names/Russian.txt', './data/data-for-char-rnn/names/French.txt', './data/data-for-char-rnn/names/Irish.txt', './data/data-for-char-rnn/names/Spanish.txt']
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'


In [13]:
import unicodedata
import string

In [36]:
def unicode2ascii(s):
    """Turn a Unicode string to plain ASCII
    http://stackoverflow.com/a/518232/2809427"""
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [21]:
print(unicode2ascii('Ślusàrski'))

Slusarski


In [29]:
def _load_name_data(file_name):
    data = []
    with open(file_name, 'r') as f:
        for line in f:
            data.append(unicode2ascii(line.strip()))
    return data

In [30]:
file_name = './data/data-for-char-rnn/names/Chinese.txt'
data = _load_name_data(file_name)
print(data[:5])

['Ang', 'AuYong', 'Bai', 'Ban', 'Bao']


In [31]:
def load_data(file_names):
    data = dict()
    for f in file_names:
        category = f.split('/')[-1].split('.')[0]
        data[category] = _load_name_data(f)
    return data

In [52]:
data = load_data(file_names)
num_category = len(data)
print(num_category)
print(data.keys())
print(data['Chinese'][:5])

18
dict_keys(['Scottish', 'German', 'Italian', 'Dutch', 'English', 'Portuguese', 'Chinese', 'Japanese', 'Greek', 'Vietnamese', 'Korean', 'Polish', 'Czech', 'Arabic', 'Russian', 'French', 'Irish', 'Spanish'])
['Ang', 'AuYong', 'Bai', 'Ban', 'Bao']


In [38]:
def letter2index(letter):
    return all_letters.find(letter)

print(letter2index('a'))

0


In [41]:
import torch

def letter2tensor(letter):
    tensor = torch.zeros(1, num_letters)
    tensor[0][letter2index(letter)] = 1
    return tensor

print(letter2tensor('a'))



Columns 0 to 12 
    1     0     0     0     0     0     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 26 to 38 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 39 to 51 
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 52 to 56 
    0     0     0     0     0
[torch.FloatTensor of size 1x57]



In [44]:
def name2tensor(name):
    # [name_len, batch_size, letters_size)
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, letter in enumerate(name):
        tensor[i] = letter2tensor(letter)
    return tensor

print(line2tensor('hang').size())

torch.Size([4, 1, 57])


## Define the Network: Simple RNN
![SimpleRNN](./pictures/simpleRNN.png)


In [49]:
import torch.nn as nn
from torch.autograd import Variable

In [54]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.i2o= nn.Linear(input_size + hidden_size, output_size)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.softmax = nn.LogSoftmax()
    
    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        return self.softmax(output)
    
    def init_hidden(self):
        return Variable(torch.zeros(1, self.hidden_size))

hidden_size = 128
rnn = RNN(num_letters, hidden_size, num_category)