# Classifying Names with a Character-Level RNN

[Tutorial](http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html)  
[recurrent net](http://pytorch.org/tutorials/beginner/former_torchies/nn_tutorial.html#example-2-recurrent-net)

- 単語(人の名前)を与えると、その言語を予測するRNNを作りたい
- 各文字がRNNの時間ごとの入力となる
- data/names/<言語名>.txt には、1行ごとに名前(1 word)が書かれている(下のコードが読み込むパスと同じ)

## Preparing the Data

In [None]:
import unicodedata
import string

# Alphabet the model works with: ASCII letters plus a few punctuation marks
all_letters = string.ascii_letters + " .,;'"
# Used later as the length of each one-hot letter vector
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
# Names in some languages contain characters outside the plain alphabet, so
# everything is reduced to ASCII first
def unicodeToAscii(s):
    """Return `s` with accents stripped and unsupported characters removed.

    The string is NFD-normalized so that accents become separate combining
    marks; those marks (Unicode category 'Mn') are dropped, and so is every
    character that does not occur in all_letters.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

print(unicodeToAscii('Ślusàrski'))

In [None]:
import glob
import os

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

def findFiles(path):
    """Return the list of file paths matching the glob pattern `path`."""
    return glob.glob(path)

# Read a file and split into lines
def readLines(filename):
    """Return the lines of a UTF-8 text file, each converted to plain ASCII."""
    # The context manager closes the file handle as soon as it has been read
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    # The file name without directory and extension is the category name;
    # os.path handles both '/' and '\\' separators
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    # Names of this category, already converted to ASCII
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

In [None]:
# Print just the first five names of the Italian category as a quick check
print(category_lines['Italian'][:5])

## Turning Names into Tensors

In [None]:
import torch

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
# Each letter of a word becomes one row of size [1, n_letters] (a one-hot vector)
def letterToTensor(letter):
    """Return the <1 x n_letters> one-hot tensor for a single letter.

    Raises:
        ValueError: if `letter` does not occur in all_letters.
    """
    # Look the letter up directly instead of calling letterToIndex: that helper
    # is only defined in a later cell, so it does not exist yet when the
    # notebook is run top to bottom.
    index = all_letters.find(letter)
    # find() returns -1 for an unknown letter, which as an index would silently
    # mark the last entry of the vector; fail loudly instead.
    if index < 0:
        raise ValueError('%r is not in all_letters' % (letter,))
    tensor = torch.zeros(1, n_letters)
    tensor[0][index] = 1
    return tensor

In [None]:
# Show the one-hot tensor produced for the letter 'J'
print(letterToTensor('J'))

In [None]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of `letter` in all_letters.

    Raises:
        ValueError: if `letter` is not a single character of all_letters.
    """
    # str.find returns -1 when nothing matches; used as a tensor index that
    # would silently select the last letter, so unknown input is rejected.
    index = all_letters.find(letter) if len(letter) == 1 else -1
    if index < 0:
        raise ValueError('%r is not a letter in all_letters' % (letter,))
    return index

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
# The middle dimension is 1 because the batch size is 1 here.
# The batch dimension comes second so that all items of one time step t can
# be fed together, which is why the first dimension is t.
# Lining up t across lines of different length would require padding.
def lineToTensor(line):
    """Encode `line` as a <len(line) x 1 x n_letters> tensor of one-hot rows."""
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, character in enumerate(line):
        column = letterToIndex(character)
        encoded[position][0][column] = 1
    return encoded

In [None]:
# Size is <5 x 1 x n_letters>: one one-hot row per letter of 'Jones'
print(lineToTensor('Jones').size())

## Creating the Network

![](https://i.imgur.com/Z2xbySO.png)

- 黄色はTensor
- Moduleの子クラス関連:
  - 青はlayer
  - 緑は活性化関数
  
> Since the state of the network is held in the graph and not in the layers, you can simply create an nn.Linear and reuse it over and over again for the recurrence.

In [None]:
import torch.nn as nn
from torch.autograd import Variable

# The RNN is built by hand for this tutorial
class RNN(nn.Module):
    """Minimal recurrent cell made of two linear layers.

    At every step the input and the previous hidden state are concatenated
    and fed to both layers: `i2h` produces the next hidden state and `i2o`
    (followed by LogSoftmax) produces log-probabilities over the outputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Size of the hidden state
        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    # As in the diagram, both the input and the hidden state are inputs
    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: tensor concatenated with `hidden` along dim 1.
            hidden: hidden state from the previous step (or initHidden()).

        Returns:
            (output, hidden): log-softmax output and the next hidden state.
        """
        # `combined` really is just the plain concatenation of the two
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return a zeroed <1 x hidden_size> initial hidden state."""
        # Allocate with the dtype and device of the parameters so the state
        # can still be concatenated with the input after .cuda() or .double()
        weight = self.i2h.weight
        return torch.zeros(1, self.hidden_size,
                           dtype=weight.dtype, device=weight.device)

# Hidden state size; the network takes n_letters inputs and produces
# n_categories outputs
n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)


In [None]:
# Try a forward pass on a single letter
# NOTE: `input` shadows the Python builtin of the same name in this notebook
input = Variable(letterToTensor('A'))
hidden = Variable(torch.zeros(1, n_hidden))

output, next_hidden = rnn(input, hidden)

In [None]:
# Build the tensor for the whole line once with lineToTensor (fewer tensor
# creations), then forward a single letter of it.
# Normally the inputs would additionally be batched for efficiency.
input = Variable(lineToTensor('Albert'))
hidden = Variable(torch.zeros(1, n_hidden))

# Feed in the first letter only
output, next_hidden = rnn(input[0], hidden)
print(output)

## Prepare for training

In [None]:
def categoryFromOutput(output):
    """Return (category name, category index) for the highest-scoring output.

    `output` is indexed as a 2-D tensor with a single row (presumably
    <1 x n_categories>, as produced by the network).
    """
    # .data gives the tensor behind the Variable; topk(1) returns
    # (values, indices) and only the indices are needed here
    best = output.data.topk(1)
    best_indices = best[1]
    category_i = best_indices[0][0]
    return all_categories[category_i], category_i

# Alternative to categoryFromOutput built on torch.max
def my_categoryFromOutput(output):
    """Return (category name, category index) for the highest-scoring output.

    `output` is assumed to be a 2-D tensor with a single row, like the
    argument of categoryFromOutput.
    """
    # torch.max over dim 1 drops that dimension, so the index tensor is 1-D
    # (one entry per row); indexing it twice, as the original did, fails.
    _, index = torch.max(output.data, 1)
    category = index[0]
    return all_categories[category], category

# Decode the most recent `output` into (category name, category index)
print(categoryFromOutput(output))

In [None]:
import random

# Given a list, pick one of its elements at random
def randomChoice(l):
    """Return a uniformly chosen element of the non-empty sequence `l`.

    Raises:
        IndexError: if `l` is empty.
    """
    # random.choice is the standard-library form of l[randint(0, len(l) - 1)]
    return random.choice(l)

def randomTrainingExample():
    """Draw one random training pair.

    Returns:
        (category, line, category_tensor, line_tensor): the category name, a
        name from that category, the category index wrapped in a 1-element
        LongTensor, and the one-hot encoding of the name.
    """
    # Pick a random category, then a random line within that category
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])

    # Target: position of the category in all_categories
    category_index = all_categories.index(category)
    category_tensor = Variable(torch.LongTensor([category_index]))
    # Input: one one-hot row per letter of the line
    line_tensor = Variable(lineToTensor(line))
    return category, line, category_tensor, line_tensor

# Print ten random training examples as a sanity check
for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)

## Training the Network

Each loop of training will:

- Create input and target tensors
- Create a zeroed initial hidden state
- Read each letter in and
  - Keep hidden state for next letter
- Compare final output to target
- Back-propagate
- Return the output and loss

In [None]:
# Use the GPU only when CUDA is actually available, so the notebook also
# runs on CPU-only machines instead of failing on the first .cuda() call
use_gpu = torch.cuda.is_available()

In [None]:
# Negative log-likelihood loss; moved to the GPU when use_gpu is set
# (Module.cuda() moves the module in place and returns it)
criterion = nn.NLLLoss()
criterion = criterion.cuda() if use_gpu else criterion

In [None]:
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, line_tensor):
    """Run one training step on a single example.

    Args:
        category_tensor: 1-element LongTensor holding the target category index.
        line_tensor: one-hot letters of the name, indexed by letter position
            along dim 0.

    Returns:
        (output, loss): the network output after the last letter and the
        loss as a Python float.

    Raises:
        ValueError: if `line_tensor` contains no letters.
    """
    if line_tensor.size(0) == 0:
        # Without at least one step there is no output to compute a loss on
        raise ValueError('line_tensor must contain at least one letter')

    if use_gpu:
        # Move the model parameters to the GPU (no effect once they are there)
        rnn.cuda()

    # Keep the inputs and the hidden state on the same device as the model;
    # otherwise the concatenation in forward() fails when the model is on GPU
    device = next(rnn.parameters()).device
    category_tensor = category_tensor.to(device)
    line_tensor = line_tensor.to(device)
    hidden = rnn.initHidden().to(device)

    rnn.zero_grad()
    for letter_tensor in line_tensor: # forward the letters in order
        output, hidden = rnn(letter_tensor, hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    # (the weights are updated by hand here rather than through an optimizer)
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    # loss is a 0-dim tensor: item() extracts the Python number
    return output, loss.item()

In [None]:
import time
import math

# Total number of training iterations
n_iters = 100000
# Print a progress line every `print_every` iterations
print_every = 5000
# Record the averaged loss every `plot_every` iterations
plot_every = 1000



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    """Format the time elapsed since the timestamp `since` as '<m>m <s>s'.

    `since` is a value previously obtained from time.time().
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

# Reference time for the elapsed-time column of the progress output
start = time.time()

# Main training loop: one random example per iteration
# NOTE: the loop variable `iter` shadows the Python builtin of the same name
for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        # Used for the loss plot
        all_losses.append(current_loss / plot_every)
        current_loss = 0

## Plotting the Results

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Plot the averaged losses recorded during training
plt.figure()
plt.plot(all_losses)

## Evaluating the Results

In [None]:
# Keep track of correct guesses in a confusion matrix
# Rows are the true category, columns are the predicted category
confusion = torch.zeros(n_categories, n_categories)
# Number of random examples used to fill the matrix
n_confusion = 10000

# Just return an output given a line
def evaluate(line_tensor):
    """Run the network over every letter of a line and return the final output.

    Args:
        line_tensor: one-hot letters of the name, indexed by letter position
            along dim 0.

    Raises:
        ValueError: if `line_tensor` contains no letters.
    """
    if line_tensor.size(0) == 0:
        # Without at least one step there is no output to return
        raise ValueError('line_tensor must contain at least one letter')

    # train() may have moved the model to the GPU; run on whatever device the
    # parameters live on so input, hidden state and weights agree
    device = next(rnn.parameters()).device
    line_tensor = line_tensor.to(device)
    hidden = rnn.initHidden().to(device)

    for letter_tensor in line_tensor:
        output, hidden = rnn(letter_tensor, hidden)

    return output

# Go through a bunch of examples and record which are correctly guessed
# Predict, compare against the true category, and fill in the matrix
for i in range(n_confusion):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output = evaluate(line_tensor)
    guess, guess_i = categoryFromOutput(output)
    category_i = all_categories.index(category)
    # Add one to the matching (true category, guess) cell
    confusion[category_i][guess_i] += 1

# Normalize by dividing every row by its sum
# NOTE(review): a category that was never drawn above has a zero row sum, so
# this division would produce NaN for that row
for i in range(n_categories):
    confusion[i] = confusion[i] / confusion[i].sum()

# Set up plot
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion.numpy())
fig.colorbar(cax)

# Set up axes: fix one tick per category first, then label those ticks.
# Labelling without fixed tick positions (and padding the labels with '' to
# compensate for an extra leading tick) can put labels on the wrong cells.
tick_positions = list(range(n_categories))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(all_categories, rotation=90)
ax.set_yticklabels(all_categories)

# sphinx_gallery_thumbnail_number = 2
plt.show()


## Running on User Input

In [None]:
def predict(input_line, n_predictions=3):
    """Print and return the most likely categories for a name.

    Args:
        input_line: the name to classify.
        n_predictions: how many of the top categories to report (capped at
            the number of known categories).

    Returns:
        A list of [value, category] pairs, best first, where value is the
        log-softmax score output by the network.

    Raises:
        ValueError: if no supported letters remain after ASCII conversion.
    """
    print('\n> %s' % input_line)

    # Normalize the input the same way the training names were (readLines
    # applies unicodeToAscii), so accented letters are not mis-encoded
    ascii_line = unicodeToAscii(input_line)
    if not ascii_line:
        raise ValueError('%r contains no supported letters' % (input_line,))
    output = evaluate(Variable(lineToTensor(ascii_line)))

    # Get top N categories; topk cannot return more entries than exist
    n_predictions = min(n_predictions, len(all_categories))
    topv, topi = output.data.topk(n_predictions, 1, True)
    predictions = []

    for i in range(n_predictions):
        # Convert to plain Python numbers for formatting and list indexing
        value = float(topv[0][i])
        category_index = int(topi[0][i])
        print('(%.2f) %s' % (value, all_categories[category_index]))
        predictions.append([value, all_categories[category_index]])

    # The collected predictions were previously built but never returned
    return predictions

# Try the classifier on a few sample names
predict('Dovesky')
predict('Jackson')
predict('Satoshi')