In [1]:
from os import path, listdir
import glob
DATA_PATH = 'data/fb_test/names'

In [2]:
import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

In [3]:
category_lines = {}
all_categories = []

for filename in listdir(DATA_PATH):
    full_path = path.join(DATA_PATH, filename)
    category = path.splitext(path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(full_path)
    category_lines[category] = lines

n_categories = len(all_categories)

In [4]:
import numpy as np

def letterToIndex(letter):
    return all_letters.find(letter) + 1

def lineToTensor(line):
    tensor = np.zeros((len(line)))
    for li, letter in enumerate(line):
        tensor[li] = letterToIndex(letter)
    return tensor

def batchify(lines):
    seq_lens = [len(line) for line in lines]
    max_len = max(seq_lens)
    tensor = np.zeros((len(lines), max_len))
    for ix, line in enumerate(lines):
        line_tensor = lineToTensor(line)
        tensor[ix,:len(line_tensor)] = line_tensor
    return tensor, seq_lens

In [5]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [6]:
X_train = []
y_train = []

for category in category_lines:
    X_train.extend(category_lines[category])
    y_train.extend([all_categories.index(category)] * len(category_lines[category]))

import random
shuffle_idx = list(range(len(X_train)))
random.shuffle(shuffle_idx)
X_train = [X_train[idx] for idx in shuffle_idx]
y_train = [y_train[idx] for idx in shuffle_idx]

Some training conditions

In [7]:
batch_size = 64

n_iters = 100
print_every = 5
plot_every = 1

# RNN Text classifier, Pytorch

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

print(torch.__version__)

1.0.0.dev20190209


In [9]:
class TorchRNNClassifier(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(TorchRNNClassifier, self).__init__()
        
        self.embedding = nn.Embedding(n_letters + 1, hidden_size)
        self.dropout = nn.Dropout(.2)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, seq_lens):
        emb = self.embedding(input)
        emb = self.dropout(emb)
        emb = pack_padded_sequence(emb, seq_lens, batch_first=True, enforce_sorted=False)
        output = self.rnn(emb)[0]
        output = pad_packed_sequence(output, batch_first=True)[0]
        output = torch.max(output, 1)[0].squeeze(1)
        output = self.linear(output)
        
        return output

In [10]:
model_pytorch = TorchRNNClassifier(64, n_categories)

In [11]:
# batch, seq_lens = batchify(['Ngoc', 'James'])
# batch = torch.LongTensor(batch)
# print(model_pytorch(batch, seq_lens).size())
# print(model_pytorch(batch, seq_lens))

In [12]:
criterion = nn.CrossEntropyLoss()

def train_pytorch(model, optimizer, lines, categories):
    model.zero_grad()

    X_tensor, seq_lens = batchify(lines)
    X_tensor = torch.LongTensor(X_tensor)
    y_tensor = torch.LongTensor(categories)
    
    output = model(X_tensor, seq_lens)

    loss = criterion(output, y_tensor)
    loss.backward()
    
    optimizer.step()
    
    return output, loss.item()

In [13]:
optimizer_pytorch = optim.Adam(model_pytorch.parameters())

# Keep track of losses for plotting
current_loss = 0
all_losses = []

start = time.time()

for iter in range(1, n_iters + 1):
    correct = 0
    iter_loss = 0
    
    for start_idx in range(0, len(X_train), batch_size):
        batch_len = min(len(X_train) - start_idx, batch_size)
        train_lines = X_train[start_idx:start_idx + batch_len]
        train_categories = y_train[start_idx:start_idx + batch_len]
        
        model_pytorch.train()
        output, loss = train_pytorch(model_pytorch, optimizer_pytorch, train_lines, train_categories)
        current_loss += loss
        iter_loss += loss
        
        batch_correct = output.topk(1, dim=1)[1].squeeze(1) == torch.LongTensor(train_categories)
        correct += batch_correct.float().sum()

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        accuracy = correct / len(X_train)
        print('%d %d%% (%s) %.4f - accuracy: %.2f' % (iter, iter / n_iters * 100, timeSince(start), iter_loss, accuracy))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0
        
model_pytorch.eval()

5 5% (0m 32s) 277.0938 - accuracy: 0.74
10 10% (1m 1s) 210.9891 - accuracy: 0.80
15 15% (1m 31s) 180.9646 - accuracy: 0.82
20 20% (2m 3s) 163.6553 - accuracy: 0.84
25 25% (2m 31s) 151.1783 - accuracy: 0.85
30 30% (2m 59s) 140.6731 - accuracy: 0.86
35 35% (3m 31s) 133.0128 - accuracy: 0.87
40 40% (4m 0s) 127.0053 - accuracy: 0.87
45 45% (4m 29s) 121.0481 - accuracy: 0.88
50 50% (4m 59s) 116.7010 - accuracy: 0.88
55 55% (5m 31s) 112.4268 - accuracy: 0.88
60 60% (6m 3s) 108.8677 - accuracy: 0.89
65 65% (6m 36s) 105.0301 - accuracy: 0.89
70 70% (7m 5s) 102.7741 - accuracy: 0.89
75 75% (7m 36s) 101.0709 - accuracy: 0.90
80 80% (8m 8s) 96.6039 - accuracy: 0.90
85 85% (8m 40s) 95.6565 - accuracy: 0.90
90 90% (9m 12s) 93.9829 - accuracy: 0.90
95 95% (9m 43s) 90.7011 - accuracy: 0.91
100 100% (10m 14s) 89.7329 - accuracy: 0.90


TorchRNNClassifier(
  (embedding): Embedding(58, 64)
  (dropout): Dropout(p=0.2)
  (rnn): GRU(64, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=18, bias=True)
)

In [14]:
def predict(lines, n_predictions):
    with torch.no_grad():
        X_tensor, seq_lens = batchify(lines)
        X_tensor = torch.LongTensor(X_tensor)
        result = model_pytorch(X_tensor, seq_lens)
        
        topv, topi = result.topk(n_predictions, 1, True)
        predictions = []
        
        for idx, input_line in enumerate(lines):
            print('\n> %s' % input_line)
            
            for i in range(n_predictions):
                value = topv[idx][i].item()
                category_index = topi[idx][i].item()
                print('(%.2f) %s' % (value, all_categories[category_index]))
                predictions.append([value, all_categories[category_index]])

In [15]:
predict(['Dovesky', 'Jackson', 'Ngoc', 'Satoshi'], 3)


> Dovesky
(4.51) Russian
(2.52) Czech
(-2.26) English

> Jackson
(5.13) English
(1.81) Scottish
(-0.35) Russian

> Ngoc
(3.02) Vietnamese
(0.97) English
(0.77) Czech

> Satoshi
(12.42) Japanese
(5.93) Arabic
(0.08) Greek
