In [1]:
from os import path, listdir
import glob
DATA_PATH = 'data/fb_test/names'

In [2]:
import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

In [3]:
category_lines = {}
all_categories = []

for filename in listdir(DATA_PATH):
    full_path = path.join(DATA_PATH, filename)
    category = path.splitext(path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(full_path)
    category_lines[category] = lines

n_categories = len(all_categories)

In [4]:
import numpy as np

def letterToIndex(letter):
    return all_letters.find(letter) + 1

def lineToTensor(line):
    tensor = np.zeros((len(line)))
    for li, letter in enumerate(line):
        tensor[li] = letterToIndex(letter)
    return tensor

def batchify(lines):
    seq_lens = [len(line) for line in lines]
    max_len = max(seq_lens)
    tensor = np.zeros((len(lines), max_len))
    for ix, line in enumerate(lines):
        line_tensor = lineToTensor(line)
        tensor[ix,:len(line_tensor)] = line_tensor
    return tensor, seq_lens

In [5]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [6]:
X_train = []
y_train = []

for category in category_lines:
    X_train.extend(category_lines[category])
    y_train.extend([all_categories.index(category)] * len(category_lines[category]))

import random
shuffle_idx = list(range(len(X_train)))
random.shuffle(shuffle_idx)
X_train = [X_train[idx] for idx in shuffle_idx]
y_train = [y_train[idx] for idx in shuffle_idx]

Some training conditions

In [7]:
batch_size = 64

n_iters = 100
print_every = 5
plot_every = 1

# RNN Text classifier, Pytorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

print(torch.__version__)

In [None]:
class TorchRNNClassifier(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(TorchRNNClassifier, self).__init__()
        
        self.embedding = nn.Embedding(n_letters + 1, hidden_size)
        self.dropout = nn.Dropout(.2)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, seq_lens):
        emb = self.embedding(input)
        emb = self.dropout(emb)
        emb = pack_padded_sequence(emb, seq_lens, batch_first=True, enforce_sorted=False)
        output = self.rnn(emb)[0]
        output = pad_packed_sequence(output, batch_first=True)[0]
        output = torch.max(output, 1)[0].squeeze(1)
        output = self.linear(output)
        
        return output

In [None]:
model_pytorch = TorchRNNClassifier(64, n_categories)

In [None]:
# batch, seq_lens = batchify(['Ngoc', 'James'])
# batch = torch.LongTensor(batch)
# print(model_pytorch(batch, seq_lens).size())
# print(model_pytorch(batch, seq_lens))

In [None]:
criterion = nn.CrossEntropyLoss()

def train_pytorch(model, optimizer, lines, categories):
    model.zero_grad()

    X_tensor, seq_lens = batchify(lines)
    X_tensor = torch.LongTensor(X_tensor)
    y_tensor = torch.LongTensor(categories)
    
    output = model(X_tensor, seq_lens)

    loss = criterion(output, y_tensor)
    loss.backward()
    
    optimizer.step()
    
    return output, loss.item()

In [None]:
optimizer_pytorch = optim.Adam(model_pytorch.parameters())

# Keep track of losses for plotting
current_loss = 0
all_losses = []

start = time.time()

for iter in range(1, n_iters + 1):
    correct = 0
    iter_loss = 0
    
    for start_idx in range(0, len(X_train), batch_size):
        batch_len = min(len(X_train) - start_idx, batch_size)
        train_lines = X_train[start_idx:start_idx + batch_len]
        train_categories = y_train[start_idx:start_idx + batch_len]
        
        model_pytorch.train()
        output, loss = train_pytorch(model_pytorch, optimizer_pytorch, train_lines, train_categories)
        current_loss += loss
        iter_loss += loss
        
        batch_correct = output.topk(1, dim=1)[1].squeeze(1) == torch.LongTensor(train_categories)
        correct += batch_correct.float().sum()

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        accuracy = correct / len(X_train)
        print('%d %d%% (%s) %.4f - accuracy: %.2f' % (iter, iter / n_iters * 100, timeSince(start), iter_loss, accuracy))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0
        
model_pytorch.eval()

# RNN Text classifier, Tensorflow

In [10]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

tf.enable_eager_execution()
tf.__version__

'1.12.0'

In [11]:
class TFRNNClassifier(tf.keras.Model):
    def __init__(self, hidden_size, output_size):
        super(TFRNNClassifier, self).__init__()
        
        self.embedding = layers.Embedding(n_letters + 1, hidden_size)
        self.dropout = layers.Dropout(.2)
        self.rnn = layers.GRU(hidden_size, return_sequences=True)
        self.linear = layers.Dense(output_size)
        
    def __call__(self, input, seq_lens):
        mask = tf.sequence_mask(seq_lens)
        emb = self.embedding(input)
        emb = self.dropout(emb)
        output = self.rnn(emb)
        output = tf.reduce_max(output, axis=1)
        output = self.linear(output)
        
        return output

In [12]:
model_tf = TFRNNClassifier(64, n_categories)
# batch, seq_lens = batchify(['Ngoc', 'James'])
# batch = tf.convert_to_tensor(batch, np.int32)
# print(tf_model(batch, seq_lens))

In [13]:
def train_tf(model, optimizer, lines, categories):
    X_tensor, seq_lens = batchify(lines)
    X_tensor = tf.convert_to_tensor(X_tensor, np.int32)
    y_tensor = to_categorical(categories, num_classes=n_categories)
    
    with tf.GradientTape() as tape:
        output = model(X_tensor, seq_lens)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=y_tensor))
        
        variables = model.variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
    
    return output, loss

In [14]:
optimizer_tf = tf.train.AdamOptimizer()

# Keep track of losses for plotting
current_loss = 0
all_losses = []

start = time.time()

for iter in range(1, n_iters + 1):
    correct = 0
    iter_loss = 0
    
    for start_idx in range(0, len(X_train), batch_size):
        batch_len = min(len(X_train) - start_idx, batch_size)
        train_lines = X_train[start_idx:start_idx + batch_len]
        train_categories = y_train[start_idx:start_idx + batch_len]
        
        output, loss = train_tf(model_tf, optimizer_tf, train_lines, train_categories)
        current_loss += loss
        iter_loss += loss
        
        batch_correct = tf.equal(
            tf.squeeze(tf.math.top_k(output, k=1)[1], axis=1),
            tf.convert_to_tensor(train_categories, np.int32)
        )
        correct += tf.reduce_sum(tf.to_float(batch_correct))

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        accuracy = correct / len(X_train)
        print('%d %d%% (%s) %.4f - accuracy: %.2f' % (iter, iter / n_iters * 100, timeSince(start), iter_loss, accuracy))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

5 5% (4m 27s) 285.6721 - accuracy: 0.73
10 10% (9m 25s) 205.2093 - accuracy: 0.80
15 15% (14m 29s) 169.9141 - accuracy: 0.83
20 20% (19m 16s) 147.9848 - accuracy: 0.85
25 25% (24m 10s) 131.5602 - accuracy: 0.87
30 30% (28m 53s) 117.9902 - accuracy: 0.88
35 35% (33m 24s) 106.8181 - accuracy: 0.89
40 40% (38m 16s) 96.6020 - accuracy: 0.90
45 45% (42m 52s) 87.6815 - accuracy: 0.91
50 50% (47m 15s) 79.1372 - accuracy: 0.92
55 55% (51m 36s) 71.9096 - accuracy: 0.93
60 60% (55m 57s) 67.9141 - accuracy: 0.93
65 65% (60m 31s) 62.9381 - accuracy: 0.94
70 70% (65m 2s) 57.4143 - accuracy: 0.94
75 75% (69m 46s) 53.6563 - accuracy: 0.95
80 80% (74m 54s) 47.8101 - accuracy: 0.95
85 85% (79m 15s) 43.9958 - accuracy: 0.96
90 90% (83m 37s) 41.5539 - accuracy: 0.96
95 95% (88m 0s) 40.5338 - accuracy: 0.96
100 100% (92m 21s) 38.1611 - accuracy: 0.96
