In [1]:
import torch
import string
import random
import unicodedata
import numpy as np

from glob import glob
from torch import nn
from torch import optim
from torchtext import data

In [2]:
# Run configuration.
# data_dir      : folder holding one "<Category>.txt" file of names per language.
# learning_rate : step size handed to the optimizer.
# epochs        : number of training steps; the loop below draws a single random
#                 example per step, so this is not a count of full dataset passes.
data_dir = "/home/pervinco/Datasets/name_classification/names"
learning_rate = 5e-3
epochs = 500

In [3]:
# Character vocabulary: a-z, A-Z, then space and the punctuation . , ; '
# The position of a character in this string is its one-hot index.
all_letters = "".join((string.ascii_letters, " .,;'"))
print(f"num of letters {len(all_letters)}")
print(all_letters)

num of letters 57
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'


In [4]:
# Convert a Unicode string to plain ASCII, https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with accents stripped and unsupported characters removed.

    The string is NFD-decomposed so that accents become separate combining
    marks (Unicode category 'Mn'); those marks are dropped, and of the
    remaining characters only the ones present in `all_letters` are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from decomposition (e.g. an accent).
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

In [5]:
# Build the dataset:
#   category_lines : category name -> list of ASCII-normalized names
#   all_categories : category names; a category's position is its class index
category_lines, all_categories = {}, []
# glob() returns paths in arbitrary order; sorting keeps every category's class
# index identical across runs and machines.
files = sorted(glob(f"{data_dir}/*.txt"))
print(len(files))

for file in files:
    # "<data_dir>/English.txt" -> "English"
    category = file.split('/')[-1].split('.')[0]
    all_categories.append(category)

    # Decode explicitly as UTF-8: the names are expected to contain non-ASCII
    # characters (hence unicodeToAscii), and the locale default may differ.
    with open(file, 'r', encoding='utf-8') as f:
        names = [unicodeToAscii(raw.strip()) for raw in f]

    # Drop blank entries (empty lines, or names with no supported characters);
    # an empty name would encode to a zero-length tensor the model cannot score.
    category_lines[category] = [name for name in names if name]

print(f"num categories : {len(all_categories)}")

18
num categories : 18


In [6]:
class RNN(nn.Module):
    """Minimal recurrent cell that processes one time step per call.

    The current input and the previous hidden state are concatenated and fed
    through two independent linear layers: one yields the next hidden state,
    the other yields the per-class scores. No softmax or other non-linearity
    is applied, so the returned scores are raw logits.

    Args:
        input_dim: size of one input vector (57 in this notebook).
        hidden_dim: size of the hidden state (128 in this notebook).
        output_dim: number of classes (18 in this notebook).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        self.hidden_dim = hidden_dim

        # Both layers consume the [input, hidden] concatenation.
        combined_dim = input_dim + hidden_dim
        self.input_to_hidden = nn.Linear(combined_dim, hidden_dim)
        self.input_to_output = nn.Linear(combined_dim, output_dim)

    def forward(self, input, hidden):
        """Run a single step.

        Args:
            input: tensor concatenated with `hidden` along dim 1.
            hidden: previous hidden state.

        Returns:
            Tuple `(output, hidden)`: class scores for this step and the next
            hidden state.
        """
        combined = torch.cat([input, hidden], dim=1)
        next_hidden = self.input_to_hidden(combined)
        scores = self.input_to_output(combined)

        return scores, next_hidden

In [7]:
# Model dimensions: one input slot per vocabulary character, one output score
# per category, and a fixed-size hidden state.
hidden_dim = 128
input_dim = len(all_letters)  # 57 in the recorded run
output_dim = len(all_categories)  # 18 in the recorded run

model = RNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [8]:
def line_to_tensor(line):
    """One-hot encode a name, character by character.

    Args:
        line: the string to encode.

    Returns:
        A float tensor of shape (len(line), 1, len(all_letters)). Row `i` has
        a 1 at the index of `line[i]` within `all_letters`. A character that
        is not in `all_letters` leaves its row all zeros.
    """
    tensor = torch.zeros(len(line), 1, len(all_letters))
    for idx, letter in enumerate(line):
        letter_idx = all_letters.find(letter)
        # str.find returns -1 for a missing character; used as an index that
        # would silently mark the last vocabulary slot, so skip it instead.
        if letter_idx >= 0:
            tensor[idx][0][letter_idx] = 1

    return tensor

# Sanity check: encode one sample name and inspect the resulting tensor shape.
test_line = category_lines['English'][7] ## "Abrams" in the recorded run
print(test_line)

test_result = line_to_tensor(test_line) ## stack of one-hot rows, (6, 1, 57) in the recorded run
print(test_result.shape)

Abrams
torch.Size([6, 1, 57])


In [9]:
def randomChoice(l):
    """Return one element of the sequence `l`, picked uniformly at random."""
    position = random.randrange(len(l))
    return l[position]


def randomTrainingExample():
    """Draw one random (category, name) pair together with its tensors.

    A category is chosen first, then a name from that category.

    Returns:
        Tuple `(category, line, category_tensor, line_tensor)`:
        the category name, the name string, a 1-element long tensor holding
        the category's index in `all_categories`, and the output of
        `line_to_tensor(line)`.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])

    label = all_categories.index(category)
    category_tensor = torch.tensor([label], dtype=torch.long)

    return category, line, category_tensor, line_to_tensor(line)

In [10]:
# Training: each step draws one random name, feeds it through the recurrent
# cell character by character, and scores the final output against the
# true category.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # takes the model's raw scores directly
h0 = torch.zeros(1, hidden_dim)  # initial hidden state for every sequence

for epoch in range(epochs):
    category, line, category_tensor, line_tensor = randomTrainingExample()

    # A zero-length name yields no time steps, so there would be no output
    # from this step to score; skip it.
    if line_tensor.size(0) == 0:
        continue

    optimizer.zero_grad()

    # Start from h0 and carry the hidden state from one character to the next.
    # Passing h0 at every step would discard all earlier characters, leaving
    # the prediction dependent on the last character only.
    hidden = h0
    for i in range(line_tensor.size(0)):
        output, hidden = model(line_tensor[i], hidden)

    # Only the output after the final character is used for the loss.
    loss = criterion(output, category_tensor)
    loss.backward()
    optimizer.step()

    # Convert the 0-dim argmax tensor to a plain int before list indexing.
    pred_idx = output.argmax().item()
    pred_label = all_categories[pred_idx]

    correct = 'O' if pred_label == category else 'X'
    print(f"[{epoch}/{epochs}] ({correct}) pred : {pred_label}, gt : {category}, loss : {loss.item():.4f}")

[0/500] (X) pred : Italian, gt : Vietnamese, loss : 2.8103
[1/500] (X) pred : Vietnamese, gt : Greek, loss : 2.8860
[2/500] (X) pred : Scottish, gt : Portuguese, loss : 2.9888
[3/500] (X) pred : German, gt : Spanish, loss : 2.8725
[4/500] (X) pred : German, gt : English, loss : 2.9320
[5/500] (X) pred : German, gt : Scottish, loss : 2.8559
[6/500] (X) pred : German, gt : Korean, loss : 2.8122
[7/500] (O) pred : German, gt : German, loss : 2.7539
[8/500] (X) pred : Vietnamese, gt : Chinese, loss : 2.8933
[9/500] (X) pred : Vietnamese, gt : Greek, loss : 2.8999
[10/500] (X) pred : Vietnamese, gt : Scottish, loss : 2.8167
[11/500] (X) pred : Vietnamese, gt : French, loss : 2.9550
[12/500] (X) pred : Vietnamese, gt : Czech, loss : 2.9237
[13/500] (O) pred : German, gt : German, loss : 2.7152
[14/500] (X) pred : Vietnamese, gt : Spanish, loss : 2.8932
[15/500] (X) pred : Vietnamese, gt : Scottish, loss : 2.7755
[16/500] (O) pred : Scottish, gt : Scottish, loss : 2.7404
[17/500] (O) pred : V