In [80]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

In [81]:
def findFiles(path): return glob.glob(path)

In [82]:
print(findFiles("../../dat/pyt/names/*.txt"))

['../../dat/pyt/names/Czech.txt', '../../dat/pyt/names/German.txt', '../../dat/pyt/names/Arabic.txt', '../../dat/pyt/names/Japanese.txt', '../../dat/pyt/names/Chinese.txt', '../../dat/pyt/names/Vietnamese.txt', '../../dat/pyt/names/Russian.txt', '../../dat/pyt/names/French.txt', '../../dat/pyt/names/Irish.txt', '../../dat/pyt/names/English.txt', '../../dat/pyt/names/Spanish.txt', '../../dat/pyt/names/Greek.txt', '../../dat/pyt/names/Italian.txt', '../../dat/pyt/names/Portuguese.txt', '../../dat/pyt/names/Scottish.txt', '../../dat/pyt/names/Dutch.txt', '../../dat/pyt/names/Korean.txt', '../../dat/pyt/names/Polish.txt']


In [83]:
import unicodedata
import string

In [84]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [85]:
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"

In [86]:
# Turn a Unicode string to plain ASCII.
def unicodeToASCII(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )

In [87]:
s = 'Lê Hồng Phương'

In [88]:
s

'Lê Hồng Phương'

In [89]:
len(s)

14

In [90]:
unicodeToASCII(s)

'Le Hong Phuong'

In [91]:
len(unicodeToASCII(s))

14

In [92]:
u = unicodedata.normalize('NFD', s)

In [93]:
u

'Lê Hồng Phương'

In [94]:
len(u)

19

In [95]:
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToASCII(line) for line in lines]

In [96]:
languages = []
data = {} # a dictionary lang -> samples

In [97]:
for filename in findFiles("../../dat/pyt/names/*.txt"):
    lang = os.path.splitext(os.path.basename(filename))[0]
    languages.append(lang)
    lines = readLines(filename)
    data[lang] = lines

In [98]:
os.path.splitext(os.path.basename("../../dat/pyt/names/Vietnamese.txt"))

('Vietnamese', '.txt')

In [99]:
languages

['Czech',
 'German',
 'Arabic',
 'Japanese',
 'Chinese',
 'Vietnamese',
 'Russian',
 'French',
 'Irish',
 'English',
 'Spanish',
 'Greek',
 'Italian',
 'Portuguese',
 'Scottish',
 'Dutch',
 'Korean',
 'Polish']

In [100]:
data

{'Czech': ['Abl',
  'Adsit',
  'Ajdrna',
  'Alt',
  'Antonowitsch',
  'Antonowitz',
  'Bacon',
  'Ballalatak',
  'Ballaltick',
  'Bartonova',
  'Bastl',
  'Baroch',
  'Benesch',
  'Betlach',
  'Biganska',
  'Bilek',
  'Blahut',
  'Blazek',
  'Blazek',
  'Blazejovsky',
  'Blecha',
  'Bleskan',
  'Blober',
  'Bock',
  'Bohac',
  'Bohunovsky',
  'Bolcar',
  'Borovka',
  'Borovski',
  'Borowski',
  'Borovsky',
  'Brabbery',
  'Brezovjak',
  'Brousil',
  'Bruckner',
  'Buchta',
  'Cablikova',
  'Camfrlova',
  'Cap',
  'Cerda',
  'Cermak',
  'Chermak',
  'Cermak',
  'Cernochova',
  'Cernohous',
  'Cerny',
  'Cerney',
  'Cerny',
  'Cerv',
  'Cervenka',
  'Chalupka',
  'Charlott',
  'Chemlik',
  'Chicken',
  'Chilar',
  'Chromy',
  'Cihak',
  'Clineburg',
  'Klineberg',
  'Cober',
  'Colling',
  'Cvacek',
  'Czabal',
  'Damell',
  'Demall',
  'Dehmel',
  'Dana',
  'Dejmal',
  'Dempko',
  'Demko',
  'Dinko',
  'Divoky',
  'Dolejsi',
  'Dolezal',
  'Doljs',
  'Dopita',
  'Drassal',
  'Driml',
  

In [101]:
data['Italian'][:5]

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

In [102]:
data['Vietnamese'][:5]

['Nguyen', 'Tron', 'Le', 'Pham', 'Huynh']

In [103]:
import torch

In [104]:
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][all_letters.find(letter)] = 1
    return tensor

In [105]:
v = letterToTensor('c')

In [106]:
v.shape

torch.Size([1, 57])

In [107]:
print(v)

tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])


In [108]:
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for i, letter in enumerate(line):
        tensor[i][0][all_letters.find(letter)] = 1
    return tensor

In [109]:
t = lineToTensor('Phuong')

In [110]:
print(t)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0

In [111]:
t.shape

torch.Size([6, 1, 57])

In [112]:
t.size(2)

57

In [154]:
def languageToTensor(language):
    tensor = torch.tensor([languages.index(language)], dtype=torch.long)
    return tensor

In [149]:
languageToTensor('Vietnamese')

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [113]:
import torch.nn as nn

In [114]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size) -> None:
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [228]:
model = RNN(n_letters, 128, len(languages))

In [229]:
input = letterToTensor('A')
hidden = torch.zeros(1, 128)
output, next_hidden = model(input, hidden)

In [230]:
print(output)

tensor([[-2.9171, -2.8358, -2.9246, -2.9087, -2.9140, -2.8985, -2.9666, -2.8750,
         -2.8396, -2.7980, -2.8937, -2.8301, -2.8188, -2.8729, -2.9527, -2.9389,
         -2.8972, -2.9668]], grad_fn=<LogSoftmaxBackward0>)


In [118]:
output.shape

torch.Size([1, 18])

In [231]:
print(next_hidden)

tensor([[ 0.0405,  0.0475, -0.0137,  0.0643,  0.1068,  0.0016, -0.0011,  0.0037,
         -0.0261, -0.0823,  0.0109, -0.0087, -0.0958,  0.0746,  0.0302,  0.0681,
          0.1069,  0.0656,  0.0325, -0.1056,  0.0325, -0.0817, -0.0333,  0.0210,
         -0.0195,  0.0531, -0.0723, -0.0593, -0.0219,  0.0164,  0.0022,  0.0189,
          0.1422, -0.0892,  0.0132, -0.0076, -0.0549, -0.0346,  0.0550,  0.0364,
          0.0485,  0.0620, -0.0495,  0.0192, -0.0543, -0.0535,  0.0008, -0.0410,
         -0.0306, -0.0579, -0.1368, -0.0675,  0.1024,  0.0064,  0.0442, -0.0359,
          0.0808,  0.1159,  0.1261, -0.0501, -0.0581, -0.1128, -0.0424,  0.0527,
          0.0595, -0.0670,  0.0312, -0.0189, -0.0325,  0.0829,  0.0081, -0.0068,
          0.0929,  0.0673,  0.0068, -0.0827, -0.0519, -0.0185,  0.0367, -0.0095,
          0.1074,  0.0731, -0.0252,  0.0648,  0.0328,  0.0591, -0.0743,  0.0021,
         -0.0195, -0.0807, -0.1177,  0.0971, -0.0224, -0.0930, -0.0487, -0.0355,
         -0.0477, -0.0024, -

In [120]:
next_hidden.shape

torch.Size([1, 128])

In [259]:
input = lineToTensor('Phuong')
hidden = torch.zeros(1, 128)
output, hidden = model(input[0], hidden)

In [233]:
print(output)

tensor([[-2.8419, -2.8491, -2.9617, -2.9009, -2.9494, -2.8817, -2.9186, -2.8863,
         -2.8935, -2.8604, -2.9641, -2.8055, -2.8349, -2.8515, -2.8934, -2.8988,
         -2.8493, -3.0093]], grad_fn=<LogSoftmaxBackward0>)


In [234]:
output.topk(1)

torch.return_types.topk(
values=tensor([[-2.8055]], grad_fn=<TopkBackward0>),
indices=tensor([[11]]))

In [235]:
output.topk(1)[0].item()

-2.8055264949798584

In [260]:
print(hidden)

tensor([[-0.3868,  0.1225, -0.0246, -0.1363, -0.2701, -0.2258, -0.0408,  0.1927,
         -0.1347,  0.0782,  0.2027,  0.1407, -0.1182,  0.5260,  0.2913,  0.2850,
          0.0998,  0.0575,  0.4000,  0.3232,  0.0612,  0.2025,  0.1150, -0.1345,
         -0.0162,  0.1548,  0.0379, -0.2861, -0.1548, -0.0118,  0.2870, -0.0629,
         -0.1209, -0.1438, -0.0450,  0.0398, -0.2042,  0.1619,  0.0425,  0.1069,
          0.1575, -0.0797, -0.3800, -0.1232, -0.0110,  0.0501, -0.0970, -0.0498,
          0.1175, -0.4884,  0.1898,  0.4579,  0.2908,  0.3213,  0.2665,  0.0349,
          0.2065, -0.0664, -0.1896, -0.0888,  0.0839,  0.0493, -0.0077,  0.4190,
          0.1654,  0.4284, -0.2296, -0.0124, -0.0829,  0.3440,  0.0530, -0.0659,
         -0.2350,  0.0793, -0.4259,  0.0669, -0.0639, -0.1167,  0.3456,  0.3432,
         -0.1971, -0.1892,  0.2965,  0.0647, -0.0034, -0.1279, -0.1134, -0.0154,
          0.2851, -0.1598, -0.0233,  0.3794, -0.2541, -0.0390, -0.1028, -0.3347,
          0.0524, -0.1027, -

In [236]:
optimizer = torch.optim.Adam(model.parameters())

In [237]:
criterion = nn.NLLLoss()

In [204]:
samples = []
for language in data.keys():
    for line in data[language]:
        samples.append((lineToTensor(line), languageToTensor(language)))

In [223]:
len(samples)

20074

In [129]:
import time

In [263]:
def train(samples, model, criterion):
    model.train()
    start_time = time.time()
    total_loss = 0
    for i, (x, y) in enumerate(samples):
        hidden = model.initHidden()
        optimizer.zero_grad()
        for t in range(x.size(0)):
            z, hidden = model(x[t], hidden)
        loss = criterion(z, y)
        loss.backward()
        optimizer.step()
        total_loss += loss
        if i % 2000 == 0:
            print(f"total loss at {i:>5d}: {total_loss:>7f}")
    elapsed = time.time() - start_time
    print(f"total loss: {total_loss:>7f}, elapsed: {elapsed:>4f}")


In [187]:
def test(samples, model, criterion):
    N = len(samples)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for (x, y) in samples:
            hidden = model.initHidden()
            for t in range(x.size(0)):
                z, hidden = model(x[t], hidden)
            test_loss += criterion(z, y).item()
            correct += (z.argmax(1) == y).type(torch.float).item()
    test_loss /= N
    correct /= N
    print(f"Accuracy: {(100*correct):>0.2f}%, Avg loss: {test_loss:>8f}")


In [264]:
for _ in range(1):
    train(samples, model, criterion)
    
test(samples, model, criterion)

total loss at     0: 3.245190
total loss at  2000: 1178.892334
total loss at  4000: 1493.779785
total loss at  6000: 2054.600586
total loss at  8000: 2063.560791
total loss at 10000: 2065.094238
total loss at 12000: 2065.460938
total loss at 14000: 2195.788818
total loss at 16000: 3761.565918
total loss at 18000: 3775.253662
total loss at 20000: 8375.337891
total loss: 8384.734375, elapsed: 33.407768
Accuracy: 2.08%, Avg loss: 7.320883


In [227]:
model

RNN(
  (i2h): Linear(in_features=185, out_features=128, bias=True)
  (i2o): Linear(in_features=185, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)

In [244]:
x, y = samples[0]
hidden = model.initHidden()
for t in range(x.size(0)):
    z, hidden = model(x[t], hidden)

In [245]:
z

tensor([[-3.4170, -2.8483, -3.4847, -3.3513, -3.1325, -3.9737, -3.2984, -2.9445,
         -2.4046, -2.3736, -3.5120, -2.7315, -1.9360, -2.6938, -2.5809, -2.7322,
         -3.4644, -3.5344]], grad_fn=<LogSoftmaxBackward0>)

In [246]:
criterion(z, y)

tensor(3.4170, grad_fn=<NllLossBackward0>)

In [194]:
from torch.utils.data import Dataset, DataLoader

In [209]:
# create a custom Dataset object to feed into a data loader
class NameDataSet(Dataset):
    def __init__(self, data) -> None:
        self.samples = []
        for language in data.keys():
            for line in data[language]:
                self.samples.append((lineToTensor(line), languageToTensor(language)))
    def __getitem__(self, index):
        return self.samples[index]
    def __len__(self):
        return len(self.samples)

In [212]:
dataset = NameDataSet(data)
dataloader = DataLoader(dataset, batch_size=16)

In [218]:
len(dataset)

20074

In [216]:
len(dataloader)

1255