In [5]:
# utils
import string
import numpy as np
import os
import glob
import unicodedata

# data and model
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# training
import tqdm
import torch.utils.tensorboard as tensorboard

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
# device

### Data Preprocessing

In [9]:
all_letters  = string.ascii_letters + ".I-';,`!@#$%^&*"
n_letters = len(all_letters)

In [10]:
def get_files(path):
    return glob.glob(path)
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [11]:
# names in different languges
files = get_files('../data/names/*.txt')

# read the names
data = {}
for file in files:
    data[file.split("/")[-1].split(".")[0]] = list(map(unicodeToAscii, open(file).read().splitlines()))

# language
classes = list(data.keys())

In [12]:
# create cls2idx and idx2cls dictionary 
cls2idx = dict(zip(classes, np.linspace(start=0, stop=len(classes)-1, num=len(classes), dtype=int)))
idx2cls = dict(zip(cls2idx.values(), cls2idx.keys()))

In [13]:
def char2Idx(char):
    return all_letters.find(char)

def char2Tensor(char):
    tensor = torch.zeros((1, n_letters))
    tensor[0][char2Idx(char)] = 1
    return tensor

def name2Tensor(name):
    tensor = torch.zeros((len(name), 1, n_letters))
    for cidx, char in enumerate(name):
        tensor[cidx][0][char2Idx(char)] = 1
    return tensor


### Dataset

In [14]:
BATCH_SIZE = 128

In [15]:
inputs = []
labels = []
for cls in classes:
    i = data[cls]
    l = [cls2idx[cls]]*len(i)
    inputs = inputs + i
    labels = labels + l

In [16]:
class CustomDataset(Dataset):
    
    def __init__(self, inputs, labels):
        self.x = inputs
        self.y = labels
        self.len = len(self.x)
        
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
    def __len__(self):
        return self.len
        

In [17]:
dataset = CustomDataset(inputs, labels)

In [18]:
data_loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

In [35]:
# len(data_loader)

### Model

In [37]:
class RNN(nn.Module):
    
    def __init__(self, n_letters, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(in_features=n_letters+hidden_size, out_features=hidden_size)
        self.i2o = nn.Linear(in_features=n_letters+hidden_size, out_features=output_size)
        self.activation = nn.ReLU()

        
    def forward(self, input, hidden):
        # input.size() -> len(name), 1, n_letters
        combined = torch.cat((input, hidden), dim=1)
       
        hidden = self.activation(self.i2h(combined))
        outputs = self.activation(self.i2o(combined))
        return hidden, outputs
    
    
    def init_hidden(self):
        return torch.zeros((1, self.hidden_size))

In [39]:
model = RNN(
    n_letters=n_letters,
    hidden_size=64,
    output_size=len(classes)
)

In [57]:
def fit(model, input):
    hidden = model.init_hidden()
    for i in range(input.size(0)):
        hidden, outputs = model(input[i], hidden)
    return nn.Softmax()(outputs)  

def accuracy(y, y_):
    correct = 0
    for i in range(y.size(0)):
        if y[i]==y_[i]:
            correct += 1
    return correct/y.size(0)

In [67]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

In [68]:
epochs = 20
epoch_progress = tqdm.tqdm(total=epochs, desc="Epoch", position=0)
steps = 0

for epoch in range(epochs):
    
    for x, y in data_loader:
        outputs = torch.empty((len(x), len(classes)))
        for i, name in enumerate(x):
            outputs[i] = fit(model, name2Tensor(name))
        
        loss = criterion(outputs, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if steps % 100==0:
            print(f'Epoch {epoch} | Steps {steps} | Loss {loss.item():.4f}')
        steps += 1
    
    epoch_progress.update(1)
    

  """


Epoch 0 | Steps 0 | Loss 2.7250
Epoch 0 | Steps 100 | Loss 2.4845


Epoch:   5%|▌         | 1/20 [00:27<08:33, 27.04s/it]

Epoch 1 | Steps 200 | Loss 2.5588
Epoch 1 | Steps 300 | Loss 2.4857


Epoch:  10%|█         | 2/20 [00:54<08:09, 27.18s/it]

Epoch 2 | Steps 400 | Loss 2.6403


Epoch:  15%|█▌        | 3/20 [01:24<07:53, 27.87s/it]

Epoch 3 | Steps 500 | Loss 2.7297
Epoch 3 | Steps 600 | Loss 2.4694


Epoch:  20%|██        | 4/20 [01:53<07:34, 28.41s/it]

Epoch 4 | Steps 700 | Loss 2.3178


Epoch:  25%|██▌       | 5/20 [02:20<07:00, 28.06s/it]

Epoch 5 | Steps 800 | Loss 2.1893
Epoch 5 | Steps 900 | Loss 2.2865


Epoch:  30%|███       | 6/20 [02:45<06:19, 27.09s/it]

Epoch 6 | Steps 1000 | Loss 2.2263


Epoch:  35%|███▌      | 7/20 [03:09<05:40, 26.19s/it]

Epoch 7 | Steps 1100 | Loss 2.2564
Epoch 7 | Steps 1200 | Loss 2.3309


Epoch:  40%|████      | 8/20 [03:33<05:04, 25.39s/it]

Epoch 8 | Steps 1300 | Loss 2.2182
Epoch 8 | Steps 1400 | Loss 2.2685


Epoch:  45%|████▌     | 9/20 [03:57<04:35, 25.07s/it]

Epoch 9 | Steps 1500 | Loss 2.2113


Epoch:  50%|█████     | 10/20 [04:23<04:13, 25.36s/it]

Epoch 10 | Steps 1600 | Loss 2.3060
Epoch 10 | Steps 1700 | Loss 2.2424


Epoch:  55%|█████▌    | 11/20 [04:49<03:50, 25.60s/it]

Epoch 11 | Steps 1800 | Loss 2.2691


Epoch:  60%|██████    | 12/20 [05:16<03:27, 25.88s/it]

Epoch 12 | Steps 1900 | Loss 2.2412
Epoch 12 | Steps 2000 | Loss 2.2954


Epoch:  65%|██████▌   | 13/20 [05:40<02:57, 25.33s/it]

Epoch 13 | Steps 2100 | Loss 2.2227


Epoch:  70%|███████   | 14/20 [06:06<02:33, 25.56s/it]

Epoch 14 | Steps 2200 | Loss 2.2734
Epoch 14 | Steps 2300 | Loss 2.2825


Epoch:  75%|███████▌  | 15/20 [06:32<02:08, 25.63s/it]

Epoch 15 | Steps 2400 | Loss 2.2720
Epoch 15 | Steps 2500 | Loss 2.2853


Epoch:  80%|████████  | 16/20 [06:57<01:41, 25.49s/it]

Epoch 16 | Steps 2600 | Loss 2.2383


Epoch:  85%|████████▌ | 17/20 [07:21<01:15, 25.02s/it]

Epoch 17 | Steps 2700 | Loss 2.2483
Epoch 17 | Steps 2800 | Loss 2.2147


Epoch:  90%|█████████ | 18/20 [07:45<00:49, 24.59s/it]

Epoch 18 | Steps 2900 | Loss 2.1883


Epoch:  95%|█████████▌| 19/20 [08:08<00:24, 24.31s/it]

Epoch 19 | Steps 3000 | Loss 2.2287
Epoch 19 | Steps 3100 | Loss 2.2353


Epoch: 100%|██████████| 20/20 [08:32<00:00, 24.11s/it]

In [75]:
idx2cls[y[i].item()]

'Russian'

In [76]:
name

'Balarev'

In [78]:
out = fit(model, name2Tensor(name))

  """


In [79]:
torch.argmax(out, 1)

tensor([7])