In [1]:
import glob
import unicodedata
from typing import List, Dict

import string

In [2]:
path = "./data/names/*.txt"
letters = string.ascii_letters + " .,;'"


In [3]:
def unicode_to_ascii(name: str, letters: str) -> str:
    return "".join(
        c for c in unicodedata.normalize("NFKD", name)
        if not unicodedata.combining(c) and c in letters
    )

In [4]:
unicode_to_ascii("Ślusàrski", letters)

'Slusarski'

In [5]:
def get_names(file: str, letters: str) -> List[str]:
    names = open(file, encoding="utf-8").read().strip().split("\n")
    return [unicode_to_ascii(name, letters) for name in names]

In [6]:
files = glob.glob(path)
for file in files:
    print(get_names(file, letters))
    

['Khoury', 'Nahas', 'Daher', 'Gerges', 'Nazari', 'Maalouf', 'Gerges', 'Naifeh', 'Guirguis', 'Baba', 'Sabbagh', 'Attia', 'Tahan', 'Haddad', 'Aswad', 'Najjar', 'Dagher', 'Maloof', 'Isa', 'Asghar', 'Nader', 'Gaber', 'Abboud', 'Maalouf', 'Zogby', 'Srour', 'Bahar', 'Mustafa', 'Hanania', 'Daher', 'Tuma', 'Nahas', 'Saliba', 'Shamoon', 'Handal', 'Baba', 'Amari', 'Bahar', 'Atiyeh', 'Said', 'Khouri', 'Tahan', 'Baba', 'Mustafa', 'Guirguis', 'Sleiman', 'Seif', 'Dagher', 'Bahar', 'Gaber', 'Harb', 'Seif', 'Asker', 'Nader', 'Antar', 'Awad', 'Srour', 'Shadid', 'Hajjar', 'Hanania', 'Kalb', 'Shadid', 'Bazzi', 'Mustafa', 'Masih', 'Ghanem', 'Haddad', 'Isa', 'Antoun', 'Sarraf', 'Sleiman', 'Dagher', 'Najjar', 'Malouf', 'Nahas', 'Naser', 'Saliba', 'Shamon', 'Malouf', 'Kalb', 'Daher', 'Maalouf', 'Wasem', 'Kanaan', 'Naifeh', 'Boutros', 'Moghadam', 'Masih', 'Sleiman', 'Aswad', 'Cham', 'Assaf', 'Quraishi', 'Shalhoub', 'Sabbag', 'Mifsud', 'Gaber', 'Shammas', 'Tannous', 'Sleiman', 'Bazzi', 'Quraishi', 'Rahal', 'Ch

In [7]:
def get_data(path: str, letters: str) -> Dict[str, List[str]]:
    files = glob.glob(path)
    category_name_dict = {}
    for filename in files:
        category = filename.split("\\")[-1].split('.')[0]
        category_name_dict[category] = get_names(filename, letters)
    return category_name_dict

In [8]:
category_name_dict = get_data(path, letters)

In [9]:
category_name_dict

{'Arabic': ['Khoury',
  'Nahas',
  'Daher',
  'Gerges',
  'Nazari',
  'Maalouf',
  'Gerges',
  'Naifeh',
  'Guirguis',
  'Baba',
  'Sabbagh',
  'Attia',
  'Tahan',
  'Haddad',
  'Aswad',
  'Najjar',
  'Dagher',
  'Maloof',
  'Isa',
  'Asghar',
  'Nader',
  'Gaber',
  'Abboud',
  'Maalouf',
  'Zogby',
  'Srour',
  'Bahar',
  'Mustafa',
  'Hanania',
  'Daher',
  'Tuma',
  'Nahas',
  'Saliba',
  'Shamoon',
  'Handal',
  'Baba',
  'Amari',
  'Bahar',
  'Atiyeh',
  'Said',
  'Khouri',
  'Tahan',
  'Baba',
  'Mustafa',
  'Guirguis',
  'Sleiman',
  'Seif',
  'Dagher',
  'Bahar',
  'Gaber',
  'Harb',
  'Seif',
  'Asker',
  'Nader',
  'Antar',
  'Awad',
  'Srour',
  'Shadid',
  'Hajjar',
  'Hanania',
  'Kalb',
  'Shadid',
  'Bazzi',
  'Mustafa',
  'Masih',
  'Ghanem',
  'Haddad',
  'Isa',
  'Antoun',
  'Sarraf',
  'Sleiman',
  'Dagher',
  'Najjar',
  'Malouf',
  'Nahas',
  'Naser',
  'Saliba',
  'Shamon',
  'Malouf',
  'Kalb',
  'Daher',
  'Maalouf',
  'Wasem',
  'Kanaan',
  'Naifeh',
  'Boutro

In [10]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.i2h = nn.Linear(input_dim + hidden_dim, hidden_dim)
        self.i2o = nn.Linear(input_dim + hidden_dim, output_dim)
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_dim)

    def forward(self, x: Tensor, h: Tensor = None) -> Tensor:
        combined = torch.cat((x, h), 1)
        print(combined.shape)
        hidden = F.tanh(self.i2h(combined))
        print(hidden.shape)
        output = F.tanh(self.i2o(combined))
        print(output.shape)
        return output, hidden

In [11]:
categories = list(category_name_dict.keys())
categories

['Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese']

In [12]:
n_letters = len(letters)
n_hidden = 128
n_categories = len(categories)
rnn = RNN(n_letters, n_hidden, n_categories)

In [13]:
def name_to_tensor(name: str) -> torch.Tensor:
    name_tensor = torch.zeros(len(name), 1, n_letters)
    for i, char in enumerate(name):
        name_tensor[i, 0, letters.index(char)] = 1
    return name_tensor

In [14]:
name_to_tensor("Norden")

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0

In [15]:
class T_RNN(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_dim)

    def forward(self, x: torch.Tensor, h: torch.Tensor = None) -> torch.Tensor:
        out, h = self.rnn(x, h)
        out = self.fc(out)
        return out, h

In [16]:
trnn = T_RNN(input_dim=n_letters, hidden_dim=128, output_dim=n_categories)

In [17]:
input = name_to_tensor("Norden")

In [18]:
type(input.shape[0])

int

In [19]:
h = trnn.init_hidden()
x, h = trnn(input[0], h)
print(x), print(x.shape)

tensor([[ 0.0929,  0.1247, -0.0520,  0.0104, -0.0618,  0.1199,  0.1116,  0.0886,
          0.0678, -0.0645, -0.0491, -0.1265,  0.0603, -0.0222, -0.1289, -0.0534,
         -0.0654, -0.0208]], grad_fn=<AddmmBackward0>)
torch.Size([1, 18])


(None, None)

In [20]:
h0 = rnn.init_hidden()
x, h1 = rnn(input[0], h0)
print(x), print(x.shape)

torch.Size([1, 185])
torch.Size([1, 128])
torch.Size([1, 18])
tensor([[ 0.0411, -0.0373,  0.0453, -0.0182, -0.0050,  0.0509, -0.0225,  0.1175,
         -0.1018,  0.0872,  0.1050, -0.0550,  0.0765, -0.1022,  0.0408,  0.0416,
         -0.0127, -0.1063]], grad_fn=<TanhBackward0>)
torch.Size([1, 18])


(None, None)

In [21]:
torch.argmax(x, dim=1).shape

torch.Size([1])

In [22]:
# def category_from_output(output): to 

def output_category(output):
    index = torch.argmax(output, dim=1)
    return categories[index], index.item()

In [23]:
n, a = output_category(F.softmax(x, dim=1))
print(n), print(a)

Greek
7


(None, None)

In [24]:
import random

category = random.randint(0, len(categories) -1)

In [25]:
category_name_dict

{'Arabic': ['Khoury',
  'Nahas',
  'Daher',
  'Gerges',
  'Nazari',
  'Maalouf',
  'Gerges',
  'Naifeh',
  'Guirguis',
  'Baba',
  'Sabbagh',
  'Attia',
  'Tahan',
  'Haddad',
  'Aswad',
  'Najjar',
  'Dagher',
  'Maloof',
  'Isa',
  'Asghar',
  'Nader',
  'Gaber',
  'Abboud',
  'Maalouf',
  'Zogby',
  'Srour',
  'Bahar',
  'Mustafa',
  'Hanania',
  'Daher',
  'Tuma',
  'Nahas',
  'Saliba',
  'Shamoon',
  'Handal',
  'Baba',
  'Amari',
  'Bahar',
  'Atiyeh',
  'Said',
  'Khouri',
  'Tahan',
  'Baba',
  'Mustafa',
  'Guirguis',
  'Sleiman',
  'Seif',
  'Dagher',
  'Bahar',
  'Gaber',
  'Harb',
  'Seif',
  'Asker',
  'Nader',
  'Antar',
  'Awad',
  'Srour',
  'Shadid',
  'Hajjar',
  'Hanania',
  'Kalb',
  'Shadid',
  'Bazzi',
  'Mustafa',
  'Masih',
  'Ghanem',
  'Haddad',
  'Isa',
  'Antoun',
  'Sarraf',
  'Sleiman',
  'Dagher',
  'Najjar',
  'Malouf',
  'Nahas',
  'Naser',
  'Saliba',
  'Shamon',
  'Malouf',
  'Kalb',
  'Daher',
  'Maalouf',
  'Wasem',
  'Kanaan',
  'Naifeh',
  'Boutro

In [26]:
def random_choice(cat_list: list) -> str:
    return cat_list[random.randint(0, len(cat_list)-1)]

In [27]:
def random_sample():
    category = random_choice(categories)
    name = random_choice(category_name_dict[category])
    
    category_tensor = torch.tensor([categories.index(category)], dtype=torch.long)
    name_tensor = name_to_tensor(name)

    return category, name, category_tensor, name_tensor


In [28]:
for i in range(10):
    category, name, category_tensor, name_tensor = random_sample()
    print(f"Category: {category} \t | \t Name: {name}")

Category: Korean 	 | 	 Name: Rim
Category: Polish 	 | 	 Name: Sokolofsky
Category: Dutch 	 | 	 Name: Vennen
Category: Portuguese 	 | 	 Name: Araullo
Category: Irish 	 | 	 Name: O'Connell
Category: Portuguese 	 | 	 Name: Matos
Category: English 	 | 	 Name: Fereday
Category: Japanese 	 | 	 Name: Raikatuji
Category: Polish 	 | 	 Name: Wiater
Category: Italian 	 | 	 Name: Basurto


In [29]:
n_letters, n_hidden, n_categories

(57, 128, 18)