In [1]:
import torch 

device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device) 
print(f"using device {torch.get_default_device()}")

using device cuda:0


In [2]:
import string 
import unicodedata 

allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters) 

def unicodeToAscii(s): 
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )

In [3]:
print (f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}")

converting 'Ślusàrski' to Slusarski


In [4]:
def letterToIndex(letter):
    if letter not in allowed_characters: 
        return allowed_characters.find('_')
    else: 
        return allowed_characters.find(letter) 

def lineToTensor(line): 
    tensor = torch.zeros(1, len(line), n_letters)
    for li, letter in enumerate(line):
        tensor[0][li][letterToIndex(letter)] = 1
    return tensor

In [5]:
print (f"The letter 'a' becomes {lineToTensor('a')}") #notice that the first position in the tensor = 1
print (f"The name 'Ahn' becomes {lineToTensor('Ahn')}") #notice 'A' sets the 27th index to 1

The letter 'a' becomes tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]], device='cuda:0')
The name 'Ahn' becomes tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 

In [6]:
from io import open 
import glob 
import os 
import time 

from torch.utils.data import Dataset 

class NamesDataset(Dataset): 

    def __init__(self, data_dir):
        self.data_dir = data_dir 
        self.load_time = time.localtime 
        labels_set = set()

        self.data = [] 
        self.data_tensors = []
        self.labels = []
        self.labels_tensors = []

        text_files = glob.glob(os.path.join(data_dir, '*.txt'))
        for filename in text_files: 
            label = os.path.splitext(os.path.basename(filename))[0]
            labels_set.add(label) 
            lines = open(filename, encoding='utf-8').read().strip().split('\n')
            for name in lines: 
                self.data.append(name)
                self.data_tensors.append(lineToTensor(name))
                self.labels.append(label)

        self.labels_uniq = list(labels_set)
        for label in self.labels:
            temp_tensor = torch.tensor(self.labels_uniq.index(label))
            self.labels_tensors.append(temp_tensor)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_item = self.data[idx]
        label_item = self.labels[idx]
        data_tensor = self.data_tensors[idx]
        label_tensor = self.labels_tensors[idx]
            
        return data_item, label_item, data_tensor, label_tensor

In [7]:
alldata = NamesDataset("data/names/")
print(f"loaded {len(alldata)} items of data")
print(f"example = {alldata[0]}")

loaded 20074 items of data
example = ('Khoury', 'Arabic', tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0.,

In [8]:
train_set, test_set = torch.utils.data.random_split(alldata, [0.85, 0.15], generator=torch.Generator(device=device).manual_seed(2026))

print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

train examples = 17063, validation examples = 3011


In [19]:
import torch.nn as nn
import torch.nn.functional as F

class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(CharRNN, self).__init__()

        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, line_tensor):
        rnn_out, hidden = self.rnn(line_tensor)
        out = self.h2o(hidden[0])
        out = self.softmax(out)
        return out

In [20]:
n_hidden = 128
rnn = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq))
print(rnn)

CharRNN(
  (rnn): RNN(58, 128, batch_first=True)
  (h2o): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [21]:
def label_from_output(output, output_labels):
    top_n, top_i = output.topk(1)
    label_i = top_i[0].item()
    return output_labels[label_i], label_i 

input = lineToTensor('Albert')
output = rnn(input) #this is equivalent to ``output = rnn.forward(input)``
print(output)
print(label_from_output(output, alldata.labels_uniq))

tensor([[-2.9745, -2.7938, -2.7299, -2.8898, -2.9408, -2.9859, -3.0303, -2.8996,
         -2.7709, -2.7795, -2.9623, -2.9458, -3.0190, -2.8303, -2.9887, -2.9269,
         -3.0164, -2.6509]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
('English', 17)


In [22]:
import random 
import numpy as np 

def train(rnn, training_data, epochs = 10, n_batch_size = 64, report_every = 50, learning_rate = 0.2, criterion = nn.NLLLoss()):

    current_loss = 0 
    all_losses = []
    rnn.train()
    optimizer = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

    start = time.time()
    print(f"Training on data set with {len(training_data)} samples")

    for epoch in range(epochs):
        rnn.zero_grad()

        batches = list(range(len(training_data)))
        random.shuffle(batches) 
        batches = np.array_split(batches, len(batches)//n_batch_size)

        for idx, batch in enumerate(batches):
            batch_loss = 0
            for i in batch: #for each example in this batch
                (data_item, label_item, data_tensor, label_tensor) = training_data[i]
                output = rnn.forward(data_tensor)
                loss = criterion(output.squeeze(), label_tensor)
                batch_loss += loss

            # optimize parameters
            batch_loss.backward()
            nn.utils.clip_grad_norm_(rnn.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()

            current_loss += batch_loss.item() / len(batch)

        all_losses.append(current_loss / len(batches) )
        if epoch % report_every == 0:
            print(f"{epoch} ({epoch / epochs:.0%}): \t average batch loss = {all_losses[-1]}")
        current_loss = 0

    return all_losses

In [23]:
start = time.time()
all_losses = train(rnn, train_set, epochs=27, learning_rate=0.15, report_every=5)
end = time.time()
print(f"training took {end-start}s")

Training on data set with 17063 samples
0 (0%): 	 average batch loss = 1.3862715135529387
5 (19%): 	 average batch loss = 0.8394300906850535
10 (37%): 	 average batch loss = 0.6712975169396801
15 (56%): 	 average batch loss = 0.5645289322995878
20 (74%): 	 average batch loss = 0.49029476373974057
25 (93%): 	 average batch loss = 0.43026199103944424
training took 1698.2123827934265s


In [26]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def evaluate(rnn, testing_data, classes):
    confusion = torch.zeros(len(classes), len(classes))

    rnn.eval() #set to eval mode
    with torch.no_grad(): # do not record the gradients during eval phase
        for i in range(len(testing_data)):
            (data_item, label_item, data_tensor, label_tensor) = testing_data[i]
            output = rnn(data_tensor)
            guess, guess_i = label_from_output(output, classes)
            label_i = classes.index(label_item)
            confusion[label_i][guess_i] += 1

    # Normalize by dividing every row by its sum
    for i in range(len(classes)):
        denom = confusion[i].sum()
        if denom > 0:
            confusion[i] = confusion[i] / denom

    # Set up plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(confusion.cpu().numpy()) #numpy uses cpu here so we need to use a cpu version
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticks(np.arange(len(classes)), labels=classes, rotation=90)
    ax.set_yticks(np.arange(len(classes)), labels=classes)

    # Force label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # sphinx_gallery_thumbnail_number = 2
    plt.show()



evaluate(rnn, test_set, classes=alldata.labels_uniq)


: 

In [2]:
import unicodedata

mn_chars = []
for i in range(0x10000):  # Basic Multilingual Plane
    char = chr(i)
    if unicodedata.category(char) == 'Mn':
        mn_chars.append((hex(i), char, unicodedata.name(char, 'UNNAMED')))

# Show first 50
for code, char, name in mn_chars[:50]:
    print(f"{code}: '{char}' - {name}")

0x300: '̀' - COMBINING GRAVE ACCENT
0x301: '́' - COMBINING ACUTE ACCENT
0x302: '̂' - COMBINING CIRCUMFLEX ACCENT
0x303: '̃' - COMBINING TILDE
0x304: '̄' - COMBINING MACRON
0x305: '̅' - COMBINING OVERLINE
0x306: '̆' - COMBINING BREVE
0x307: '̇' - COMBINING DOT ABOVE
0x308: '̈' - COMBINING DIAERESIS
0x309: '̉' - COMBINING HOOK ABOVE
0x30a: '̊' - COMBINING RING ABOVE
0x30b: '̋' - COMBINING DOUBLE ACUTE ACCENT
0x30c: '̌' - COMBINING CARON
0x30d: '̍' - COMBINING VERTICAL LINE ABOVE
0x30e: '̎' - COMBINING DOUBLE VERTICAL LINE ABOVE
0x30f: '̏' - COMBINING DOUBLE GRAVE ACCENT
0x310: '̐' - COMBINING CANDRABINDU
0x311: '̑' - COMBINING INVERTED BREVE
0x312: '̒' - COMBINING TURNED COMMA ABOVE
0x313: '̓' - COMBINING COMMA ABOVE
0x314: '̔' - COMBINING REVERSED COMMA ABOVE
0x315: '̕' - COMBINING COMMA ABOVE RIGHT
0x316: '̖' - COMBINING GRAVE ACCENT BELOW
0x317: '̗' - COMBINING ACUTE ACCENT BELOW
0x318: '̘' - COMBINING LEFT TACK BELOW
0x319: '̙' - COMBINING RIGHT TACK BELOW
0x31a: '̚' - COMBINING LEFT