In [79]:
import torch
from torch import nn
import torch.nn.functional as F

import string
import unicodedata
from pathlib import Path

# Loading Data

In [47]:
# Name data (one .txt file of names per language) comes from: https://download.pytorch.org/tutorial/data.zip

In [106]:
# Alphabet the model works with: every ASCII letter followed by the small set
# of punctuation that can appear in a name.
all_chars = "".join((string.ascii_letters, " .,;'-"))
all_chars

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'-"

In [107]:
def sanitise_line(line):
    """Strip accents from ``line`` and drop every character outside ``all_chars``.

    The text is first decomposed with Unicode NFD normalisation, so an accented
    letter becomes its base letter followed by combining marks. Combining marks
    (Unicode category ``Mn``) and any character that is not in the module-level
    ``all_chars`` string are then discarded.

    Approach taken from https://stackoverflow.com/a/518232/5013267
    """
    decomposed = unicodedata.normalize('NFD', line)

    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # combining accent left over from the decomposition
            continue
        if ch not in all_chars:
            continue
        kept.append(ch)

    return ''.join(kept)

In [108]:
# Build {language: [sanitised names]} from every .txt file under names/.
#
# The paths are sorted because directory globbing yields entries in an
# arbitrary, filesystem-dependent order; the dict's insertion order is later
# used to assign category indices, so it has to be the same on every run.
name_data = {}
for path in sorted(Path("names").rglob("*.txt")):
    # splitlines() (rather than split("\n")) yields no lines at all for an
    # empty file and copes with "\r\n" line endings.
    raw_lines = path.read_text(encoding="utf8").strip().splitlines()

    # Sanitise first, then drop names that end up empty (blank lines, or lines
    # made up entirely of characters outside the allowed alphabet).
    cleaned = (sanitise_line(line) for line in raw_lines)
    name_data[path.stem] = [name for name in cleaned if name]

    print(f"Loaded: {path}")

Loaded: names/Arabic.txt
Loaded: names/Chinese.txt
Loaded: names/Czech.txt
Loaded: names/Dutch.txt
Loaded: names/English.txt
Loaded: names/French.txt
Loaded: names/German.txt
Loaded: names/Greek.txt
Loaded: names/Irish.txt
Loaded: names/Italian.txt
Loaded: names/Japanese.txt
Loaded: names/Korean.txt
Loaded: names/Polish.txt
Loaded: names/Portuguese.txt
Loaded: names/Russian.txt
Loaded: names/Scottish.txt
Loaded: names/Spanish.txt
Loaded: names/Vietnamese.txt


# Data Preparation

In [175]:
class OneHotTranslator:
    """Translate between the elements of a fixed vocabulary and one-hot vectors.

    ``elements`` is an ordered sequence (it must support ``len()`` and
    ``.index()``); the position of an element in it is that element's class
    index. One-hot vectors are 1-D integer tensors of length ``n_elements``,
    exactly as produced by ``torch.nn.functional.one_hot``.
    """

    def __init__(self, elements):
        self.elements = elements
        self.n_elements = len(self.elements)

    def index_to_vec(self, index):
        """Return the one-hot vector whose single 1 sits at position ``index``.

        Out-of-range indices are rejected by ``F.one_hot`` itself.
        """
        # one_hot works on a batch, so wrap the index in a batch of one and
        # take the single row back out.
        return F.one_hot(torch.tensor([index]), num_classes=self.n_elements)[0]

    def index_from_vec(self, vec):
        """Return the position of the largest entry of ``vec`` (0-dim tensor)."""
        return vec.argmax()

    def elm_to_vec(self, elm):
        """Return the one-hot vector for ``elm``.

        Raises ``ValueError`` (from ``.index``) if ``elm`` is not in the
        vocabulary.
        """
        return self.index_to_vec(self.elements.index(elm))

    def elm_from_vec(self, vec):
        """Return the vocabulary element selected by ``vec``'s largest entry."""
        return self.elements[int(self.index_from_vec(vec))]

    def __len__(self):
        """Number of elements in the vocabulary."""
        return self.n_elements

    def __iter__(self):
        """Yield the one-hot vector of every element, in index order.

        Defined explicitly because Python would otherwise iterate by calling
        ``__getitem__`` with 0, 1, 2, ... until it sees ``IndexError``.
        ``F.one_hot`` reports an out-of-range class with a different exception,
        so that fallback would raise after the last element instead of
        stopping cleanly.
        """
        return (self.index_to_vec(i) for i in range(self.n_elements))

    def __getitem__(self, val):
        """Return a one-hot vector; ``val`` is an int index or an element."""
        if isinstance(val, int):
            return self.index_to_vec(val)
        else:
            return self.elm_to_vec(val)

In [176]:
# One category per loaded file; the dict's insertion order fixes each
# category's index.
category_names = tuple(name_data)
category_translator = OneHotTranslator(category_names)
print(category_translator.elements)

('Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese')


In [179]:
# Character vocabulary: the two sequence markers come first (indices 0 and 1),
# followed by every allowed character.
char_translator = OneHotTranslator(["<SOS>", "<EOS>", *all_chars])
print(char_translator.elements)

['<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', ' ', '.', ',', ';', "'", '-']


# Neural Network Model

In [192]:
class SeqModel(nn.Module):
    """LSTM-cell sequence model (work in progress: ``forward`` is still a stub).

    Holds an ``nn.LSTMCell`` mapping ``input_size`` features to ``hidden_size``,
    a linear layer projecting ``hidden_size`` to ``output_size``, a dropout
    layer (p=0.05) and a log-softmax over dim 1.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.lstm = nn.LSTMCell(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.05)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, t_prime, t_input, t_hidden, t_cell):
        """Not implemented yet: the body is empty, so a call returns ``None``.

        Intended arguments (N is the batch size):

        t_prime:  tensor of batches of vectors for the LSTM to be primed on,
                  shape (seq_len, N, -). In this case (priming the LSTM on a
                  category) the shape is (1, N, n_categories) and
                  t_prime[0][0] is a one-hot vector encoding a category.
        t_input:  shape (1, N, input_size); t_input[0][0] is a one-hot vector
                  encoding a character.
        t_hidden: shape (N, hidden_size).
        t_cell:   shape (N, hidden_size).

        NOTE(review): ``self.lstm`` expects ``input_size`` features, so
        ``t_prime`` can only be fed to it directly if n_categories equals
        input_size; presumably a projection is still to be added.
        """
        pass

    def init_hidden(self, batch_size):
        """Return fresh ``(t_hidden, t_cell)`` states, each (batch_size, hidden_size).

        The hidden state is drawn from a standard normal distribution and the
        cell state is all zeros. Both are created with the dtype and device of
        the module's own parameters, so they stay compatible with the weights
        after ``.to(device)`` / ``.double()`` / ``.half()``.
        """
        like = next(self.parameters())
        state_shape = (batch_size, self.hidden_size)

        t_hidden = torch.randn(state_shape, dtype=like.dtype, device=like.device)
        t_cell = torch.zeros(state_shape, dtype=like.dtype, device=like.device)

        return t_hidden, t_cell

In [193]:
# Characters in, characters out: input and output widths both equal the
# character vocabulary size, with 32 hidden units in between.
seq = SeqModel(
    input_size=len(char_translator),
    hidden_size=32,
    output_size=len(char_translator),
)

In [197]:
# Build a batch-of-one example of each tensor that forward() is documented to
# take, then print the shapes as a sanity check.
test_category_index = 0

# NOTE(review): the one-hot vectors come from F.one_hot and are integer
# tensors; presumably they will need .float() before reaching the LSTM cell.
# Confirm once forward() is implemented.
t_prime = category_translator[test_category_index].view(1, 1, -1)
t_input = char_translator["<SOS>"].view(1, 1, -1)
t_hidden, t_cell = seq.init_hidden(1)

print("t_prime shape:\t", t_prime.shape)
print("t_input shape:\t", t_input.shape)
print("t_hidden shape:\t", t_hidden.shape)
print("t_cell shape:\t", t_cell.shape)

t_prime shape:	 torch.Size([1, 1, 18])
t_input shape:	 torch.Size([1, 1, 60])
t_hidden shape:	 torch.Size([1, 32])
t_cell shape:	 torch.Size([1, 32])
