In [1]:
import torch
import torch.nn as nn

In [2]:
filepath = 'vanznames.txt'

In [3]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
import unicodedata

with open(filepath, 'r') as f:
    names_raw = list(map(lambda x: x.replace('\n', ''), f.readlines()))
    
all_letters = list(set("".join(names_raw)))
n_letters = len(all_letters) + 1 # including EOS

def encode_name(name):
    return [all_letters.index(s) for s in name]

names_enc = [encode_name(name) for name in names_raw]

In [4]:
print(n_letters)
print(names_enc[0])

159
[35, 126, 97, 112, 124, 67, 105, 140, 15, 132, 140, 100, 140, 148, 100, 35, 140, 67, 114, 121]


In [5]:
# define network
class VanzNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dor=0.15):
        super(VanzNet, self).__init__()
        self.hidden_size = hidden_size
        total_input_size = input_size + hidden_size
        self.i2h = nn.Linear(total_input_size, hidden_size)
        self.i2o = nn.Linear(total_input_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(dor)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        input_combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [6]:
# one-hot matrix for characters
def name2input(name_enc):
    """    
    dim: [len_name * 1 * n_letters]
    e.g. KASPAROV -> [OneHot(K), OneHot(A), ..., OneHot(V)]
    """
    tensor = torch.zeros(len(name_enc), 1, n_letters)
    for i, n in enumerate(name_enc):
        tensor[i][0][n] = 1
    return tensor

def name2target(name_enc):
    """
    dim: [len_name]
    e.g. target(KASPAROV) -> ASPAROV<EOS> -> [Idx(A), Idx(S), ..., Idx(EOS)]
    """
    return torch.LongTensor(name_enc[1:] + [n_letters - 1])

In [None]:
lr = 0.0005
epochs = 5000
print_every = 50
max_training_size = 100

if max_training_size > 0:
    names_train = names_enc[:max_training_size]
else:
    names_train = names_enc

rnn = VanzNet(n_letters, 128, n_letters)
criterion = nn.NLLLoss()
optim = torch.optim.Adam(rnn.parameters(), lr=lr)
losses = []
for epoch in range(epochs):    
    loss_epoch = 0
    for name in names_train:
        input_tensor = name2input(name)
        target_tensor = name2target(name)
        target_tensor.unsqueeze_(-1)
        # print(input_tensor)
        # print(target_tensor)
        hidden = rnn.initHidden()        
        optim.zero_grad()
        loss = 0
        for i in range(input_tensor.size(0)):
            output, hidden = rnn(input_tensor[i], hidden)
            loss += criterion(output, target_tensor[i])
            loss_epoch += loss
        loss.backward()
        optim.step()
        
    losses.append(loss_epoch / len(names_train))
    
    if (epoch + 1) % print_every == 0:
        print("%d/%d: Loss %f" % (epoch+1, epochs, loss_epoch))

50/5000: Loss 25843.703125
100/5000: Loss 16845.751953
150/5000: Loss 12976.521484
200/5000: Loss 12745.041992
250/5000: Loss 10902.885742
300/5000: Loss 11982.556641
350/5000: Loss 10998.518555
400/5000: Loss 10979.769531
450/5000: Loss 13333.736328
500/5000: Loss 12522.534180
550/5000: Loss 10985.856445
600/5000: Loss 12639.764648
650/5000: Loss 11052.083984
700/5000: Loss 13228.370117
750/5000: Loss 11985.356445
800/5000: Loss 12747.591797
850/5000: Loss 11505.390625
900/5000: Loss 12674.726562
950/5000: Loss 11368.822266
1000/5000: Loss 11612.627930
1050/5000: Loss 16415.802734
1100/5000: Loss 11769.797852
1150/5000: Loss 11546.096680
1200/5000: Loss 11653.357422
1250/5000: Loss 13433.497070
1300/5000: Loss 17556.208984
1350/5000: Loss 11598.698242


In [None]:
def reconstruct_char(output):
    topv, topi = output.topk(1)
    idx = topi[0][0]
    if idx == n_letters - 1:
        return 'EOS'
    else:
        return all_letters[idx]

def sample_name(start_char):
    name_sample = start_char
    if not (start_char in all_letters):
        return "Invalid start character!"
    else:
        start_char_enc = [all_letters.index(start_char)]
        input_tensor = name2input(start_char_enc)
        hidden = rnn.initHidden()
        out_char = ""
        
        while out_char != 'EOS':            
            output, hidden = rnn(input_tensor, hidden)
            out_char = reconstruct_char(output)
            name_sample += out_char
            input_tensor = name2input(out_char)
    
    return name_sample

In [None]:
samplers = ['ก', 'ค', 'ม']

for sp in samplers:
    print(sample_name(sp))