In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

In [7]:

def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted.

    ``glob.glob`` makes no promise about result order, and the category list
    built from these paths defines the class indices, so the matches are
    sorted to keep those indices stable across machines and runs.

    Args:
        path: a glob pattern such as ``'data/names/*.txt'``.

    Returns:
        A list of matching path strings in ascending order (empty when
        nothing matches).
    """
    return sorted(glob.glob(path))

print(findFiles('data/names/*.txt'))

import unicodedata
import string

# Vocabulary: lowercase letters, then uppercase letters, then a space and a
# few punctuation marks.
all_letters = string.ascii_lowercase + string.ascii_uppercase + " .,;'"
# Vocabulary size: 57 distinct characters in total.
n_letters = len(all_letters)

print(all_letters, n_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from ``s`` and drop every character outside ``all_letters``.

    The string is decomposed with NFD so an accented letter becomes a base
    letter followed by combining marks; the marks (category ``Mn``) are then
    discarded, as is anything not present in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# Sanity check of the accent stripping.
print(unicodeToAscii('Ślusàrski'))

# Containers for the dataset:
#   category_lines: language -> list of names for that language
#   all_categories: the languages seen, in load order
category_lines, all_categories = {}, []

# Read a file and split into lines
def readLines(filename):
    """Read ``filename`` as UTF-8 and return its lines converted to ASCII.

    Leading and trailing whitespace of the whole file is stripped before
    splitting on ``'\\n'``; each resulting line is passed through
    ``unicodeToAscii``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one ASCII-converted string per line.
    """
    # The context manager closes the handle even if reading or decoding fails;
    # the previous bare open(...).read() left it to the garbage collector.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Fill the containers: one entry per data file.
for filename in findFiles('data/names/*.txt'):
    # The file stem is the language label, e.g. English, Chinese.
    base_name = os.path.basename(filename)
    category = os.path.splitext(base_name)[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
# Number of languages, i.e. classification classes (18 in the recorded run).
print(n_categories)
# Print the training names of every language.
# NOTE(review): this dumps the whole dataset into the cell output; consider
# printing per-language counts instead.
print(category_lines)
print(category_lines['Italian'][:5])
print(category_lines['Chinese'][:2])

['data/names\\Arabic.txt', 'data/names\\Chinese.txt', 'data/names\\Czech.txt', 'data/names\\Dutch.txt', 'data/names\\English.txt', 'data/names\\French.txt', 'data/names\\German.txt', 'data/names\\Greek.txt', 'data/names\\Irish.txt', 'data/names\\Italian.txt', 'data/names\\Japanese.txt', 'data/names\\Korean.txt', 'data/names\\Polish.txt', 'data/names\\Portuguese.txt', 'data/names\\Russian.txt', 'data/names\\Scottish.txt', 'data/names\\Spanish.txt', 'data/names\\Vietnamese.txt']
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;' 57
Slusarski
18
{'Arabic': ['Khoury', 'Nahas', 'Daher', 'Gerges', 'Nazari', 'Maalouf', 'Gerges', 'Naifeh', 'Guirguis', 'Baba', 'Sabbagh', 'Attia', 'Tahan', 'Haddad', 'Aswad', 'Najjar', 'Dagher', 'Maloof', 'Isa', 'Asghar', 'Nader', 'Gaber', 'Abboud', 'Maalouf', 'Zogby', 'Srour', 'Bahar', 'Mustafa', 'Hanania', 'Daher', 'Tuma', 'Nahas', 'Saliba', 'Shamoon', 'Handal', 'Baba', 'Amari', 'Bahar', 'Atiyeh', 'Said', 'Khouri', 'Tahan', 'Baba', 'Mustafa', 'Guirguis',

In [9]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of ``letter`` within ``all_letters``.

    Returns -1 when ``letter`` does not occur in ``all_letters``. A caller
    that uses the result as an index should be aware that -1 addresses the
    last position rather than failing.
    """
    try:
        return all_letters.index(letter)
    except ValueError:
        # Same sentinel that str.find reports for a missing substring.
        return -1

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return a ``<1 x n_letters>`` one-hot tensor for ``letter``.

    Every entry is 0 except the column given by ``letterToIndex(letter)``,
    which is set to 1.
    """
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Encode ``line`` as a ``<line_length x 1 x n_letters>`` tensor.

    Row ``i`` holds the one-hot vector of the i-th character of ``line``;
    the middle dimension has size 1.
    """
    encoded = torch.zeros(len(line), 1, n_letters)
    for position in range(len(line)):
        encoded[position, 0, letterToIndex(line[position])] = 1
    return encoded

# A single letter converted to a tensor
print(letterToTensor('J'))
# A whole name converted to a tensor: first its size, then its contents
jones_tensor = lineToTensor('Jones')
print(jones_tensor.size())
print(jones_tensor)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0.,

In [None]:
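# Shortcoming of an ordinary (non-recurrent) neural network:
# 'Jones' -> 'good', 'onesJ' -> 'bad'
# [0,1,0,1,0,...,1,0,0,1,...]  (one slot per letter, 57 letters in total)
# A single per-letter vector like this carries no letter order, so it cannot
# tell the two spellings above apart.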

In [12]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    At every step the input and the previous hidden state are concatenated
    along dim 1 and fed to two independent linear layers: ``i2h`` produces
    the next hidden state (no non-linearity is applied to it) and ``i2o``
    produces the class scores, which are turned into log-probabilities by a
    ``LogSoftmax`` over dim 1.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: width of the per-step input vector.
            hidden_size: width of the hidden state.
            output_size: number of output classes.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Both layers consume the concatenated [input, hidden] vector.
        combined_size = input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step and return ``(output, hidden)``.

        ``output`` holds the log-softmax scores and ``hidden`` the next
        hidden state; both are computed from the same concatenated vector.
        """
        combined = torch.cat([input, hidden], dim=1)
        log_probs = self.softmax(self.i2o(combined))
        next_state = self.i2h(combined)
        return log_probs, next_state

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))

# Width of the recurrent hidden state.
n_hidden = 128
# One input slot per letter of the vocabulary, one output score per category.
rnn = RNN(input_size=n_letters, hidden_size=n_hidden, output_size=n_categories)

In [14]:
# How are inference and training carried out for the name 'Jones'?
# The network is applied once per letter, and the hidden state returned by
# one step is passed into the next step.
hidden = torch.zeros(1, n_hidden)  # all-zero initial hidden state for 'Jones'

for letter in 'Jones':
    input = letterToTensor(letter)
    output, hidden = rnn(input, hidden)
print(output)

# Another way to write it, equivalent to the above: encode the whole name
# once and feed it row by row. Every row must be consumed; feeding only
# input[0] would process just the letter 'J' and would not match the result
# printed above.
input = lineToTensor('Jones')
hidden = torch.zeros(1, n_hidden)
next_hidden = hidden
for i in range(input.size(0)):
    output, next_hidden = rnn(input[i], next_hidden)
print(output)

tensor([[-2.8469, -2.8954, -3.0145, -2.8834, -2.9356, -2.7724, -2.8314, -2.8380,
         -2.9583, -2.8276, -2.9015, -2.8316, -2.9323, -2.9455, -3.0117, -2.9435,
         -2.8696, -2.8265]], grad_fn=<LogSoftmaxBackward0>)
tensor([[-0.0567, -0.0853, -0.1870,  0.0089, -0.0116, -0.0080, -0.0646,  0.0588,
         -0.0751, -0.0314, -0.1493, -0.1136, -0.0387, -0.0810, -0.0424, -0.0122,
         -0.0274, -0.0029,  0.0169,  0.0322,  0.0660, -0.1435,  0.0248,  0.0738,
         -0.0261,  0.0099, -0.0089, -0.0433, -0.0055,  0.0083, -0.1122, -0.0703,
         -0.0006, -0.1096,  0.0042, -0.0264,  0.1645,  0.0073,  0.0080, -0.0402,
          0.0801, -0.1038, -0.0621, -0.0033,  0.0049, -0.1393, -0.0188, -0.0611,
          0.0974,  0.0071, -0.0168,  0.0715,  0.0235, -0.0496, -0.1015,  0.0308,
         -0.0691, -0.0249,  0.0793,  0.0820, -0.0556,  0.0735,  0.0728, -0.0334,
         -0.0041,  0.1374,  0.0435,  0.1776, -0.0881,  0.0465, -0.0976,  0.0615,
          0.0843, -0.0384, -0.0458, -0.0323, -0.0