In [34]:
import os
import glob
import string
import unicodedata
from io import open

import torch
import torch.nn as nn

In [2]:
# Collect every path that matches a shell-style wildcard pattern.
def find_files(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` expands shell-style wildcards (not regular expressions) and
    yields matches in arbitrary, platform-dependent order. Sorting makes the
    result reproducible, which matters to any caller that derives positional
    indices from the order of the returned files.

    Args:
        path: A glob pattern such as ``'data/names/*.txt'``.

    Returns:
        A sorted list of matching path strings; empty when nothing matches.
    """
    return sorted(glob.glob(path))

# Sanity check: list the .txt files found in the names data directory.
print(find_files('../../../data/pytorch_data/names/*.txt'))

['../../../data/pytorch_data/names\\Arabic.txt', '../../../data/pytorch_data/names\\Chinese.txt', '../../../data/pytorch_data/names\\Czech.txt', '../../../data/pytorch_data/names\\Dutch.txt', '../../../data/pytorch_data/names\\English.txt', '../../../data/pytorch_data/names\\French.txt', '../../../data/pytorch_data/names\\German.txt', '../../../data/pytorch_data/names\\Greek.txt', '../../../data/pytorch_data/names\\Irish.txt', '../../../data/pytorch_data/names\\Italian.txt', '../../../data/pytorch_data/names\\Japanese.txt', '../../../data/pytorch_data/names\\Korean.txt', '../../../data/pytorch_data/names\\Polish.txt', '../../../data/pytorch_data/names\\Portuguese.txt', '../../../data/pytorch_data/names\\Russian.txt', '../../../data/pytorch_data/names\\Scottish.txt', '../../../data/pytorch_data/names\\Spanish.txt', '../../../data/pytorch_data/names\\Vietnamese.txt']


In [23]:
# Alphabet accepted by the model: ASCII letters plus '.', ',' and ';'.
# NOTE(review): space and apostrophe are not part of this alphabet, so they
# are stripped from every name -- confirm this is intended.
all_letters = string.ascii_letters + '.,;'
n_letters = len(all_letters)


# Convert a Unicode string to plain ASCII.
def unicode_to_ascii(s):
    """Strip accents from ``s`` and drop every character not in ``all_letters``.

    The string is NFD-normalised so that accented characters split into a base
    character followed by combining marks (category ``'Mn'``); the marks are
    discarded, and so is any remaining character outside ``all_letters``.

    Args:
        s: The Unicode string to convert.

    Returns:
        A string made only of characters from ``all_letters``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        is_combining_mark = unicodedata.category(ch) == 'Mn'
        if not is_combining_mark and ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


print(unicode_to_ascii('Klüft skräms inför på fédéral électoral große'))

# Maps a category name to the list of lines read from that category's file.
categroy_lines = {}
# Category names, in the order their files are processed.
all_categroies = []


def read_lines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    Leading and trailing whitespace of the whole file is stripped before the
    content is split on ``'\\n'``, and each resulting line is passed through
    ``unicode_to_ascii``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one converted string per line of the file.
    """
    # The context manager closes the handle as soon as the content is read;
    # previously the file object was never closed explicitly.
    with open(filename, encoding='utf-8') as names_file:
        lines = names_file.read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]


# Populate all_categroies and categroy_lines from the names data files.
for filename in find_files('../../../data/pytorch_data/names/*.txt'):
    # The category name is the file name without its directory and extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categroies.append(category)
    
    lines = read_lines(filename)
    categroy_lines[category] = lines
    
# Number of categories that were loaded.
n_categroies = len(all_categroies)

Kluftskramsinforpafederalelectoralgroe


In [33]:
# Convert names into tensors.
def letter_to_index(letter):
    """Return the position of ``letter`` in ``all_letters``, or -1 if absent."""
    try:
        return all_letters.index(letter)
    except ValueError:
        # Same "not found" sentinel that str.find would give.
        return -1


def letter_to_tensor(letter):
    """Encode a single letter as a one-hot tensor of shape ``(1, n_letters)``.

    Args:
        letter: A character expected to be present in ``all_letters``.

    Returns:
        A float tensor of zeros with a single 1 at the letter's index.

    Raises:
        ValueError: If ``letter_to_index`` reports the letter as unknown
            (negative index). Without this check the negative index would
            silently mark the last column instead.
    """
    # letter_to_index is a function and must be called; the previous code
    # subscripted it, which raised TypeError on every call.
    index = letter_to_index(letter)
    if index < 0:
        raise ValueError('letter {!r} is not in the supported alphabet'.format(letter))
    tensor = torch.zeros(1, n_letters)
    tensor[0][index] = 1
    return tensor


def line_to_tensor(line):
    """Encode a string as a stack of one-hot letter vectors.

    Args:
        line: The string to encode; every character is expected to be present
            in ``all_letters``.

    Returns:
        A float tensor of shape ``(len(line), 1, n_letters)`` in which row
        ``i`` has a single 1 at the index of ``line[i]``.

    Raises:
        ValueError: If a character is unknown to ``letter_to_index`` (negative
            index). Without this check the negative index would silently be
            encoded as the last symbol of the alphabet.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letter_to_index(letter)
        if index < 0:
            raise ValueError(
                'character {!r} at position {} is not in the supported alphabet'.format(letter, li)
            )
        tensor[li][0][index] = 1
    return tensor


# Quick checks: the index of 'j' and the one-hot tensor built for "Jones".
print(letter_to_index('j'))
print(line_to_tensor('Jones'))

9
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [38]:
class RNN(nn.Module):
    """Single-step classifier over the concatenation of an input and a hidden vector.

    Two linear layers read the concatenated ``(input, hidden)`` vector: ``i2h``
    produces a new hidden vector and ``i2o`` produces category scores, which
    are turned into log-probabilities with ``LogSoftmax`` over dimension 1.

    NOTE(review): ``forward`` computes the new hidden vector but returns only
    the output, so no state can be carried from one step to the next. Returning
    it as well would change the return type for existing callers, so the
    interface is left as is -- confirm whether ``(output, hidden)`` was intended.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size

        combined_size = input_size + hidden_size
        # Layers are created in the original order so that random parameter
        # initialisation draws happen in the same sequence.
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        # Attribute name (sic) is kept because it is visible outside the class.
        self.softamx = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Return log-probabilities for one step.

        Args:
            input: Tensor concatenated with ``hidden`` along dimension 1.
            hidden: Hidden-state tensor; see ``init_hidden`` for the initial one.

        Returns:
            The ``LogSoftmax`` of ``i2o`` applied to the concatenated vector.
        """
        combined = torch.cat([input, hidden], dim=1)
        # Evaluated as in the original implementation, but never returned.
        next_hidden = self.i2h(combined)
        return self.softamx(self.i2o(combined))

    def init_hidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))
    
# Hidden-state width of the network.
n_hidden = 128
# Input size is the alphabet size; output size is the number of categories.
rnn = RNN(n_letters, n_hidden, n_categroies)

In [50]:
# Run one forward step on the first letter of "cai", starting from the
# network's all-zero initial hidden state.
# NOTE(review): `input` shadows the builtin of the same name; the name is
# kept here because other cells may refer to it.
input = line_to_tensor("cai")
hidden = rnn.init_hidden()

output = rnn(input[0], hidden)
print(output)

tensor([[-2.9232, -2.9769, -2.9794, -2.8335, -2.8089, -2.9004, -2.7993, -2.7954,
         -2.9800, -2.9061, -2.8615, -2.8924, -2.9837, -2.9369, -2.7964, -2.9207,
         -2.8737, -2.8951]], grad_fn=<LogSoftmaxBackward>)


In [51]:
def categroy_from_output(output):
    """Return ``(category name, category index)`` for the highest-scoring entry.

    Takes the top-1 index along the last dimension of ``output`` and reads it
    from the first row.

    Args:
        output: Score tensor; assumed to be shaped ``(1, n_categories)`` as
            printed for the network output -- verify against the caller.

    Returns:
        A tuple of the name looked up in ``all_categroies`` and its integer index.
    """
    best_index = output.topk(1)[1][0].item()
    return all_categroies[best_index], best_index

# Show the category predicted from the network output computed above.
print(categroy_from_output(output))

('Greek', 7)
