# 使用 RNN 网络对姓名进行分类

## 准备数据

In [112]:
from pathlib import Path
import unicodedata
import string

In [113]:
#姓氏中所有的字符
#string.ascii_letters是大小写各26字母
all_letters = string.ascii_letters + " .,;'"
#字符的种类数
n_letters = len(all_letters)


# 将Unicode码转换成标准的ASCII码
def unicode_to_ascii(s):
    '''

    :param s: unicode_to_ascill编码的字符串
    :param all_letters: 大小写各26字母 + " .,;'"
    :return:
    '''
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [114]:
print(n_letters) #字符数为57个
print(unicode_to_ascii('Ślusàrski'))

57
Slusarski


1. 构建类别-姓名字典

In [115]:

data_path = Path('../../data/names')
files = list(data_path.glob('*.txt')) # 列出所有txt文件

In [116]:
category_names = {} # dict 类别：姓名
categorys = []

def read_names(file):
    names = open(file).read().strip().split('\n')
    return [unicode_to_ascii(name) for name in names]

for file in files:
    names = read_names(file)
    file_name = file.name[:-4]
    categorys.append(file_name)
    category_names[file_name] = names

In [117]:
category_names['Chinese'][:5]

['Ang', 'AuYong', 'Bai', 'Ban', 'Bao']

2. 将姓名转为tensor向量

In [118]:
import torch
import torch.nn.functional as F

In [119]:
name = 'abdel'
name = [[all_letters.find(char)] for char in name]
name = torch.tensor(name, dtype=torch.int64)
name = F.one_hot(name, num_classes=n_letters)

In [120]:
def name_to_oneHot(name):
    name = [[all_letters.find(char)] for char in name]
    name = torch.tensor(name, dtype=torch.int64)
    name = F.one_hot(name, num_classes=n_letters)
    return name

In [121]:
name = 'Khoury'
name_tensor = name_to_oneHot(name)
name_tensor.size()

torch.Size([6, 1, 57])

3. 构建数据生成器（用函数简单实现）

In [122]:
import random

In [123]:
category = random.choice(categorys) # 随机选一个分类
name = random.choice(category_names[category]) # 从随机分类中随机选一个名字
name_tensor = name_to_oneHot(name)
category_tensor = torch.tensor([categorys.index(category)], dtype=torch.long) # 类别标签

In [124]:
# 训练集，验证集，测试集比例
train_rate = 0.8
val_rate = 0.1
test_rate = 0.1

In [125]:
def get_data_set(type):
    '''
    按type选取训练集，验证集，测试集
    :param type: str: train, val, test
    :return:
    '''
    category = random.choice(categorys) # 随机选一个分类
    # 选择 训练集，验证集，测试集
    if type == 'train':
        names = category_names[category]
        name_len = len(names)
        names = names[:int(name_len * train_rate)]
    elif type == 'val':
        names = category_names[category]
        name_len = len(names)
        names = names[int(name_len * train_rate): int(name_len * (train_rate + val_rate))]
    elif type == 'test':
        names = category_names[category]
        name_len = len(names)
        names = names[int(name_len * (train_rate + val_rate)):]
    return (category, names)

def data_generator(category, names):
    '''
    生成训练数据
    :param category: str 分类
    :param names: list 数据集
    :return:
    '''
    indexs = list(range(len(names))) # 打乱下标，不直接修改names
    random.shuffle(indexs)
    for index in indexs:
        name = names[index] # 从随机分类中随机选一个名字
        name_tensor = name_to_oneHot(name)
        category_tensor = torch.tensor([categorys.index(category)], dtype=torch.long) # 类别标签
        yield (name_tensor, category_tensor)

In [126]:
type = 'train'
(category, names) = get_data_set(type)

In [127]:
i = 0
for X, Y in  data_generator(category, names):
    print(X, Y)
    break

tensor([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0


## 构建模型

1. 定义网络

In [128]:
import torch.nn as nn

In [129]:
class RNN(nn.Module):

    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)

        # self.softMax = nn.Softmax()

    # def forward(self, input, hidden):
    #     # 每次只有一个单词，故而batch_size=1
    #     # input: tensor (batch_size=1, input_size)
    #     # hidden: tensor (batch_size=1, hidden_size)
    #     combined = torch.cat((input, hidden), dim=1) # tensor (batch_size=1, input_size + hidden_size)
    #     output = self.i2o(combined) # (batch_size=1, output_size)
    #     hidden = self.i2h(combined) # (batch_size=1, hidden_size)
    #     return output, hidden

    def forward(self, input, hidden):
        # 每次只有一个单词，故而batch_size=1
        # input: tensor (time_step, batch_size=1, input_size)
        # hidden: tensor (batch_size=1, hidden_size)
        for step in range(input.size()[0]):
            combined = torch.cat((input[step], hidden), dim=1) # tensor (batch_size=1, input_size + hidden_size)
            output = self.i2o(combined) # (batch_size=1, output_size)
            hidden = self.i2h(combined) # (batch_size=1, hidden_size)
        return output, hidden

    def init_hidden(self):
        '''
        初始化隐藏层参数
        :return:
        '''
        return torch.zeros((1, self.hidden_size)) # (batch_size=1, hidden_size)

In [130]:
input_size = 57
hidden_size = 256
output_size = 18

rnn = RNN(input_size,
          hidden_size,
          output_size)

In [131]:
type = 'train'
(category, names) = get_data_set(type)
x, y = next(data_generator(category, names))
hidden = rnn.init_hidden()
output, hidden = rnn(x, hidden)

2. 定义参数


In [145]:
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
INPUT_SIZE = n_letters # 字母总数
HIDDEN_SIZE = 256
OUTPUT_SIZE = len(categorys) # 类别数
EPOCHS = 200

lr = 1e-3
val_period = 5 # 每50个epoch打印一次
predict_period = 10 # 每

criterion = nn.CrossEntropyLoss() # 交叉熵
optim = torch.optim.Adam(rnn.parameters(), lr=lr)

3. 训练与验证

In [146]:
def train(model):
    '''
    训练模型
    :param model:
    :return:
    '''
    type = 'train'
    (category, names) = get_data_set(type)
    l_sum = 0
    num = 0
    for x, y in data_generator(category, names):
        num += 1
        hidden = model.init_hidden()
        output, hidden = model(x, hidden)
        loss = criterion(output, y)

        optim.zero_grad()
        loss.backward()
        optim.step()
        l_sum += loss.item()
    return output, l_sum / num

In [151]:
def cal_accuracy(model, data_gen):
    '''
    计算误差
    :param model: 模型
    :param data_gen: 数据生成器
    :return:
    '''
    model.eval()
    acc = 0
    num = 0
    for x, y in data_gen:
        hidden = model.init_hidden()
        output, _ = model(x, hidden)
        label = output.argmax(dim=1).item()
        if label == y.item():
            acc += 1
        num += 1
    model.train()
    return acc / num

In [152]:
def predict(model, name, n_predictons=3):
    '''
    给定姓名进行预测
    :param model:
    :param name: str 姓名
    :param n_predictons: int top-K个类别
    :return: list->tuple [(name, value)]
    '''
    hidden = model.init_hidden()
    name_tensor = name_to_oneHot(name)
    output, _ = model(name_tensor, hidden)

    # topv: 下标 tensor (1, n_predictons)
    # topi: 值 tensor (1, n_predictons)
    topv, topi = output.data.topk(n_predictons, 1, True) # 取概率最大的前几个

    prediction = [(categorys[topi[0][index].item()], topv[0][index].item()) for index in range(n_predictons)]
    return prediction

In [153]:
from tqdm import tqdm
import time

In [154]:
rnn = RNN(INPUT_SIZE,
          HIDDEN_SIZE,
          OUTPUT_SIZE)

loss_all = []



# test_gen = data_generator(*get_data_set('test'))

rnn.train()
for epoch in tqdm(range(EPOCHS)):
    start = time.time()
    output, loss = train(rnn)
    print(loss)
    loss_all.append(loss)

    if (epoch + 1) % val_period == 0:
        val_gen = data_generator(*get_data_set('val'))
        acc = cal_accuracy(rnn, val_gen)
        category = random.choice(categorys) # 在测试集中选一个名字，进行预测
        name = random.choice(category_names[category])
        predictions = predict(rnn, name)[0]
        print('epoch %d, acc %f, time %.2f sec, %s(%s) -> %s(%f)'
              %(epoch, acc, time.time() - start, name, category, predictions[0], predictions[1]))


  6%|▋         | 13/200 [00:28<06:55,  2.22s/it]


2.8971176403705203
2.7833542346954347
2.885395543768822
2.9008847932775312
2.903180030287889
7 8
7 8
11 8
7 8
11 8
11 8
1 8
11 8
11 8
7 8
11 8
11 8
7 8
7 8
11 8
11 8
11 8
11 8
8 8
11 8
11 8
7 8
11 8
epoch 4, acc 0.043478, time 2.19 sec, Fearghal(Irish) -> Korean(0.070816)
2.903180030287889
2.885395543768822
2.9060204600856316
2.9008847932775312
2.8950609873081077
11 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
2 2
2 2
7 2
11 2
11 2
11 2
11 2
12 2
11 2
15 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
17 2
11 2
11 2
7 2
7 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
7 2
11 2
7 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
11 2
epoch 9, acc 0.038462, time 0.28 sec, Akrivopoulos(Greek) -> Korean(0.168224)
2.903180030287889
2.9008847932775312
2.885395543768822


KeyboardInterrupt: 