# Seq2Seq

用seq2seq架构实现一个翻译模型，从中文翻译到英文，中文也用空格来分词进行简化

## 定义语料

使用参考书中的语料

In [1]:
sentences = [
    ['咖哥 喜欢 小冰','<sos> KaGe likes XiaoBing','KaGe likes XiaoBing <eos>'],
    ['我 爱 学习 人工智能','<sos> I love studying AI','I love studying AI <eos>'],
    ['深度学习 改变 世界','<sos> DL changed the world','DL changed the world <eos>'],
    ['自然 语言 处理 很 强大','<sos> NLP is so powerful','NLP is so powerful <eos>'],
    ['神经网络 非常 复杂','<sos> Neural-Nets are complex','Neural-Nets are complex <eos>']
]

word_list_cn, word_list_en = [],[]

for s in sentences:
    word_list_cn.extend(s[0].split())
    word_list_en.extend(s[1].split())
    word_list_en.extend(s[2].split())

word_list_cn = list(set(word_list_cn))
word_list_en = list(set(word_list_en))

word2idx_cn = {w:i for i,w in enumerate(word_list_cn)}
word2idx_en = {w:i for i,w in enumerate(word_list_en)}

voc_size_cn = len(word_list_cn)
voc_size_en = len(word_list_en)

print(word_list_cn)
print(word_list_en)
print(word2idx_cn)
print(word2idx_en)

['世界', '我', '强大', '非常', '复杂', '咖哥', '自然', '很', '深度学习', '喜欢', '学习', '人工智能', '改变', '语言', '处理', '小冰', '神经网络', '爱']
['the', 'studying', 'world', 'NLP', 'KaGe', 'powerful', 'I', 'AI', 'complex', 'changed', 'is', 'likes', 'Neural-Nets', 'so', 'DL', 'XiaoBing', 'love', 'are', '<eos>', '<sos>']
{'世界': 0, '我': 1, '强大': 2, '非常': 3, '复杂': 4, '咖哥': 5, '自然': 6, '很': 7, '深度学习': 8, '喜欢': 9, '学习': 10, '人工智能': 11, '改变': 12, '语言': 13, '处理': 14, '小冰': 15, '神经网络': 16, '爱': 17}
{'the': 0, 'studying': 1, 'world': 2, 'NLP': 3, 'KaGe': 4, 'powerful': 5, 'I': 6, 'AI': 7, 'complex': 8, 'changed': 9, 'is': 10, 'likes': 11, 'Neural-Nets': 12, 'so': 13, 'DL': 14, 'XiaoBing': 15, 'love': 16, 'are': 17, '<eos>': 18, '<sos>': 19}


In [2]:
import numpy as np  
import torch
import random

def make_data(sentences):
    random_sentence = random.choice(sentences)
    encoder_input = np.array([[word2idx_cn[n] for n in random_sentence[0].split()]])
    decoder_input = np.array([[word2idx_en[n] for n in random_sentence[1].split()]])
    target = np.array([[word2idx_en[n] for n in random_sentence[2].split()]])

    encoder_input = torch.LongTensor(encoder_input)
    decoder_input = torch.LongTensor(decoder_input)
    target = torch.LongTensor(target)

    return encoder_input,decoder_input,target

encoder_input, decoder_input, target = make_data(sentences)

print(encoder_input)
print(decoder_input)
print(target)

tensor([[ 1, 17, 10, 11]])
tensor([[19,  6, 16,  1,  7]])
tensor([[ 6, 16,  1,  7, 18]])


## 模型

In [3]:
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden


class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output)
        return output, hidden


n_hidden = 128
encoder = Encoder(voc_size_cn, n_hidden)
decoder = Decoder(n_hidden, voc_size_en)

print("编码器结构:", encoder)
print("解码器结构:", decoder)

编码器结构: Encoder(
  (embedding): Embedding(18, 128)
  (rnn): RNN(128, 128, batch_first=True)
)
解码器结构: Decoder(
  (embedding): Embedding(20, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (out): Linear(in_features=128, out_features=20, bias=True)
)
