In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# 选择设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
# data/eng-fra.txt文件：The file is a tab separated list of translation pairs:
# Oh no!	Oh non !
# 每一行是英文句子与对应的法语句子，中间是tab分割
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [24]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip()) # 转小写，去头尾空白
    #print(s)
    s = re.sub(r"([.!?])", r" \1", s) # 符号 替换为 空格+符号    
    #print(s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) # 非a-zA-Z.!?，替换为1个空格 。   意思：只保留预期内的字符，多个空格合并1个空格
    #print(s)
    return s
#print(normalizeString('va !'))
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] # 英文句子->法语句子
    # print(pairs[0])

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs] # 法语句子->英文句子
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
#readLangs('eng','fra')

Reading lines...
['go .', 'va !']


(<__main__.Lang at 0x1392719d0>,
 <__main__.Lang at 0x1388d3eb0>,
 [['go .', 'va !'],
  ['run !', 'cours !'],
  ['run !', 'courez !'],
  ['wow !', 'ca alors !'],
  ['fire !', 'au feu !'],
  ['help !', 'a l aide !'],
  ['jump .', 'saute .'],
  ['stop !', 'ca suffit !'],
  ['stop !', 'stop !'],
  ['stop !', 'arrete toi !'],
  ['wait !', 'attends !'],
  ['wait !', 'attendez !'],
  ['i see .', 'je comprends .'],
  ['i try .', 'j essaye .'],
  ['i won !', 'j ai gagne !'],
  ['i won !', 'je l ai emporte !'],
  ['oh no !', 'oh non !'],
  ['attack !', 'attaque !'],
  ['attack !', 'attaquez !'],
  ['cheers !', 'sante !'],
  ['cheers !', 'a votre sante !'],
  ['cheers !', 'merci !'],
  ['get up .', 'leve toi .'],
  ['got it !', 'j ai pige !'],
  ['got it !', 'compris !'],
  ['got it ?', 'pige ?'],
  ['got it ?', 'compris ?'],
  ['got it ?', 't as capte ?'],
  ['hop in .', 'monte .'],
  ['hop in .', 'montez .'],
  ['hug me .', 'serre moi dans tes bras !'],
  ['hug me .', 'serrez moi dans vos bras

In [12]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# 只保留英文是上面这种格式开头的样本，丢弃单词数量太多的句子
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [25]:
def prepareData(lang1, lang2, reverse=False):
    # input_lang=fra单词编码集, output_lang=eng单词编码集, pairs=[(法语句子,英文句子),(法语句子,英文句子),....]
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    # 仅保留he is , they are这种样本
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    # 把所有pair过一遍，生成法语和英语各自的word编码
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    # 法语的单词数量
    print(input_lang.name, input_lang.n_words)
    # 英语的单词数量
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs)) # 随机返回一个样本（法语句子,英文句子)

Reading lines...
['go .', 'va !']
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['je vais dechiffrer ca .', 'i m going to figure this out .']


In [33]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    # indexs=[2,3,4,5,1], .view之后[[2],[3],[4],[5],[1]]
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# 参数:(法语句子,英语句子)
# 返回:(法语单词列表，英语单词列表)
tensorsFromPair(pairs[0])

(tensor([[2],
         [3],
         [4],
         [5],
         [1]]),
 tensor([[2],
         [3],
         [4],
         [1]]))

In [27]:
'''
翻译问题不是1个词预测下1个词的简单问题，比如会面临：
1，法语的一个词可能要用2个英语词对应
2，可能法语的2个词对应的英语2个词顺序也不一样
总之翻译问题不是简单的一一映射，
因此需要一个更高级的encoder-decoder模型结构来实现翻译效果.

encoder基于rnn模式将法语句子最终编码成1个向量,作为decode的输入
''' 

'\n翻译问题不是1个词预测下1个词的简单问题，比如会面临：\n1，法语的一个词可能要用2个英语词对应\n2，可能法语的2个词对应的英语2个词顺序也不一样\n总之翻译问题不是简单的一一映射\n'

In [60]:
# encoder把句子编码成词嵌入序列
# decode则根据这个词嵌入序列，从句子开始不断续写下一个单词
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1) # embedded=[1,1,256] 
        output = embedded
        print('emb',output.shape)
        # GRU模型的输入输出格式：https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
        output, hidden = self.gru(output, hidden)  # output=[1,1,256],hidden=[1,1,256]
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# encoder rnn模型
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)

# 输入：法语句子中某个单词的one-hot向量 + RNN隐层
input_tensor,target_tensor=tensorsFromPair(pairs[0])
print('法语句子的单词数量:',input_tensor.shape)
hidden=encoder1.initHidden()

# 前向传播
print('输入:',input_tensor[0].shape,hidden.shape)
output0,hidden=encoder1.forward(input_tensor[0],hidden)
print('输出:',output0.shape,hidden.shape)
output1,hidden=encoder1.forward(input_tensor[1],hidden)
output2,hidden=encoder1.forward(input_tensor[2],hidden)
output3,hidden=encoder1.forward(input_tensor[3],hidden)
output4,hidden=encoder1.forward(input_tensor[4],hidden)
# 1，将输出整理成词嵌入序列，作为decoder的一个输入，用作注意力机制
# 2，将输出的hidden作为decoder的初始hidden
encoder_outputs = torch.zeros(MAX_LENGTH, encoder1.hidden_size, device=device)
encoder_outputs[0]+=output0[0][0]
encoder_outputs[1]+=output1[0][0]
encoder_outputs[2]+=output2[0][0]
encoder_outputs[3]+=output3[0][0]
encoder_outputs[4]+=output4[0][0]
print(encoder_outputs.shape) # 最多能装10个单词的emd向量
print(encoder_outputs) # 少于10个单词的句子会有留下一些全0的emb向量
print(hidden.shape)
print('~~~~~')
# embedding层用法
emb=nn.Embedding(input_lang.n_words, hidden_size)
emb(input_tensor[0]).shape

法语句子的单词数量: torch.Size([5, 1])
输入: torch.Size([1]) torch.Size([1, 1, 256])
emb torch.Size([1, 1, 256])
输出: torch.Size([1, 1, 256]) torch.Size([1, 1, 256])
emb torch.Size([1, 1, 256])
emb torch.Size([1, 1, 256])
emb torch.Size([1, 1, 256])
emb torch.Size([1, 1, 256])
torch.Size([10, 256])
tensor([[ 0.2238,  0.1532, -0.1444,  ..., -0.1928,  0.1430, -0.1892],
        [ 0.2398, -0.1480,  0.0603,  ..., -0.3049, -0.1202, -0.2482],
        [ 0.1624,  0.0540,  0.3950,  ..., -0.2434, -0.3966, -0.1344],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<CopySlices>)
torch.Size([1, 1, 256])
~~~~~


torch.Size([1, 256])