In [1]:
import math
import os
import random
import sys
from collections import Counter

import jieba
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def load_data(file):
    """Read a tab-separated parallel corpus and tokenize both sides.

    Every usable line holds an English sentence and its Chinese translation
    separated by a tab. English text is lower-cased and tokenized with
    ``nltk.word_tokenize``; Chinese text is segmented with ``jieba.cut``.
    Each token list is wrapped in ``"BOS"`` / ``"EOS"`` markers.

    Lines that are blank, or that have fewer than two tab-separated fields
    after stripping, are skipped. Previously such a line (for example a
    trailing empty line) aborted loading with an ``IndexError``.

    Args:
        file: Path of the UTF-8 encoded corpus file.

    Returns:
        A pair ``(en, cn)`` of equally long lists; ``en[i]`` and ``cn[i]``
        are the token lists of the i-th usable line.
    """
    en = []
    cn = []
    with open(file, "r", encoding="utf-8") as f:
        for raw_line in f:
            fields = raw_line.strip().split("\t")
            if len(fields) < 2:
                # Blank line or missing translation column: nothing to pair.
                continue
            en.append(["BOS"] + nltk.word_tokenize(fields[0].lower()) + ["EOS"])
            # jieba.cut yields tokens lazily; materialize them into a list.
            cn.append(["BOS"] + list(jieba.cut(fields[1])) + ["EOS"])
    return en, cn

In [3]:
def build_dict(sentences, max_words=50000):
    """Build a word -> index vocabulary from tokenized sentences.

    Index 0 is reserved for ``"UNK"`` and index 1 for ``"PAD"``; the
    ``max_words`` most frequent remaining tokens receive indices 2, 3, ...
    in descending frequency order.

    A corpus token that is literally ``"UNK"`` or ``"PAD"`` is dropped from
    the frequency table before ranking. Otherwise its ranked index would be
    overwritten by the reserved one, leaving an index below ``total_words``
    with no word attached (a ``KeyError`` in any inverse lookup).

    Args:
        sentences: Iterable of token lists.
        max_words: Maximum number of non-reserved words to keep.

    Returns:
        ``(word_dict, total_words)`` where ``total_words`` counts the kept
        words plus the two reserved entries, so the indices used are exactly
        ``0 .. total_words - 1``.
    """
    UNK_IDX = 0
    PAD_IDX = 1
    reserved = {"UNK": UNK_IDX, "PAD": PAD_IDX}

    word_count = Counter(word for sentence in sentences for word in sentence)
    for token in reserved:
        # Reserved names must not compete for a frequency-ranked slot.
        word_count.pop(token, None)

    ls = word_count.most_common(max_words)
    offset = len(reserved)
    word_dict = {word: index + offset for index, (word, _) in enumerate(ls)}
    word_dict.update(reserved)
    total_words = len(ls) + offset
    return word_dict, total_words

In [4]:
def encode(en_sens, cn_sens, en_dict, cn_dict, sort_by_len=True):
    """Convert tokenized sentence pairs into lists of vocabulary indices.

    Tokens missing from a dictionary are mapped to index 0. When
    ``sort_by_len`` is true, both outputs are reordered together by ascending
    length of the encoded English sentence (ties keep their original order).

    Returns:
        ``(out_en_sens, out_cn_sens)``: two lists of index lists.
    """
    def to_ids(sentences, vocab):
        # 0 is the fallback index for out-of-vocabulary tokens.
        return [[vocab.get(token, 0) for token in sentence] for sentence in sentences]

    en_ids = to_ids(en_sens, en_dict)
    cn_ids = to_ids(cn_sens, cn_dict)

    if not sort_by_len:
        return en_ids, cn_ids

    # One permutation, derived from the English side, is applied to both
    # lists so that sentence pairs stay aligned.
    order = sorted(range(len(en_ids)), key=lambda pos: len(en_ids[pos]))
    return [en_ids[pos] for pos in order], [cn_ids[pos] for pos in order]

In [5]:
def get_mini_batches(n, sz, shuffle=True):
    """Split ``range(n)`` into consecutive index batches of size ``sz``.

    The last batch may be shorter. With ``shuffle`` enabled the order of the
    batches (not the indices inside a batch) is permuted in place using
    NumPy's global random state.

    Returns:
        A list of 1-D NumPy integer arrays.
    """
    batches = []
    for start in range(0, n, sz):
        stop = min(start + sz, n)
        batches.append(np.arange(start, stop))
    if shuffle:
        np.random.shuffle(batches)
    return batches

In [13]:
def prepare_data(seqs, padding_value=0):
    """Pad variable-length index sequences into one batch-first matrix.

    Args:
        seqs: List of index lists (one per sentence).
        padding_value: Value written into the padded positions. The default
            of 0 preserves the previous behaviour. Note that ``build_dict``
            reserves index 0 for ``"UNK"`` and index 1 for ``"PAD"``, so
            callers that want padded cells to carry the PAD index should
            pass ``padding_value=1``.

    Returns:
        ``(x_padded, lengths)``: a ``(len(seqs), max_len)`` tensor and a 1-D
        tensor holding the original length of every sequence.
    """
    lengths = torch.tensor([len(seq) for seq in seqs])
    x = [torch.tensor(seq) for seq in seqs]
    x_padded = nn.utils.rnn.pad_sequence(
        x, batch_first=True, padding_value=padding_value
    )
    return x_padded, lengths

In [14]:
def gen_examples(en_sens, cn_sens, minibatch_size):
    """Materialize padded mini-batches for a parallel corpus.

    Index batches come from ``get_mini_batches`` (called with its default
    arguments); each one is turned into padded tensors by ``prepare_data``.

    Returns:
        A list of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` tuples: the padded
        English batch with its lengths, then the padded Chinese batch with
        its lengths.
    """
    def build_batch(indices):
        en_batch = [en_sens[pos] for pos in indices]
        cn_batch = [cn_sens[pos] for pos in indices]
        x, x_len = prepare_data(en_batch)
        y, y_len = prepare_data(cn_batch)
        return x, x_len, y, y_len

    index_batches = get_mini_batches(len(en_sens), minibatch_size)
    return [build_batch(indices) for indices in index_batches]

In [15]:
# Locations of the train / dev splits of the English-Chinese corpus
# (load_data expects one tab-separated sentence pair per line).
train_file = "data/nmt/en-cn/train.txt"
dev_file = "data/nmt/en-cn/dev.txt"
# Tokenize both splits; each call returns (english_sentences, chinese_sentences).
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

In [16]:
# Vocabularies are built from the training split only; the dev split is later
# encoded with these same dictionaries.
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
# Inverse maps (index -> token), used to turn indices back into text.
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

In [17]:
# Replace the token lists with index lists; encode() also sorts each split by
# English sentence length (sort_by_len defaults to True).
# NOTE(review): these names are rebound from tokens to indices, so the cell is
# not idempotent -- re-running it on already-encoded data would look indices
# up in dictionaries keyed by tokens and presumably map everything to 0.
train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

In [18]:
# Sanity check: decode one encoded training pair back to text.
# The index is hard-coded and assumes the training split has more than 10001
# sentence pairs.
k = 10001
print(" ".join([inv_cn_dict[i] for i in train_cn[k]]))
print(" ".join([inv_en_dict[i] for i in train_en[k]]))

BOS 您 做 什麼 工作 為生 ？ EOS
BOS what do you do for a living ? EOS


In [66]:
# Pre-build all padded mini-batches once. gen_examples calls
# get_mini_batches with shuffle enabled, so batch order is randomized here,
# at creation time, and then stays fixed for every later pass over the data.
batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)

### without attention

In [30]:
class PlainEncoder(nn.Module):
    """GRU encoder: embeds source indices and summarizes them in a hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        hidden_size: Size of both the embeddings and the GRU hidden state.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # Not used by forward(). It is kept because removing it would change
        # the state_dict keys and break loading of existing checkpoints.
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, lengths):
        """Encode a padded batch.

        Args:
            x: ``(batch, seq_len)`` tensor of source indices.
            lengths: True length of every row of ``x`` (tensor or list).

        Returns:
            ``(out, hid)``: per-step GRU outputs of shape
            ``(batch, max(lengths), hidden)`` with zeros after each sequence
            end, and the final hidden state of the last GRU layer with shape
            ``(1, batch, hidden)``.
        """
        embedded = self.dropout(self.embed(x))
        # pack_padded_sequence requires the lengths on the CPU; callers in
        # this file move them to the model device first, which fails on CUDA.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()
        # Packing lets the GRU stop at each sequence's true end, so padding
        # never influences the hidden state.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Index with a list to keep the leading (layer) dimension.
        return out, hid[[-1]]

In [40]:
class PlainDecoder(nn.Module):
    """GRU decoder that predicts target-token log-probabilities.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and width
            of the output layer).
        hidden_size: Size of both the embeddings and the GRU hidden state.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, y, y_lengths, hid):
        """Run the decoder over a padded batch of target inputs.

        Args:
            y: ``(batch, seq_len)`` tensor of target input indices.
            y_lengths: True length of every row of ``y`` (tensor or list).
            hid: Initial hidden state passed to the GRU.

        Returns:
            ``(output, hid)``: log-probabilities of shape
            ``(batch, max(y_lengths), vocab_size)`` and the final GRU hidden
            state.
        """
        embedded = self.dropout(self.embed(y))

        # pack_padded_sequence requires the lengths on the CPU; callers in
        # this file hand over device tensors, which fails on CUDA.
        if torch.is_tensor(y_lengths):
            y_lengths = y_lengths.cpu()
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            embedded, y_lengths, batch_first=True, enforce_sorted=False
        )
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        # Normalize over the vocabulary dimension.
        output = F.log_softmax(self.out(unpacked), -1)

        return output, hid

In [41]:
class PlainSeq2Seq(nn.Module):
    """Encoder-decoder wrapper without attention.

    Both public methods return a pair whose second element is always
    ``None``.

    Args:
        encoder: Module called as ``encoder(x, x_lengths)`` returning
            ``(outputs, hidden)``.
        decoder: Module called as ``decoder(y, y_lengths, hidden)`` returning
            ``(log_probs, hidden)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        """Teacher-forced pass: encode ``x``, then decode the given ``y``.

        Returns:
            ``(output, None)`` where ``output`` is the decoder's result.
        """
        # Only the final hidden state conditions the decoder; the per-step
        # encoder outputs are not used without attention.
        _, hid = self.encoder(x, x_lengths)
        output, _ = self.decoder(y, y_lengths, hid)
        return output, None

    def translate(self, x, x_lengths, y, max_len=10):
        """Greedy decoding for ``max_len`` steps, starting from ``y``.

        Decoding runs in evaluation mode with gradient tracking disabled, so
        dropout cannot randomize the translation even if the model was left
        in training mode. The previous training/eval mode is restored via
        ``self.train(mode)`` before returning.

        Args:
            x: ``(batch, seq_len)`` source indices.
            x_lengths: True lengths of the rows of ``x``.
            y: ``(batch, 1)`` tensor holding the first decoder input.
            max_len: Number of tokens to generate per sentence (decoding does
                not stop early at an end-of-sentence token).

        Returns:
            ``(preds, None)`` where ``preds`` is a ``(batch, max_len)`` tensor
            of predicted indices.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                _, hid = self.encoder(x, x_lengths)
                batch_size = x.shape[0]
                # One token is fed per step. The lengths stay on the CPU
                # because pack_padded_sequence requires CPU lengths.
                step_lengths = torch.ones(batch_size, dtype=torch.long)
                preds = []
                for _ in range(max_len):
                    output, hid = self.decoder(y, step_lengths, hid)
                    # Greedy choice: the most probable token becomes the next input.
                    y = output.max(2)[1].view(batch_size, 1)
                    preds.append(y)
                return torch.cat(preds, 1), None
        finally:
            self.train(was_training)

In [42]:
class LanguageModelCriterion(nn.Module):
    """Masked negative log-likelihood averaged over the unmasked positions."""

    def forward(self, x, target, mask):
        """Compute the masked loss.

        Args:
            x: Log-probabilities, ``batch_size * seq_len * vocab_size``.
            target: Gold indices, one per (batch, step) position.
            mask: Weights of the same layout as ``target``; positions with
                weight 0 do not contribute.

        Returns:
            Scalar tensor: the summed weighted negative log-probability of
            the gold tokens divided by the sum of the mask.
        """
        vocab_size = x.size(2)
        # Flatten so that every (batch, step) position becomes one row.
        flat_logp = x.contiguous().view(-1, vocab_size)
        flat_target = target.contiguous().view(-1, 1)
        flat_mask = mask.contiguous().view(-1, 1)

        gold_logp = flat_logp.gather(1, flat_target)
        masked_nll = -gold_logp * flat_mask
        return torch.sum(masked_nll) / torch.sum(flat_mask)

In [43]:
def load_model(f):
    """Decorator adding checkpoint load/save around ``f(model, ...)``.

    Before the wrapped function runs, the weights stored at
    ``./saved_model/no_attention.pth`` are loaded into ``model`` if that file
    exists; after it returns, the model's ``state_dict`` is written back to
    the same path.

    Differences from a naive load/run/save wrapper:

    * Only the outermost decorated call loads the checkpoint. When one
      decorated function calls another (``train`` calls ``evaluate`` in this
      file), reloading inside the nested call would overwrite the weights
      currently being trained with the stale file contents. Nested calls
      still save.
    * The checkpoint directory is created on demand, so the first run does
      not fail at ``torch.save`` after the work has already been done.
    * The checkpoint is loaded onto the CPU first; ``load_state_dict`` then
      copies the values into the model's own parameters, so a file written
      on a GPU machine can be restored on a CPU-only one.

    Args:
        f: Function whose first positional argument is the model.

    Returns:
        The wrapped function; it forwards ``f``'s return value.
    """
    import functools

    PATH = "./saved_model/no_attention.pth"

    @functools.wraps(f)
    def wrapper(model, *args, **kwargs):
        # Nesting depth shared by every function decorated with load_model.
        depth = getattr(load_model, "_depth", 0)
        if depth == 0 and os.path.exists(PATH):
            model.load_state_dict(torch.load(PATH, map_location="cpu"))
        load_model._depth = depth + 1
        try:
            res = f(model, *args, **kwargs)
        finally:
            load_model._depth = depth
        os.makedirs(os.path.dirname(PATH), exist_ok=True)
        torch.save(model.state_dict(), PATH)
        return res
    return wrapper

In [44]:
@load_model
def train(model, data, nums_epoches=20):
    """Train ``model`` on pre-batched data for ``nums_epoches`` epochs.

    Relies on the notebook globals ``device``, ``loss_fn``, ``optimizer``,
    ``evaluate`` and ``dev_data``. The ``load_model`` decorator wraps the
    call with checkpoint handling (see its definition).

    Args:
        model: Called as ``model(x, x_len, y_in, y_len)``; returns
            ``(log_probs, attn)``.
        data: Iterable of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` batches.
        nums_epoches: Number of passes over ``data``.
    """
    for epoch in range(nums_epoches):
        model.train()
        words_seen = 0
        loss_sum = 0
        for step, batch in enumerate(data):
            src, src_len, tgt, tgt_len = batch
            src = src.to(device).long()
            src_len = src_len.to(device).long()
            # Teacher forcing: the decoder reads tokens [0, T-1) and has to
            # predict tokens [1, T), so every target is one step shorter.
            dec_input = tgt[:, :-1].to(device).long()
            dec_gold = tgt[:, 1:].to(device).long()
            dec_len = (tgt_len - 1).to(device).long()
            dec_len[dec_len <= 0] = 1

            pred, _ = model(src, src_len, dec_input, dec_len)

            # 1.0 for real target positions, 0.0 for padding.
            steps = torch.arange(dec_len.max().item(), device=device)
            gold_mask = (steps[None, :] < dec_len[:, None]).float()

            loss = loss_fn(pred, dec_gold, gold_mask)

            # The batch loss is a per-word mean; weight it by the word count
            # so the epoch figure is a true per-word average.
            batch_words = torch.sum(dec_len).item()
            loss_sum += loss.item() * batch_words
            words_seen += batch_words

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if step % 100 == 0:
                print("Epoch", epoch, "iteration", step, "loss", loss.item())

        print("Epoch", epoch, "Training loss", loss_sum/words_seen)
        if epoch % 5 == 0:
            evaluate(model, dev_data)

In [45]:
@load_model
def evaluate(model, data):
    """Print the per-word loss of ``model`` over pre-batched data.

    Puts the model in evaluation mode and runs without gradient tracking.
    Relies on the notebook globals ``device`` and ``loss_fn``. The
    ``load_model`` decorator wraps the call with checkpoint handling (see its
    definition).

    Args:
        model: Called as ``model(x, x_len, y_in, y_len)``; returns
            ``(log_probs, attn)``.
        data: Iterable of ``(mb_x, mb_x_len, mb_y, mb_y_len)`` batches.
    """
    model.eval()
    loss_sum = 0.
    words_seen = 0.
    with torch.no_grad():
        for src, src_len, tgt, tgt_len in data:
            src = src.to(device).long()
            src_len = src_len.to(device).long()
            # Same one-step shift between decoder input and gold output as
            # in training.
            dec_input = tgt[:, :-1].to(device).long()
            dec_gold = tgt[:, 1:].to(device).long()
            dec_len = (tgt_len - 1).to(device).long()
            dec_len[dec_len <= 0] = 1

            pred, _ = model(src, src_len, dec_input, dec_len)

            # 1.0 for real target positions, 0.0 for padding.
            steps = torch.arange(dec_len.max().item(), device=device)
            gold_mask = (steps[None, :] < dec_len[:, None]).float()

            loss = loss_fn(pred, dec_gold, gold_mask)

            batch_words = torch.sum(dec_len).item()
            loss_sum += loss.item() * batch_words
            words_seen += batch_words
    print("Evaluation loss", loss_sum/words_seen)

In [46]:
def translate_dev(i):
    """Print dev pair ``i`` followed by the model's translation of it.

    Relies on the notebook globals ``dev_en``, ``dev_cn``, ``inv_en_dict``,
    ``inv_cn_dict``, ``cn_dict``, ``model`` and ``device``. Three lines are
    printed: the English source, the reference Chinese tokens, and the
    predicted tokens joined without spaces and cut at the first ``"EOS"``.
    """
    source_ids = dev_en[i]
    print(" ".join(inv_en_dict[w] for w in source_ids))
    print(" ".join(inv_cn_dict[w] for w in dev_cn[i]))

    # Batch of one sentence: shape (1, len) plus its length.
    mb_x = torch.tensor(source_ids, dtype=torch.long, device=device).view(1, -1)
    mb_x_len = torch.tensor([len(source_ids)], dtype=torch.long, device=device)
    # Decoding starts from the BOS token.
    bos = torch.tensor([[cn_dict["BOS"]]], dtype=torch.long, device=device)

    predicted, _ = model.translate(mb_x, mb_x_len, bos)
    words = [inv_cn_dict[idx] for idx in predicted.reshape(-1).tolist()]
    # Keep everything before the first end-of-sentence marker.
    cut = words.index("EOS") if "EOS" in words else len(words)
    print("".join(words[:cut]))

In [47]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyper-parameters shared by the encoder and the decoder.
dropout = 0.2
hidden_size = 100
# The encoder reads English indices, the decoder emits Chinese indices, so
# each uses the vocabulary size of its own language.
encoder = PlainEncoder(vocab_size=en_total_words,
                      hidden_size=hidden_size,
                      dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words,
                      hidden_size=hidden_size,
                      dropout=dropout)
model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)
# These module-level objects are read as globals by train() and evaluate().
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

In [48]:
# Train for 100 epochs. train() is wrapped by load_model, so an existing
# checkpoint is loaded before training and the weights are saved afterwards.
train(model, train_data, 100)

Epoch 0 iteration 0 loss 9.344791412353516
Epoch 0 iteration 100 loss 5.557158946990967
Epoch 0 iteration 200 loss 5.195974826812744
Epoch 0 Training loss 5.895928513083067
Evaluation loss 5.207137123912003
Epoch 1 iteration 0 loss 5.177928447723389
Epoch 1 iteration 100 loss 5.007741928100586
Epoch 1 iteration 200 loss 4.733947277069092
Epoch 1 Training loss 4.9570539737987955
Epoch 2 iteration 0 loss 4.783565044403076
Epoch 2 iteration 100 loss 4.671952724456787
Epoch 2 iteration 200 loss 4.449291229248047
Epoch 2 Training loss 4.634700314986502
Epoch 3 iteration 0 loss 4.506902694702148
Epoch 3 iteration 100 loss 4.426587104797363
Epoch 3 iteration 200 loss 4.198984622955322
Epoch 3 Training loss 4.397497916280977
Epoch 4 iteration 0 loss 4.277348041534424
Epoch 4 iteration 100 loss 4.226634979248047
Epoch 4 iteration 200 loss 4.011811256408691
Epoch 4 Training loss 4.193308606073926
Epoch 5 iteration 0 loss 4.072834014892578
Epoch 5 iteration 100 loss 4.041784763336182
Epoch 5 iter

In [97]:
# Spot-check translation quality on dev sentences 100..119; each call prints
# the source, the reference and the model output, followed by a blank line.
for i in range(100,120):
    translate_dev(i)
    print()

BOS you have nice skin . EOS
BOS 你 的 皮膚 真好 。 EOS
你的時候你。

BOS you &#39;re UNK correct . EOS
BOS 你 UNK 正确 。 EOS
你的狗。

BOS everyone admired his courage . EOS
BOS 每個 人 都 佩服 他 的 勇氣 。 EOS
他的人都是我的。

BOS what time is it ? EOS
BOS 几点 了 ？ EOS
你的名字？

BOS i &#39;m free tonight . EOS
BOS 我 今晚 有空 。 EOS
我不喜欢。

BOS here is your book . EOS
BOS 這是 你 的 書 。 EOS
你的人是你的。

BOS they are at lunch . EOS
BOS 他们 在 吃 午饭 。 EOS
他們在這裡。

BOS this chair is UNK . EOS
BOS 這把 椅子 UNK 。 EOS
这是个的。

BOS it &#39;s pretty heavy . EOS
BOS 它 UNK 。 EOS
这是我的。

BOS many attended his funeral . EOS
BOS 很多 人 都 参加 了 他 的 UNK 。 EOS
他是个的人。

BOS training will be provided . EOS
BOS 会 有 训练 。 EOS
请是个的。

BOS someone is watching you . EOS
BOS 有人 在 看 著 你 。 EOS
你的時候你。

BOS i slapped his face . EOS
BOS 我 摑 了 他 的 臉 。 EOS
我的朋友。

BOS i like UNK music . EOS
BOS 我 喜歡 流行 音樂 。 EOS
我不喜欢。

BOS tom had no children . EOS
BOS Tom 沒有 孩子 。 EOS
汤姆在這裡。

BOS please lock the door . EOS
BOS 請 把 UNK 上 。 EOS
請把我的。

BOS tom has calmed down . EOS
BOS 汤姆 冷静下来 了 。 EOS
汤姆在這