In [10]:
import sys, os
import collections
import zipfile
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim

from util import *
from model import *

root_path = os.path.abspath('../../')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
src_url = os.path.join(root_path, 'inputs/translate/fra-eng/fra.txt')
with open(src_url , 'r') as f:
      raw_text = f.read()
tp = TextPreprocessor(raw_text, num_lines = 50000)
print('\n', len(tp.src_vocab), len(tp.tar_vocab))


 3774 5085


In [12]:
tu = TextUtil(tp, max_len = 10)
src_vocab, tar_vocab, train_loader = tu.load_data_nmt(batch_size = 2)

In [13]:
for X, X_valid_len, Y, Y_valid_len, in train_loader:
    print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
        '\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
    break

X = tensor([[   1,  211,  393,  509,    5,    2,    0,    0,    0,    0],
        [   1,   17,  137,   22, 2153, 2788,    5,    2,    0,    0]],
       dtype=torch.int32) 
Valid lengths for X = tensor([6, 8]) 
Y = tensor([[  94,  388,  956, 3879,    7,    0,    0,    0,    0,    0],
        [  26, 1161,   71,  177,  489, 4683, 1925,    7,    0,    0]],
       dtype=torch.int32) 
Valid lengths for Y = tensor([5, 8])


In [14]:
encoder = Seq2SeqEncoder(10, 8, 32, 2)
X = torch.zeros((4, 7), dtype=torch.long)
output, encoded_state = encoder(X)
print(output.shape, len(encoded_state), encoded_state[0].shape, encoded_state[1].shape)

decoder = Seq2SeqDecoder(10, 8, 32, 2)
state = decoder.init_state((output, encoded_state))
out, state = decoder(X, state)
print(out.shape, len(state), state[0].shape, state[1].shape)

torch.Size([7, 4, 32]) 2 torch.Size([2, 4, 32]) torch.Size([2, 4, 32])
torch.Size([4, 7, 10]) 2 torch.Size([2, 4, 32]) torch.Size([2, 4, 32])


In [15]:
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0]))

cpu cpu


tensor([ 2.3026e+00, -5.7564e+05, -2.3026e+06])

In [16]:
def train_ch7(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs+1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
            
            Y_hat, _ = model(X, Y_input)
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()

            with torch.no_grad():
                grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format( 
                  epoch, (l_sum/num_tokens_sum), time.time()-tic))
            tic = time.time()

In [12]:
# embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.3
# batch_size, num_examples, max_len = 64, 1e3, 10
# lr, num_epochs = 0.005, 300
# tp = TextPreprocessor(raw_text, num_lines=num_examples)
# tu = TextUtil(tp, max_len = max_len)
# src_vocab, tar_vocab, train_loader = tu.load_data_nmt(batch_size = batch_size)
# encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
# decoder = Seq2SeqDecoder(len(tar_vocab), embed_size, num_hiddens, num_layers, dropout)
# model = EncoderDecoder(encoder, decoder)
# train_ch7(model, train_loader, lr, num_epochs, device=device)

In [13]:
# !cat /home/kesci/input/d2l9528/d2l/train.py

In [46]:
def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    if src_len < max_len:
        src_tokens += [src_vocab.pad] * (max_len - src_len)
    enc_X = torch.tensor(src_tokens, device=device)
    enc_valid_length = torch.tensor([src_len], device=device)
    # use expand_dim to add the batch_size dimension.
    encoded_state = model.encoder(enc_X.unsqueeze(dim=0), enc_valid_length)[1]
    dec_state = model.decoder.init_state(encoded_state, enc_valid_length)
    dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next time step input.
        dec_X = Y.argmax(dim=2)
        py = dec_X.squeeze(dim=0).int().item()
        if py == tgt_vocab.eos:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))

In [47]:
for sentence in ['What is your name ? .', 'How are you ?', "I'm OK .", 'egg !', 'I like milk']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tar_vocab, max_len, device))

What is your name ? . =>   . de  . de  . de
How are you ? =>  calme . de  .  . . là
I'm OK . => va emporté .  .  . . ? ?
egg ! =>  bon !  ! nous ?  . ?
I like milk =>  . . de la  . de  .


In [48]:
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tar_vocab, max_len, device))

Go . =>  ! feu ! ! maison . pas de 
Wow ! =>  !  .  . . ? ? 
I'm OK . => va fait . . ? . nous ? . ?
I won ! => un boulot !  .  . . pas .


In [14]:
from model import *
import math

dtype = torch.float32
def get_random_qkv(batch_size, step, dim):
    query = torch.ones((batch_size, 1, dim), dtype=dtype)
    value      = torch.arange(step * dim, dtype=dtype).view(1, step, dim).repeat(batch_size, 1, 1)
    key = torch.ones((batch_size, step, dim), dtype=dtype)
    return query, key, value


In [15]:
# atten = DotProductAttention(0)
batch_size, step, dim = 5, 10 ,16
atten = MLPAttention(dim, dim, 0)
# atten = MLPAttention_v2(dim, dim, 0)
q, k, v = get_random_qkv(batch_size, step, dim)
print(q.shape, k.shape, v.shape)
valid_len = torch.tensor([ 0,  2,  8,  6, 5], dtype=dtype)
print(valid_len)
atten(q, k , v, valid_len ).shape

torch.Size([5, 1, 16]) torch.Size([5, 10, 16]) torch.Size([5, 10, 16])
tensor([0., 2., 8., 6., 5.])
cpu cpu


torch.Size([5, 1, 16])

In [20]:
import torch
import torch.nn as nn
class AttentionSeq2SeqDecoder(Decoder):
    def __init__(self,  vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs):
        super(AttentionSeq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens,  num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)
        self.atten = MLPAttention(num_hiddens, num_hiddens, dropout=dropout)
    
    def init_state(self, enc_outputs, enc_valid_len=None, *args):
        outputs, hidden_state = enc_outputs
        return (outputs.permute(1, 0, -1), hidden_state, enc_valid_len)
    
    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).transpose(0,1)
        outputs = []
        for x in X:
            query = hidden_state[0][-1].unsqueeze(1)
            context = self.atten(query, enc_outputs, enc_outputs, enc_valid_len)
            x = torch.cat((context, x.unsqueeze(1)), dim=-1)
            out, hidden_state = self.rnn(x.transpose(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.transpose(0, 1), [enc_outputs, hidden_state, enc_valid_len]
    
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    if src_len < max_len:
        src_tokens += [src_vocab.pad] * (max_len - src_len)
    enc_X = torch.tensor(src_tokens, device=device)
    enc_valid_length = torch.tensor([src_len], device=device)
    # use expand_dim to add the batch_size dimension.
    enc_outputs = model.encoder(enc_X.unsqueeze(dim=0), enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
    dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next time step input.
        dec_X = Y.argmax(dim=2)
        py = dec_X.squeeze(dim=0).int().item()
        if py == tgt_vocab.eos:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))

def train_s2s_ch9(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs+1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
#             print(Y_vlen.device)
            Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()

            with torch.no_grad():
                grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format( 
                  epoch, (l_sum/num_tokens_sum), time.time()-tic))
            tic = time.time()

In [21]:
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
decoder = AttentionSeq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7), dtype=torch.long)
print("batch size=4\nseq_length=7\nhidden dim=16\nnum_layers=2\n")
print('encoder output size:', encoder(X)[0].size())
print('encoder hidden size:', encoder(X)[1][0].size())
print('encoder memory size:', encoder(X)[1][1].size())
state = decoder.init_state(encoder(X), None)
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape

batch size=4
seq_length=7
hidden dim=16
num_layers=2

encoder output size: torch.Size([7, 4, 16])
encoder hidden size: torch.Size([2, 4, 16])
encoder memory size: torch.Size([2, 4, 16])


(torch.Size([4, 7, 10]), 3, torch.Size([4, 7, 16]), 2, torch.Size([2, 4, 16]))

In [22]:
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.3
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs = 0.005, 300
tp = TextPreprocessor(raw_text, num_lines=num_examples)
tu = TextUtil(tp, max_len = max_len)
src_vocab, tar_vocab, train_loader = tu.load_data_nmt(batch_size = batch_size)
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = AttentionSeq2SeqDecoder(len(tar_vocab), embed_size, num_hiddens, num_layers, dropout)
model = EncoderDecoder(encoder, decoder)
train_s2s_ch9(model, train_loader, lr, num_epochs, device=device)

cpu
cpu cuda:0


RuntimeError: expected device cuda:0 but got device cpu