In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders, text2tokens
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, EncoderDecoderAttn, DecoderAttnRNN
from train_eval import count_parameters, summarize_results, plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval
import importlib
import pickle as pkl 
import torch

In [2]:
# Experiment configuration: every tunable value for this run is set in this cell
# and later collected into the `params` dict.

# model identification
# NOTE(review): the name says "vanilla" although USE_ATTN is True below and the
# model cell builds an attention decoder -- confirm the name is intended, since it
# is stored in `params` under 'model_name'.
MODEL_NAME = 'zh-seq2seq-rnn-vanilla'
SRC_LANG = 'zh'
TARG_LANG = 'en'

# data processing params
SRC_MAX_SENTENCE_LEN = 10
TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = 30000 # source-language vocabulary size
TARG_VOCAB_SIZE = 30000 # target-language vocabulary size

# model architecture params
RNN_CELL_TYPE = 'gru'
NUM_LAYERS = 2 # passed to both the encoder and the decoder
ENC_HIDDEN_DIM = 256 # author's note lists 512 as an alternative value
DEC_HIDDEN_DIM = 2 * ENC_HIDDEN_DIM # always twice the encoder size (presumably for a bidirectional encoder -- TODO confirm)
TEACHER_FORCING_RATIO = 1 # presumably 1 means "always teacher-force" -- confirm in train_eval
CLIP_GRAD_MAX_NORM = 1
ENC_DROPOUT = 0.2 # TODO (author's note): to actually implement
DEC_DROPOUT = 0.2 # TODO (author's note): to actually implement
USE_ATTN = True

# training params
BATCH_SIZE = 32 # also reused as the sample limit for the mini-batch dataset
NUM_EPOCHS = 200
LR = 0.0003 # author's note lists 0.0005 as an alternative value
OPTIMIZER = 'Adam'
LAZY_TRAIN = True # semantics are defined in train_eval -- TODO document here

In [3]:
# Bundle every hyperparameter into a single dict so it can be saved with the results later.
params = dict(
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    rnn_cell_type=RNN_CELL_TYPE,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    use_attn=USE_ATTN,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [4]:
# # takes a long time to process, save to pickle for reimport in future 
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Reload the vocab from its pickle file instead of regenerating it.
# NOTE: unpickling can execute arbitrary code -- only load vocab files you produced yourself.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
with open(vocab_filename, "rb") as vocab_file:  # context manager guarantees the handle is closed
    vocab = pkl.load(vocab_file)

# Three datasets built with identical settings, differing only in `sample_limit`:
#   data           -- no sample limit passed
#   data_minibatch -- sample_limit=BATCH_SIZE
#   data_minitrain -- sample_limit=1000
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, filter_long=False)
data_minibatch = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=BATCH_SIZE, filter_long=False)
data_minitrain = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=1000, filter_long=False)

In [6]:
# Wrap each dataset in dataloaders; all three share the same length limits and batch size.
loaders_full, loaders_minibatch, loaders_minitrain = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
)

In [7]:
# Build the model: the encoder first, then the decoder variant currently under test.

src_pretrained_emb = get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id'])
encoder = EncoderRNN(
    rnn_cell_type=RNN_CELL_TYPE,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    enc_dropout=ENC_DROPOUT,
    pretrained_word2vec=src_pretrained_emb,
)

# Alternative 1 (disabled): decoder without attention
# decoder = DecoderRNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
#                      targ_vocab_size=TARG_VOCAB_SIZE, targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
#                      pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
# model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])

# Alternative 2 (disabled): decoder with additive attention
# decoder = DecoderAttnRNN(rnn_cell_type=RNN_CELL_TYPE, dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM,
#                          num_layers=NUM_LAYERS, targ_vocab_size=TARG_VOCAB_SIZE, src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
#                          targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, dec_dropout=DEC_DROPOUT, attention_type='additive',
#                          pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
# model = EncoderDecoderAttn(encoder, decoder, vocab[TARG_LANG]['token2id'])

# Active choice: decoder with multiplicative attention
targ_pretrained_emb = get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id'])
decoder = DecoderAttnRNN(
    rnn_cell_type=RNN_CELL_TYPE,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    dec_dropout=DEC_DROPOUT,
    attention_type='multiplicative',
    pretrained_word2vec=targ_pretrained_emb,
)
model = EncoderDecoderAttn(encoder, decoder, vocab[TARG_LANG]['token2id'])

In [8]:
# Run training and evaluation; the call hands back the model together with a `results`
# object that the plotting cells below consume.
model, results = train_and_eval(
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    print_intermediate=10000,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=False,
    print_attn=True,
    inspect_samples=1,
)

Finished gradient updates at 1.7702600955963135
Evaluated on validation set at 2.1121270656585693 seconds
Evaluated on training set at 2.4443230628967285 seconds
Appended results at 2.444643259048462 seconds
Epoch: 0.00, Train Loss: 10.18, Val Loss: 10.26, Train BLEU: 2.23, Val BLEU: 0.17, Minutes Elapsed: 0.04
Sampling from training predictions...
Source: 我们 将 用 一些 影片 来讲 讲述 一些 深海 海里
Reference: and we &apos;re going to tell you some stories
Model: <SOS> &apos;s the the the the the the the the
Attention Weights: tensor([[2.0814e-04, 4.1357e-02, 2.9225e-01, 1.8755e-01, 2.5040e-01, 1.6599e-01,
         5.5885e-02, 5.9584e-03, 3.8446e-04, 4.9128e-06],
        [5.4272e-03, 7.4556e-02, 2.1089e-01, 1.8136e-01, 2.2277e-01, 1.7409e-01,
         9.2658e-02, 2.9833e-02, 7.5984e-03, 8.1563e-04],
        [2.2758e-02, 9.1142e-02, 1.6631e-01, 1.6324e-01, 1.9013e-01, 1.6013e-01,
         1.0955e-01, 6.0266e-02, 2.8474e-02, 8.0026e-03],
        [5.1483e-02, 9.9991e-02, 1.3745e-01, 1.4180e-01, 1.5429e-0

Saved checkpoint at 8.485828161239624 seconds
Finished gradient updates at 10.203670024871826
Evaluated on validation set at 10.530593156814575 seconds
Evaluated on training set at 10.861161947250366 seconds
Appended results at 10.861316204071045 seconds
Epoch: 3.00, Train Loss: 9.55, Val Loss: 10.02, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.18
Sampling from training predictions...
Source: 我们 得用 非常 特殊 的 仪器 才能 能到 到达 那个
Reference: we have to have a very special technology to
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[1.5474e-08, 7.3774e-08, 5.1423e-04, 5.8693e-02, 8.4970e-01, 8.9045e-02,
         2.0430e-03, 9.7378e-07, 7.2937e-06, 2.7249e-08],
        [2.0058e-05, 3.9815e-05, 8.2459e-03, 1.4830e-01, 6.6451e-01, 1.5972e-01,
         1.8413e-02, 1.9056e-04, 5.4368e-04, 1.4602e-05],
        [3.0665e-04, 4.2790e-04, 2.1055e-02, 1.9301e-01, 5.6003e-01, 1.8187e-01,
         3.9506e-02, 1.2290e-03, 2.4223e-03, 1.4258e-04],
        [1.3132e-03, 1.92

Saved checkpoint at 16.701471090316772 seconds
Finished gradient updates at 18.49996519088745
Evaluated on validation set at 18.802311182022095 seconds
Evaluated on training set at 19.13669204711914 seconds
Appended results at 19.13707208633423 seconds
Epoch: 6.00, Train Loss: 8.52, Val Loss: 9.61, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.32
Sampling from training predictions...
Source: 大卫 <UNK> 通过 潜水 潜水艇 拍下 的 影片 把 我们
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[1.0324e-25, 6.1997e-23, 3.3924e-13, 4.4032e-09, 2.6853e-10, 1.0525e-03,
         9.6070e-01, 3.8198e-02, 4.8329e-05, 4.6713e-15],
        [3.8198e-16, 1.6979e-14, 1.6273e-08, 5.9481e-06, 1.6523e-06, 1.7634e-02,
         8.8756e-01, 9.3879e-02, 9.2386e-04, 5.4236e-10],
        [4.6926e-13, 9.9877e-12, 4.9523e-07, 5.4630e-05, 2.5450e-05, 4.1052e-02,
         8.3186e-01, 1.2454e-01, 2.4637e-03, 2.6227e-08],
        [7.092

Saved checkpoint at 25.02724814414978 seconds
Finished gradient updates at 27.732924222946167
Evaluated on validation set at 28.113031148910522 seconds
Evaluated on training set at 28.61577820777893 seconds
Appended results at 28.61588406562805 seconds
Epoch: 9.00, Train Loss: 7.49, Val Loss: 9.20, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.48
Sampling from training predictions...
Source: 我 真 喜欢 这些 东西 <EOS> <PAD> <PAD> <PAD> <PAD>
Reference: i love that kind of stuff . <EOS> <PAD>
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0428, 0.9572, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0762, 0.9214, 0.0024, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0667, 0.9224, 0.0110, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0552, 0.9228, 0.0221, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0493, 0.9195, 0.0312,

Saved checkpoint at 34.82652020454407 seconds
Finished gradient updates at 36.64821124076843
Evaluated on validation set at 36.97070908546448 seconds
Evaluated on training set at 37.315118074417114 seconds
Appended results at 37.31521916389465 seconds
Epoch: 12.00, Train Loss: 6.57, Val Loss: 8.85, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.62
Sampling from training predictions...
Source: 大卫 <UNK> 这位 是 比尔 <UNK> 我 是 大卫 <UNK>
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[0.0000, 0.0000, 0.0000, 0.0002, 0.0000, 0.0000, 0.9996, 0.0002, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0071, 0.0000, 0.0006, 0.9906, 0.0018, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0158, 0.0000, 0.0043, 0.9746, 0.0052, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0237, 0.0002, 0.0079, 0.9591, 0.0091, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0298, 0.00

Saved checkpoint at 43.54687428474426 seconds
Finished gradient updates at 45.90603423118591
Evaluated on validation set at 46.297362327575684 seconds
Evaluated on training set at 46.702704191207886 seconds
Appended results at 46.70283222198486 seconds
Epoch: 15.00, Train Loss: 5.79, Val Loss: 8.63, Train BLEU: 0.28, Val BLEU: 0.19, Minutes Elapsed: 0.78
Sampling from training predictions...
Source: 我们 将 用 一些 影片 来讲 讲述 一些 深海 海里
Reference: and we &apos;re going to tell you some stories
Model: <SOS> the the the the the the the the the
Attention Weights: tensor([[6.4151e-41, 1.4794e-15, 2.5837e-05, 3.0944e-02, 5.9068e-01, 3.7360e-01,
         4.7494e-03, 5.7849e-08, 3.0484e-19, 2.8026e-44],
        [4.4546e-24, 6.6034e-11, 1.7011e-04, 8.0866e-02, 6.5132e-01, 2.5907e-01,
         8.5676e-03, 5.8062e-06, 9.0310e-13, 1.0399e-25],
        [4.2198e-21, 6.2620e-10, 3.3910e-04, 1.0224e-01, 6.4416e-01, 2.4187e-01,
         1.1370e-02, 2.1857e-05, 3.9990e-11, 1.7156e-21],
        [4.3660e-20, 1.455

KeyboardInterrupt: 

In [None]:
# Show the first rows of the summarized experiment log, restricted to the columns of interest.
log_columns = ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
summarize_results(load_experiment_log())[log_columns].head()

In [None]:
# Plot the learning curve from the `results` object returned by train_and_eval.
plot_single_learning_curve(results)

In [None]:
# Metrics recorded by the author (presumably the last epoch of an earlier run -- confirm):
# Epoch: 199.00, Train Loss: 0.32, Val Loss: 13.19, Train BLEU: 98.94, Val BLEU: 0.27
# The gap between train and validation numbers above indicates heavy overfitting.
plot_single_learning_curve(results)

In [None]:
# Metrics recorded by the author for the attention variant whose energies were computed as
#     energies = v_broadcast.bmm(torch.tanh(self.attn(concat)).transpose(1, 2))  # switched order
# Epoch: 199.00, Train Loss: 0.63, Val Loss: 12.82, Train BLEU: 92.05, Val BLEU: 0.38
plot_single_learning_curve(results)

In [None]:
# Peek at the first 20 source-vocabulary entries as "index: token".
# Pairing with range(20) ends the loop after 20 items instead of walking the whole vocabulary.
for i, token in zip(range(20), vocab[SRC_LANG]['id2token']):
    print("{}: {}".format(i, token))

In [None]:
# Peek at the first 20 target-vocabulary entries as "index: token".
# Pairing with range(20) ends the loop after 20 items instead of walking the whole vocabulary.
for i, token in zip(range(20), vocab[TARG_LANG]['id2token']):
    print("{}: {}".format(i, token))

In [None]:
import torch

# Scratch check: drop the first slab along dim 0, then flatten what remains into rows of width 10.
x = torch.arange(3 * 5 * 10).view(3, 5, 10)
print(x)
y = x[1:]
print(y)
z = y.view(-1, 10)
print(z)

In [None]:
# Scratch check: flattening a tensor versus flattening its transpose yields different element orders.
t = torch.arange(10).view(5, 2)
print(t)
u = t.contiguous().view(-1)
print(u)
v = t.transpose(0, 1)
print(v)
# The transposed tensor is not contiguous, so .contiguous() is required before .view().
w = v.contiguous().view(-1)
print(w)

In [None]:
# Scratch check: reinterpret a flat range of 600 values as (N, 1, 300) and report the resulting shape.
a = torch.arange(2 * 1 * 300)
print(a)
b = a.view(-1, 1, 300)
print(b.shape)

In [None]:
# Take the first training batch only and turn its source ids back into token strings.
for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(loaders_full['train']):
    # Debug prints kept for reference:
#     print(i)
#     print(src_idxs.size())
#     print(src_idxs)
#     print(src_lens)
#     print(targ_idxs.size())
#     print(targ_idxs)
#     print(targ_lens)
    id2token = vocab[SRC_LANG]['id2token']

    def to_token(l):
        """Join the tokens for one row of ids into a space-separated string."""
        return ' '.join([id2token[idx] for idx in l])

    test_tensor = src_idxs  # same object as src_idxs, not a copy
    list_of_lists = test_tensor.numpy().astype(int).tolist()
    list_of_lists_tokens = [to_token(l) for l in list_of_lists]
    break  # only the first batch is needed

In [None]:
# Scratch experiment mirroring the attention-masking call used in the model:
#     attn.data.masked_fill_(self.mask, -float('inf'))
# Every position whose id equals 2 is overwritten with +inf.
# NOTE(review): masked_fill_ works in place, and test_tensor is the same object as src_idxs,
# so the batch itself is modified. Filling with float('inf') presumably needs a
# floating-point tensor -- confirm the dtype of src_idxs, and that id 2 is the intended token.
test_tensor.data.masked_fill_(test_tensor == 2, float('inf'))