In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders 
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, DecoderSimpleRNN, EncoderSimpleRNN, \
    Attention, DecoderAttnRNN, DecoderRNNV2
from train_eval import train_and_eval, inspect_model, count_parameters, summarize_results, \
    plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval_V3 #, tensor2corpus_V2
import importlib
import pickle as pkl 

In [15]:
# ---- Model identification ----
MODEL_NAME = 'test_model'
SRC_LANG, TARG_LANG = 'zh', 'en'

# ---- Data processing ----
SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN = 10, 10
SRC_VOCAB_SIZE, TARG_VOCAB_SIZE = 30000, 30000

# ---- Model architecture ----
NUM_LAYERS = 2
ENC_HIDDEN_DIM = 300
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM  # alternative noted by the author: 2 * ENC_HIDDEN_DIM
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1
ENC_DROPOUT, DEC_DROPOUT = 0, 0  # TODO: dropout still has to be implemented

# ---- Training ----
BATCH_SIZE = 5  # alternative noted by the author: 32
NUM_EPOCHS = 200
LR = 0.0005
OPTIMIZER = 'Adam'
LAZY_TRAIN = True

In [16]:
# Snapshot of every hyperparameter above, keyed by lower-case name, so the
# configuration can be stored alongside the experiment results later.
params = dict(
    # model identification
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    # data processing
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    # model architecture
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    # training
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [17]:
# NOTE: slow one-time step -- uncomment to rebuild the vocab and cache it to a pickle for later sessions.
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [18]:
# Reload the cached vocabulary from its pickle instead of regenerating it.
# NOTE(security): unpickling can execute arbitrary code -- only load a vocab
# file that was produced locally by this project.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
with open(vocab_filename, "rb") as vocab_file:  # context manager closes the handle
    vocab = pkl.load(vocab_file)

# Three views of the corpus: everything, a single batch worth of samples,
# and a 10,000-sample subset.
data = process_data(SRC_LANG, TARG_LANG, vocab)
data_1Batch = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=BATCH_SIZE)
data_10K = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=10000)

In [19]:
# NOTE: slow one-time step -- uncomment to rebuild the vocab and cache it to the "-fake" pickle for later sessions.
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab-fake.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [20]:
# vocab_filename = "{}-{}-vocab-fake.p".format(SRC_LANG, TARG_LANG)
# vocab = pkl.load(open(vocab_filename, "rb"))
# data = process_data(SRC_LANG, TARG_LANG, vocab)
# limited_data = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=BATCH_SIZE) 

In [21]:
# Build one set of dataloaders per dataset (full corpus, single-batch subset,
# 10K subset); all three share the same length limits and batch size.
loaders_full, loaders_lazy, loaders_10K = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_1Batch, data_10K)
)

In [22]:
# Assemble the seq2seq model: EncoderSimpleRNN -> DecoderRNNV2.
#
# Other components imported from model.py can be swapped in with the same
# keyword arguments: EncoderRNN for the encoder, DecoderSimpleRNN for the
# decoder. DecoderAttnRNN additionally takes
# src_max_sentence_len=SRC_MAX_SENTENCE_LEN.

src_pretrained_emb = get_pretrained_emb(
    vocab[SRC_LANG]['word2vec'],
    vocab[SRC_LANG]['token2id'],
)
encoder = EncoderSimpleRNN(
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    pretrained_word2vec=src_pretrained_emb,
)

targ_pretrained_emb = get_pretrained_emb(
    vocab[TARG_LANG]['word2vec'],
    vocab[TARG_LANG]['token2id'],
)
decoder = DecoderRNNV2(
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    pretrained_word2vec=targ_pretrained_emb,
)

model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])

In [23]:
# Sanity check: show the 20 lowest-id source-language tokens.
# Slice first so only 20 entries are visited rather than the whole vocabulary.
for i, token in enumerate(vocab[SRC_LANG]['id2token'][:20]):
    print("{}: {}".format(i, token))

0: <SOS>
1: <EOS>
2: <PAD>
3: <UNK>
4: 的
5: 我
6: 是
7: 我们
8: 在
9: 了
10: 你
11: 这
12: 一个
13: 他们
14: 和
15: 有
16: 它
17: 就
18: 这个
19: 他


In [24]:
# Sanity check: show the 20 lowest-id target-language tokens.
# Slice first so only 20 entries are visited rather than the whole vocabulary.
for i, token in enumerate(vocab[TARG_LANG]['id2token'][:20]):
    print("{}: {}".format(i, token))

0: <SOS>
1: <EOS>
2: <PAD>
3: <UNK>
4: ,
5: .
6: the
7: and
8: to
9: of
10: a
11: that
12: i
13: in
14: it
15: you
16: we
17: is
18: &apos;s
19: this


In [25]:
import torch

# Scratch check of how slicing the leading dimension interacts with .view().
x = torch.arange(150).view(3, 5, 10)  # values 0..149 laid out as 3 x 5 x 10
print(x)

y = x[1:]  # drop index 0 along the first dimension -> 2 x 5 x 10
print(y)

z = y.view(-1, 10)  # merge the first two dimensions -> 10 x 10
print(z)

tensor([[[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9],
         [ 10,  11,  12,  13,  14,  15,  16,  17,  18,  19],
         [ 20,  21,  22,  23,  24,  25,  26,  27,  28,  29],
         [ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39],
         [ 40,  41,  42,  43,  44,  45,  46,  47,  48,  49]],

        [[ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59],
         [ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69],
         [ 70,  71,  72,  73,  74,  75,  76,  77,  78,  79],
         [ 80,  81,  82,  83,  84,  85,  86,  87,  88,  89],
         [ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99]],

        [[100, 101, 102, 103, 104, 105, 106, 107, 108, 109],
         [110, 111, 112, 113, 114, 115, 116, 117, 118, 119],
         [120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
         [130, 131, 132, 133, 134, 135, 136, 137, 138, 139],
         [140, 141, 142, 143, 144, 145, 146, 147, 148, 149]]])
tensor([[[ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59],
         [ 60,  61

In [26]:
# Scratch check: flattening before vs. after swapping the two dimensions
# yields a different element order.
t = torch.arange(10).view(5, 2)
print(t)

u = t.contiguous().view(-1)  # flatten in the original row-major order
print(u)

v = t.permute(1, 0)  # swap the two dimensions -> 2 x 5
print(v)

w = v.contiguous().view(-1)  # flatten after the swap: evens first, then odds
print(w)

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor([[0, 2, 4, 6, 8],
        [1, 3, 5, 7, 9]])
tensor([0, 2, 4, 6, 8, 1, 3, 5, 7, 9])


In [None]:
# Run training + evaluation. The call hands back the model together with a
# `results` object, which the plotting cell further down consumes.
model, results = train_and_eval_V3(
    model=model,
    loaders_full=loaders_full,
    loaders_lazy=loaders_lazy,
    loaders_10K=loaders_10K,
    params=params,
    vocab=vocab,
    print_intermediate=True,
    save_checkpoint=True,
    save_to_log=True,
    print_summary=True,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 10.02, Val Loss: 10.32, Train BLEU: 0.23, Val BLEU: 0.12
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> in in deep deep deep deep seated seated seated

Sampling from val predictions...
Source: 他 面带 <UNK> <UNK> 笑容 这 很少 少见 因为 大部
Reference: there was a big smile on his face which
Model: <SOS> developmentally residency residency loud casework developer spoiler spoiler spoiler

Epoch: 1.00, Train Loss: 9.73, Val Loss: 10.31, Train BLEU: 5.55, Val BLEU: 0.81
Sampling from training predictions...
Source: 我们 这 有 不少 精彩 的 泰坦 泰坦尼克 坦尼 尼克
Reference: we &apos;ve got some of the most incredible video
Model: <SOS> we we we residency incredible incredible incredible spoiler spoiler

Sampling from val predictions...
Source: 我 的 父亲 在 用 他 的 灰色 小 收音
Reference: my father was listening to bbc news on his
Model: <SOS> we we &apos;re &apos;re &apos;re andres andres andres 

Epoch: 18.00, Train Loss: 3.25, Val Loss: 10.58, Train BLEU: 12.18, Val BLEU: 0.75
Sampling from training predictions...
Source: 我们 将 用 一些 影片 来讲 讲述 一些 深海 海里
Reference: and we &apos;re going to tell you some stories
Model: <SOS> we we &apos;re &apos;re &apos;re to to to to

Sampling from val predictions...
Source: 我 不知 知道 那 意味 意味着 什么 但是 我 能
Reference: i didn &apos;t know what it meant , but
Model: <SOS> we we &apos;re &apos;re &apos;re to to to to

Epoch: 19.00, Train Loss: 3.03, Val Loss: 10.75, Train BLEU: 14.52, Val BLEU: 0.90
Sampling from training predictions...
Source: 大卫 <UNK> 通过 潜水 潜水艇 拍下 的 影片 把 我们
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> and we &apos;re &apos;re to to to clips clips

Sampling from val predictions...
Source: 我 的 父亲 在 用 他 的 灰色 小 收音
Reference: my father was listening to bbc news on his
Model: <SOS> we we &apos;re &apos;re to to to to to

Epoch: 20.00, Train Loss: 2.85, Val Loss: 10.94, Train BLEU: 14.70, Val BLEU: 0.90
Sampli

Epoch: 36.00, Train Loss: 1.53, Val Loss: 13.28, Train BLEU: 56.93, Val BLEU: 1.14
Sampling from training predictions...
Source: 我们 将 用 一些 影片 来讲 讲述 一些 深海 海里
Reference: and we &apos;re going to tell you some stories
Model: <SOS> and we &apos;re going to tell tell stories stories

Sampling from val predictions...
Source: 我 的 父亲 在 用 他 的 灰色 小 收音
Reference: my father was listening to bbc news on his
Model: <SOS> and we &apos;re going to tell tell stories stories

Epoch: 37.00, Train Loss: 1.48, Val Loss: 13.37, Train BLEU: 56.46, Val BLEU: 1.14
Sampling from training predictions...
Source: 我们 将 用 一些 影片 来讲 讲述 一些 深海 海里
Reference: and we &apos;re going to tell you some stories
Model: <SOS> and we &apos;re going to to tell stories stories

Sampling from val predictions...
Source: 我 不知 知道 那 意味 意味着 什么 但是 我 能
Reference: i didn &apos;t know what it meant , but
Model: <SOS> and we &apos;re going to some some stories stories

Epoch: 38.00, Train Loss: 1.43, Val Loss: 13.45, Train BLEU: 57.17, Val BLE

Epoch: 54.00, Train Loss: 0.84, Val Loss: 14.33, Train BLEU: 89.61, Val BLEU: 1.14
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> life in the deep oceans <EOS> <EOS> <EOS> <EOS>

Sampling from val predictions...
Source: 我 11 岁 那年 记得 得有 一天 早晨 醒来 听见
Reference: when i was 11 , i remember waking up
Model: <SOS> and we &apos;re going to tell you stories stories

Epoch: 55.00, Train Loss: 0.81, Val Loss: 14.39, Train BLEU: 94.60, Val BLEU: 1.14
Sampling from training predictions...
Source: 我们 这 有 不少 精彩 的 泰坦 泰坦尼克 坦尼 尼克
Reference: we &apos;ve got some of the most incredible video
Model: <SOS> we &apos;ve got some of the most incredible incredible

Sampling from val predictions...
Source: 他 面带 <UNK> <UNK> 笑容 这 很少 少见 因为 大部
Reference: there was a big smile on his face which
Model: <SOS> with vibrant video clips captured by , david david

Epoch: 56.00, Train Loss: 0.78, Val Loss: 14.4

Epoch: 72.00, Train Loss: 0.46, Val Loss: 15.25, Train BLEU: 100.00, Val BLEU: 1.10
Sampling from training predictions...
Source: 大卫 <UNK> 这位 是 比尔 <UNK> 我 是 大卫 <UNK>
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> this is bill lange . i &apos;m dave gallo

Sampling from val predictions...
Source: 我 不知 知道 那 意味 意味着 什么 但是 我 能
Reference: i didn &apos;t know what it meant , but
Model: <SOS> and we &apos;re going to tell you some stories

Epoch: 73.00, Train Loss: 0.45, Val Loss: 15.22, Train BLEU: 100.00, Val BLEU: 1.14
Sampling from training predictions...
Source: 我们 这 有 不少 精彩 的 泰坦 泰坦尼克 坦尼 尼克
Reference: we &apos;ve got some of the most incredible video
Model: <SOS> we &apos;ve got some of the most incredible video

Sampling from val predictions...
Source: 我 11 岁 那年 记得 得有 一天 早晨 醒来 听见
Reference: when i was 11 , i remember waking up
Model: <SOS> and we &apos;re going to tell you stories stories

Epoch: 74.00, Train Loss: 0.43, Val Loss: 15.32, Train BLEU: 100.00, Val BLEU: 1

In [None]:
# Show the first rows of the experiment log summary, restricted to the
# columns of interest.
summary_cols = ['dt_created', 'num_epochs', 'learning_rate', 'clip_grad_max_norm', 'val_loss']
summarize_results(load_experiment_log())[summary_cols].head()

In [None]:
# Plot the learning curve for the `results` returned by the training cell above.
plot_single_learning_curve(results)

In [None]:
# Decode the source side of the first training batch back into readable text.
# The loaders are created earlier in this notebook under the name
# `loaders_full`; the previous `full_loaders` was never defined.
id2token = vocab[SRC_LANG]['id2token']


def to_token(idxs):
    """Map a sequence of vocabulary ids to one space-separated token string."""
    return ' '.join(id2token[idx] for idx in idxs)


# Only the first batch is needed, so pull it directly instead of looping.
src_idxs, targ_idxs, src_lens, targ_lens = next(iter(loaders_full['train']))

# One inner list of integer token ids per sentence in the batch.
list_of_lists = src_idxs.numpy().astype(int).tolist()
list_of_lists_tokens = [to_token(idxs) for idxs in list_of_lists]