In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders 
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, DecoderSimpleRNN, \
    Attention, DecoderAttnRNN
from train_eval import train_and_eval, inspect_model, count_parameters, summarize_results, \
    plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval_V2, tensor2corpus_V2
import importlib
import pickle as pkl 

In [2]:
# --- Experiment identity: run name and the translation direction (source -> target) ---
MODEL_NAME = 'test_model'
SRC_LANG, TARG_LANG = 'zh', 'en'

# --- Data processing: per-side sentence-length and vocabulary-size settings ---
SRC_MAX_SENTENCE_LEN = TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = TARG_VOCAB_SIZE = 30000

# --- Architecture ---
NUM_LAYERS = 1  # alternative tried: 2
ENC_HIDDEN_DIM = 300
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM * 2  # decoder hidden size is tied to twice the encoder's
TEACHER_FORCING_RATIO = 0.5
CLIP_GRAD_MAX_NORM = 10
# Dropout is recorded here but, per the original note, still has to be implemented.
ENC_DROPOUT = DEC_DROPOUT = 0

# --- Training ---
BATCH_SIZE = 16  # alternative tried: 32
NUM_EPOCHS = 200
LR = 1e-3  # alternative tried: 0.0005
OPTIMIZER = 'Adam'
LAZY_TRAIN = True

In [3]:
# Snapshot of every hyperparameter above, keyed by its lower-case name, so the
# configuration can be stored alongside the results later.
params = dict(
    # identification
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    # data processing
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    # architecture
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    # training
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [4]:
# One-time step (slow): build the vocab and cache it to a pickle so later runs can simply reload it.
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Reload the vocab from its cached pickle rather than regenerating it.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load vocab files that were produced locally by this project.
with open(vocab_filename, "rb") as vocab_file:
    # The context manager closes the handle even if unpickling fails
    # (the bare open(...) used previously left it open).
    vocab = pkl.load(vocab_file)

# Process the corpus twice: once in full, and once with sample_limit=BATCH_SIZE.
# Presumably sample_limit caps the data at a single batch's worth of examples
# for quick runs -- confirm against data_processing.process_data.
data = process_data(SRC_LANG, TARG_LANG, vocab)
limited_data = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=BATCH_SIZE)

In [6]:
# Build one set of dataloaders per dataset (full first, then the limited one),
# sharing the same sentence-length and batch-size settings.
full_loaders, fast_loaders = [
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, limited_data)
]

In [7]:
# Assemble the model: encoder, decoder, then the EncoderDecoder wrapper.
# Construction order is kept as embedding -> module for each side.

# Source side: pretrained embeddings looked up from the source-language vocab.
src_vocab = vocab[SRC_LANG]
src_pretrained_emb = get_pretrained_emb(src_vocab['word2vec'], src_vocab['token2id'])
encoder = EncoderRNN(
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    pretrained_word2vec=src_pretrained_emb,
)

# Target side: pretrained embeddings looked up from the target-language vocab.
targ_vocab = vocab[TARG_LANG]
targ_pretrained_emb = get_pretrained_emb(targ_vocab['word2vec'], targ_vocab['token2id'])
decoder = DecoderSimpleRNN(
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    pretrained_word2vec=targ_pretrained_emb,
)
# Alternative decoder, currently disabled:
# decoder = DecoderAttnRNN(
#     dec_hidden_dim=DEC_HIDDEN_DIM,
#     enc_hidden_dim=ENC_HIDDEN_DIM,
#     num_layers=NUM_LAYERS,
#     targ_vocab_size=TARG_VOCAB_SIZE,
#     src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
#     targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
#     pretrained_word2vec=targ_pretrained_emb,
# )

# The wrapper also receives the target-side token -> id mapping.
model = EncoderDecoder(encoder, decoder, targ_vocab['token2id'])

In [8]:
# Run training + evaluation. Rebinds `model` to the returned model and keeps the
# returned `results` for the cells further down.
model, results = train_and_eval_V2(
    # what to train, and on which data
    model=model,
    full_loaders=full_loaders,
    fast_loaders=fast_loaders,
    params=params,
    vocab=vocab,
    # reporting / persistence switches
    print_intermediate=True,
    save_checkpoint=True,
    lazy_eval=False,
    inspect_iter=100,
    save_to_log=True,
    print_summary=True,
)

Epoch: 0.00, Train Loss: 9.85, Val Loss: 10.23, Train BLEU: 0.37, Val BLEU: 0.22
Sampling from training predictions...
Source: 原因 在于 我们 一直 没 把 海洋 当回事 回事 回事儿
Reference: and the problem , i think , is that
Model: <SOS> deep deep the the the the the the the

Sampling from val predictions...
Source: 你 现在 可以 去 个 真正 的 学校 念书 了
Reference: &quot; you can go to a real school now
Model: <SOS> deep lange pixels the the the the the the

Epoch: 1.00, Train Loss: 9.27, Val Loss: 10.09, Train BLEU: 0.37, Val BLEU: 0.22
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> the the the the the the the the the

Sampling from val predictions...
Source: 我们 都 知道 自己 正 冒 着 生命 的 危险
Reference: we all knew we were risking our lives --
Model: <SOS> deep the the the the the the the the

Epoch: 2.00, Train Loss: 8.34, Val Loss: 9.85, Train BLEU: 0.36, Val BLEU: 0.22
Sampling from training predictions...
Sourc

Epoch: 19.00, Train Loss: 3.12, Val Loss: 13.13, Train BLEU: 0.35, Val BLEU: 0.23
Sampling from training predictions...
Source: 深海 海中 的 生命 大卫 <UNK> <EOS> <PAD> <PAD> <PAD>
Reference: life in the deep oceans <EOS> <PAD> <PAD> <PAD>
Model: <SOS> life life , the the the the are are

Sampling from val predictions...
Source: 我们 都 知道 自己 正 冒 着 生命 的 危险
Reference: we all knew we were risking our lives --
Model: <SOS> clips video the the the the the the the

Epoch: 20.00, Train Loss: 3.07, Val Loss: 13.25, Train BLEU: 0.37, Val BLEU: 0.22
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> clips video the the the the the the the

Sampling from val predictions...
Source: 冬天 很 舒服 但 夏天 却 <UNK> <EOS> <PAD> <PAD>
Reference: it was cozy in winter but extremely hot in
Model: <SOS> life life the the the the the the the

Epoch: 21.00, Train Loss: 3.03, Val Loss: 13.38, Train BLEU: 0.37, Val BLEU: 0.22
Sampling from

Epoch: 38.00, Train Loss: 2.35, Val Loss: 14.24, Train BLEU: 0.40, Val BLEU: 0.29
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> by going truth that the the the the the

Sampling from val predictions...
Source: 我 不知 知道 那 意味 意味着 什么 但是 我 能
Reference: i didn &apos;t know what it meant , but
Model: <SOS> by to truth the the the the the the

Epoch: 39.00, Train Loss: 2.30, Val Loss: 14.31, Train BLEU: 0.40, Val BLEU: 0.29
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> by to truth when the i the the the

Sampling from val predictions...
Source: 我 永远 不会 忘记 那个 早晨 <EOS> <PAD> <PAD> <PAD>
Reference: a morning that i will never forget . <EOS>
Model: <SOS> with &apos;m of of of the the and and

Epoch: 40.00, Train Loss: 2.27, Val Loss: 14.36, Train BLEU: 0.43, Val BLEU: 0.31
Sampling from training predict

Epoch: 57.00, Train Loss: 1.74, Val Loss: 14.68, Train BLEU: 0.38, Val BLEU: 0.26
Sampling from training predictions...
Source: 原因 在于 我们 一直 没 把 海洋 当回事 回事 回事儿
Reference: and the problem , i think , is that
Model: <SOS> oceans . got got oceans oceans oceans oceans oceans

Sampling from val predictions...
Source: 塔利 塔利班 走 了 父亲 大声 叫 着 <EOS> <PAD>
Reference: &quot; the taliban are gone ! &quot; my father
Model: <SOS> with &apos;m , of of <EOS> are are and

Epoch: 58.00, Train Loss: 1.71, Val Loss: 14.71, Train BLEU: 0.38, Val BLEU: 0.26
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> by going truth that the the the the the

Sampling from val predictions...
Source: 我 不知 知道 那 意味 意味着 什么 但是 我 能
Reference: i didn &apos;t know what it meant , but
Model: <SOS> submarines to truth that i the the the the

Epoch: 59.00, Train Loss: 1.68, Val Loss: 14.72, Train BLEU: 0.38, Val BLEU: 0.24
Sampling from traini

Epoch: 76.00, Train Loss: 1.24, Val Loss: 14.99, Train BLEU: 0.45, Val BLEU: 0.22
Sampling from training predictions...
Source: 当 你 站 在 海滩 上 或是 当 你 看到
Reference: part of the problem , i think , is
Model: <SOS> vibrant gallo most i the the the the unexplored

Sampling from val predictions...
Source: 所以 在 那 之后 5 年 我 <UNK> <UNK> 陪
Reference: so for the next five years , i dressed
Model: <SOS> by is the the the the the the the

Epoch: 77.00, Train Loss: 1.22, Val Loss: 15.00, Train BLEU: 0.45, Val BLEU: 0.22
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> by going truth that water the the the the

Sampling from val predictions...
Source: 我 11 岁 那年 记得 得有 一天 早晨 醒来 听见
Reference: when i was 11 , i remember waking up
Model: <SOS> vibrant gallo truth the the the the the the

Epoch: 78.00, Train Loss: 1.20, Val Loss: 15.01, Train BLEU: 0.44, Val BLEU: 0.22
Sampling from training predictions...
Source: 深

Epoch: 95.00, Train Loss: 0.87, Val Loss: 15.23, Train BLEU: 0.46, Val BLEU: 0.22
Sampling from training predictions...
Source: 泰坦 泰坦尼克 泰坦尼克号 坦尼 尼克 号 是 拿 了 不少
Reference: the truth of the matter is that the titanic
Model: <SOS> <EOS> lange &apos;ve titanic the the the the ocean

Sampling from val predictions...
Source: 塔利 塔利班 走 了 父亲 大声 叫 着 <EOS> <PAD>
Reference: &quot; the taliban are gone ! &quot; my father
Model: <SOS> with &apos;m of problem of most there there &apos;s

Epoch: 96.00, Train Loss: 0.85, Val Loss: 15.22, Train BLEU: 0.45, Val BLEU: 0.22
Sampling from training predictions...
Source: 大卫 <UNK> 通过 潜水 潜水艇 拍下 的 影片 把 我们
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> in david you matter think the the in the

Sampling from val predictions...
Source: 一个 真正 的 学校 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: a real school . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> with &apos;m of problem of most are and and

Epoch: 97.00, Train Loss: 0.83, Val Lo

Epoch: 113.00, Train Loss: 0.60, Val Loss: 15.44, Train BLEU: 0.46, Val BLEU: 0.22
Sampling from training predictions...
Source: 其实 实地 地球 上 最长 的 山脉 都 在 海洋
Reference: and in the oceans , there are the longest
Model: <SOS> video and incredible think planet of longest are are

Sampling from val predictions...
Source: 我们 每天 要 走 不同 的 路线 这样 才 没有
Reference: each day , we took a different route so
Model: <SOS> vibrant gallo most i the the the volcanoes unexplored

Epoch: 114.00, Train Loss: 0.59, Val Loss: 15.46, Train BLEU: 0.46, Val BLEU: 0.22
Sampling from training predictions...
Source: 其实 实地 地球 上 最长 的 山脉 都 在 海洋
Reference: and in the oceans , there are the longest
Model: <SOS> video and incredible think planet of longest are are

Sampling from val predictions...
Source: 所以 在 那 之后 5 年 我 <UNK> <UNK> 陪
Reference: so for the next five years , i dressed
Model: <SOS> submarines the the the the the the the the

Epoch: 115.00, Train Loss: 0.57, Val Loss: 15.49, Train BLEU: 0.46, Val BLEU: 0.22
Sam

Epoch: 132.00, Train Loss: 0.38, Val Loss: 15.67, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> by going truth that water , the the there

Sampling from val predictions...
Source: 塔利 塔利班 走 了 父亲 大声 叫 着 <EOS> <PAD>
Reference: &quot; the taliban are gone ! &quot; my father
Model: <SOS> with &apos;m of problem of . there there &apos;s

Epoch: 133.00, Train Loss: 0.37, Val Loss: 15.68, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 大多 大多数 多数 地震 和 火山 喷发 也 都 发生
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> captured &apos;re the is ocean problem of the yet

Sampling from val predictions...
Source: 我们 把 书 放在 食品 杂货 袋中 这样 别人 就
Reference: we would cover our books in grocery bags so
Model: <SOS> submarines to of that . the the the are

Epoch: 134.00, Train Loss: 0.36, Val Loss: 15.67, Train BLEU: 0.60, Val BLEU: 

Epoch: 150.00, Train Loss: 0.25, Val Loss: 15.86, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 其实 实地 地球 上 最长 的 山脉 都 在 海洋
Reference: and in the oceans , there are the longest
Model: <SOS> video and incredible think planet of longest are ,

Sampling from val predictions...
Source: 冬天 很 舒服 但 夏天 却 <UNK> <EOS> <PAD> <PAD>
Reference: it was cozy in winter but extremely hot in
Model: <SOS> life , tell the you <EOS> think are are

Epoch: 151.00, Train Loss: 0.24, Val Loss: 15.89, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 大卫 <UNK> 这位 是 比尔 <UNK> 我 是 大卫 <UNK>
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> the this some is about average is the <UNK>

Sampling from val predictions...
Source: 这 是 我们 俩 人 唯一 的 受教 教育 方式
Reference: it was the only way we both could be
Model: <SOS> deep is stories that the the the in and

Epoch: 152.00, Train Loss: 0.23, Val Loss: 15.92, Train BLEU: 0.60, Val BLEU: 0.22
Sampling fro

Epoch: 168.00, Train Loss: 0.15, Val Loss: 16.06, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 原因 在于 我们 一直 没 把 海洋 当回事 回事 回事儿
Reference: and the problem , i think , is that
Model: <SOS> . . got and oceans two oceans of is

Sampling from val predictions...
Source: 一个 真正 的 学校 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: a real school . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> with &apos;m of problem most there are and and

Epoch: 169.00, Train Loss: 0.15, Val Loss: 16.08, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 海洋 里 生物 的 多样 多样性 和 密度 要 比
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> by going truth that water , the the there

Sampling from val predictions...
Source: 一个 真正 的 学校 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: a real school . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> with &apos;m of problem most there are and and

Epoch: 170.00, Train Loss: 0.15, Val Loss: 16.10, Train BLEU: 0

Epoch: 187.00, Train Loss: 0.10, Val Loss: 16.23, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> submarines to of when . i animals biodiversity are

Sampling from val predictions...
Source: 我们 每天 要 走 不同 的 路线 这样 才 没有
Reference: each day , we took a different route so
Model: <SOS> vibrant gallo most i the part part volcanoes unexplored

Epoch: 188.00, Train Loss: 0.10, Val Loss: 16.23, Train BLEU: 0.60, Val BLEU: 0.22
Sampling from training predictions...
Source: 这儿 基本 基本上 都 没有 被 开发 发过 但是 像
Reference: it &apos;s mostly unexplored , and yet there are
Model: <SOS> submarines to of when . i animals biodiversity are

Sampling from val predictions...
Source: 一个 真正 的 学校 <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference: a real school . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> with &apos;m of problem most there are and and

Epoch: 189.00, Train Loss: 0.09, Val Loss:

In [None]:
# Sanity check: turn the source side of one training batch back into token strings.
id2token = vocab[SRC_LANG]['id2token']

# Fetch a single batch directly instead of looping and breaking after the first
# iteration. Raises StopIteration if the train loader yields no batches.
src_idxs, targ_idxs, src_lens, targ_lens = next(iter(full_loaders['train']))

# Convert the batch of source ids to nested Python lists of ints, then map every
# id through id2token and join each row into one space-separated string.
list_of_lists = src_idxs.numpy().astype(int).tolist()
list_of_lists_tokens = [' '.join(id2token[idx] for idx in row) for row in list_of_lists]

# Last expression of the cell, so the decoded batch is actually displayed.
list_of_lists_tokens

In [None]:
# Load the experiment log, then pass it to summarize_results. The call stays the
# last expression of the cell, so any value it returns is still displayed.
experiment_log = load_experiment_log()
summarize_results(experiment_log)

In [None]:
# Pass the `results` object returned by train_and_eval_V2 to the learning-curve
# plotting helper. Requires the training cell to have run in this kernel,
# otherwise `results` is undefined.
plot_single_learning_curve(results)