In [1]:
import numpy as np 
import pandas as pd 
from data_processing import generate_vocab, process_data, create_dataloaders 
from model import get_pretrained_emb, EncoderDecoder, EncoderRNN, DecoderRNN, DecoderSimpleRNN, \
    Attention, DecoderAttnRNN
from train_eval import train_and_eval, inspect_model, count_parameters, summarize_results, \
    plot_single_learning_curve, load_experiment_log
from train_eval import train_and_eval_V2, tensor2corpus_V2
import importlib
import pickle as pkl 

In [2]:
# ---- Experiment identity ----
MODEL_NAME = 'test_model'
SRC_LANG, TARG_LANG = 'zh', 'en'

# ---- Data processing ----
SRC_MAX_SENTENCE_LEN = 10
TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = 30000
TARG_VOCAB_SIZE = 30000

# ---- Model architecture ----
NUM_LAYERS = 1  # alternative value: 2
ENC_HIDDEN_DIM = 300
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM * 2  # decoder hidden size is tied to twice the encoder's
TEACHER_FORCING_RATIO = 0.5
CLIP_GRAD_MAX_NORM = 10
ENC_DROPOUT = 0  # TODO: dropout still has to be implemented
DEC_DROPOUT = 0  # TODO: dropout still has to be implemented

# ---- Training ----
BATCH_SIZE = 16  # alternative value: 32
NUM_EPOCHS = 200
LR = 0.001  # alternative value: 0.0005
OPTIMIZER = 'Adam'
LAZY_TRAIN = True

In [3]:
# Gather every hyperparameter into a single dict so it can be saved with the results later.
# Key order is kept stable: identity, data processing, architecture, training.
params = dict(
    # model identification
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    # data processing
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    # model architecture
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    # training
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [4]:
# One-time vocabulary build, left disabled because it takes a long time to process.
# Uncomment and run once to write the pickle "<SRC_LANG>-<TARG_LANG>-vocab.p",
# which can then be re-imported in future sessions instead of rebuilding the vocabulary.
# vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
# pkl.dump(vocab, open(vocab_filename, "wb"))

In [5]:
# Load the cached vocabulary from the pickle named "<SRC_LANG>-<TARG_LANG>-vocab.p".
# NOTE(security): pickle.load can execute arbitrary code; only load a vocab file you created yourself.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
try:
    # `with` guarantees the file handle is closed once the pickle has been read.
    with open(vocab_filename, "rb") as vocab_file:
        vocab = pkl.load(vocab_file)
except FileNotFoundError:
    # No cache on disk yet (e.g. fresh checkout): build the vocabulary once and cache it,
    # so the notebook still runs top to bottom. Building takes a long time.
    print("{} not found; generating vocabulary (this is slow)...".format(vocab_filename))
    vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
    with open(vocab_filename, "wb") as vocab_file:
        pkl.dump(vocab, vocab_file)

# Process the complete dataset, plus a small subset capped at BATCH_SIZE samples.
data = process_data(SRC_LANG, TARG_LANG, vocab)
limited_data = process_data(SRC_LANG, TARG_LANG, vocab, sample_limit=BATCH_SIZE)

In [6]:
# Build one set of dataloaders per dataset with identical length limits and batch size:
# `full_loaders` comes from `data`, `fast_loaders` from `limited_data` (in that order).
full_loaders, fast_loaders = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, limited_data)
)

In [7]:
# Assemble the encoder-decoder model. Each side gets pretrained embeddings built from
# its own language's `word2vec` and `token2id` vocabulary entries.

src_vocab = vocab[SRC_LANG]
encoder = EncoderRNN(
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    pretrained_word2vec=get_pretrained_emb(src_vocab['word2vec'], src_vocab['token2id']),
)

targ_vocab = vocab[TARG_LANG]
decoder = DecoderSimpleRNN(
    dec_hidden_dim=DEC_HIDDEN_DIM,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    targ_vocab_size=TARG_VOCAB_SIZE,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    pretrained_word2vec=get_pretrained_emb(targ_vocab['word2vec'], targ_vocab['token2id']),
)
# Alternative decoder (attention variant), currently disabled:
# decoder = DecoderAttnRNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
#                          targ_vocab_size=TARG_VOCAB_SIZE, src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
#                          targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
#                          pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))

# The combined model also receives the target-language token-to-id mapping.
model = EncoderDecoder(encoder, decoder, targ_vocab['token2id'])

In [None]:
# Run training and evaluation. `model` is rebound to the returned model, so re-running
# this cell passes the previously returned model back in rather than a fresh one.
# `results` is kept for the plotting cell further down.
model, results = train_and_eval_V2(
    model=model,
    full_loaders=full_loaders,
    fast_loaders=fast_loaders,
    params=params,
    vocab=vocab,
    print_intermediate=True,
    save_checkpoint=True,
    lazy_eval=False,
    inspect_iter=100,
    save_to_log=True,
    print_summary=True,
)

Epoch: 0.00, Train Loss: 9.90, Val Loss: 10.21, Train BLEU: 0.49, Val BLEU: 0.22
Sampling from training predictions...
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> submarines submarines is the the the the the the

Sampling from val predictions...
Reference: it was cozy in winter but extremely hot in
Model: <SOS> life submarines the the the the the the the

Epoch: 1.00, Train Loss: 9.33, Val Loss: 10.07, Train BLEU: 0.39, Val BLEU: 0.22
Sampling from training predictions...
Reference: most of the planet is ocean water . <EOS>
Model: <SOS> life life the the the the the the the

Sampling from val predictions...
Reference: there was a big smile on his face which
Model: <SOS> submarines submarines the the the the the the the

Epoch: 2.00, Train Loss: 8.38, Val Loss: 9.82, Train BLEU: 0.33, Val BLEU: 0.22
Sampling from training predictions...
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> the the the the the the the the the

Sampling from

Epoch: 23.00, Train Loss: 3.07, Val Loss: 13.98, Train BLEU: 0.43, Val BLEU: 0.22
Sampling from training predictions...
Reference: most of the animals are in the oceans .
Model: <SOS> video video the the the the the the the

Sampling from val predictions...
Reference: we would cover our books in grocery bags so
Model: <SOS> video video the the the the the the the

Epoch: 24.00, Train Loss: 3.03, Val Loss: 14.08, Train BLEU: 0.34, Val BLEU: 0.22
Sampling from training predictions...
Reference: this is bill lange . i &apos;m dave gallo
Model: <SOS> the the the the the the the the the

Sampling from val predictions...
Reference: so for the next five years , i dressed
Model: <SOS> video video the the the the the the the

Epoch: 25.00, Train Loss: 2.99, Val Loss: 14.13, Train BLEU: 0.32, Val BLEU: 0.22
Sampling from training predictions...
Reference: most of the animals are in the oceans .
Model: <SOS> video video the the the the the the the

Sampling from val predictions...
Reference: &quo

Epoch: 46.00, Train Loss: 2.20, Val Loss: 14.94, Train BLEU: 0.32, Val BLEU: 0.25
Sampling from training predictions...
Reference: the biodiversity and the <UNK> in the ocean is
Model: <SOS> by going truth that the the the the the

Sampling from val predictions...
Reference: my father was listening to bbc news on his
Model: <SOS> vibrant to that that the the the the the

Epoch: 47.00, Train Loss: 2.17, Val Loss: 14.95, Train BLEU: 0.32, Val BLEU: 0.24
Sampling from training predictions...
Reference: and in the oceans , there are the longest
Model: <SOS> video video incredible that the the the the the

Sampling from val predictions...
Reference: a real school . <EOS> <PAD> <PAD> <PAD> <PAD>
Model: <SOS> with &apos;m of of of of are and and

Epoch: 48.00, Train Loss: 2.13, Val Loss: 14.99, Train BLEU: 0.34, Val BLEU: 0.23
Sampling from training predictions...
Reference: and the problem , i think , is that
Model: <SOS> . . got got oceans oceans oceans oceans oceans

Sampling from val pred

Epoch: 69.00, Train Loss: 1.57, Val Loss: 15.47, Train BLEU: 0.40, Val BLEU: 0.24
Sampling from training predictions...
Reference: most of the earthquakes and volcanoes are in the
Model: <SOS> captured &apos;re the the the the the the the

Sampling from val predictions...
Reference: my father was listening to bbc news on his
Model: <SOS> vibrant gallo i i the the the the the

Epoch: 70.00, Train Loss: 1.54, Val Loss: 15.53, Train BLEU: 0.40, Val BLEU: 0.24
Sampling from training predictions...
Reference: with vibrant video clips captured by submarines , david
Model: <SOS> in david you matter think the the the the

Sampling from val predictions...
Reference: each day , we took a different route so
Model: <SOS> vibrant gallo stories that the the the the the



In [None]:
# Sanity check: take the first training batch and decode its source indices back to tokens.
# next(iter(...)) fetches exactly one batch; it raises StopIteration if the loader is empty.
src_idxs, targ_idxs, src_lens, targ_lens = next(iter(full_loaders['train']))

# Source-language lookup from token id to token string.
id2token = vocab[SRC_LANG]['id2token']


def to_token(idxs):
    """Return the tokens for the given sequence of token ids, joined by single spaces."""
    return ' '.join(id2token[idx] for idx in idxs)


test_tensor = src_idxs
# Convert the batch tensor into nested Python lists of int ids.
list_of_lists = test_tensor.numpy().astype(int).tolist()
# One decoded, space-separated string per inner list.
list_of_lists_tokens = [to_token(idxs) for idxs in list_of_lists]

In [None]:
# Show the raw index tensor, its nested-list form, and the decoded token strings, one per line.
print(test_tensor, list_of_lists, list_of_lists_tokens, sep='\n')

In [None]:
# Load the saved experiment log, then summarize it; the call's return value (if any)
# is this cell's output.
experiment_log = load_experiment_log()
summarize_results(experiment_log)

In [None]:
# Plot the learning curve for `results`, the second value returned by train_and_eval_V2
# in the training cell; that cell must have finished before this one can run.
plot_single_learning_curve(results)