In [1]:
import numpy as np 
import pandas as pd 
import torch
from data_processing import generate_vocab, process_data, create_dataloaders
from model import get_pretrained_emb, EncoderRNN, DecoderRNN, DecoderAttnRNN, EncoderDecoder, EncoderDecoderAttn, EncoderCNN, EncoderCNN2, Decoder_RNN_from_CNN, CNN_RNN_EncoderDecoder 
from train_eval import evaluate, train_and_eval, summarize_results, plot_single_learning_curve, load_experiment_log
import pickle as pkl 
from datetime import datetime
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Language codes for the source and target side of the dataset.
SRC_LANG, TARG_LANG = 'vi', 'en'

# Sentence-length and vocabulary-size limits; source and target use the same values.
SRC_MAX_SENTENCE_LEN = TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = TARG_VOCAB_SIZE = 30000

# Batch size passed to create_dataloaders; also reused below as a sample_limit.
BATCH_SIZE = 64

# Generating the vocab takes a long time, so it is built once and pickled for
# re-import in later sessions. Uncomment to regenerate the pickle:
#vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
#vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
#with open(vocab_filename, "wb") as vocab_file:
#    pkl.dump(vocab, vocab_file)

# reload from pickle
# NOTE(review): unpickling can execute arbitrary code -- only load a vocab file
# that was produced locally by the snippet above.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
with open(vocab_filename, "rb") as vocab_file:  # context manager closes the handle
    vocab = pkl.load(vocab_file)

# Three datasets from the same inputs: the full data, plus two runs restricted
# via sample_limit (BATCH_SIZE samples and 1000 samples respectively).
# presumably the small ones are for quick overfitting/debug runs -- TODO confirm
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, filter_long=False)
data_minibatch = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=BATCH_SIZE, filter_long=False)
data_minitrain = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=1000, filter_long=False)

# create dataloaders
# Every dataset is wrapped with the same length limits and batch size, so the
# shared positional arguments are spelled out once.
_loader_args = (SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
loaders_full = create_dataloaders(data, *_loader_args)
loaders_minibatch = create_dataloaders(data_minibatch, *_loader_args)
loaders_minitrain = create_dataloaders(data_minitrain, *_loader_args)

# --- model architecture params ---
NETWORK_TYPE = 'cnn'       # 'rnn' or 'cnn'; picks the MODEL_NAME pattern further down
RNN_CELL_TYPE = 'gru'
ATTENTION_TYPE = 'additive'
NUM_LAYERS = 1
ENC_HIDDEN_DIM = 512
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM  # decoder hidden size is tied to the encoder's
ENC_DROPOUT = 0  # 0.2
DEC_DROPOUT = 0  # 0.2
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1

# --- training params ---
OPTIMIZER = 'Adam'
LR = 0.0003  # 0.0005
NUM_EPOCHS = 10  # 5
LAZY_TRAIN = False

# name the model and experiment
EXPERIMENT_NAME = 'vi_final'
# MODEL_NAME is derived from the network type; RNN names also carry the attention type.
if NETWORK_TYPE == 'rnn':
    MODEL_NAME = '{}-rnn-{}-attn'.format(SRC_LANG, ATTENTION_TYPE)
elif NETWORK_TYPE == 'cnn':
    MODEL_NAME = '{}-cnn'.format(SRC_LANG)
else:
    # Fail here with a clear message instead of a NameError on MODEL_NAME later.
    raise ValueError(
        "Unsupported NETWORK_TYPE {!r}: expected 'rnn' or 'cnn'".format(NETWORK_TYPE))

# Snapshot of every setting above, kept as a dict so it can be saved with the results later.
params = {
    'experiment_name': EXPERIMENT_NAME,
    'model_name': MODEL_NAME,
    'src_lang': SRC_LANG,
    'targ_lang': TARG_LANG,
    'rnn_cell_type': RNN_CELL_TYPE,
    'src_max_sentence_len': SRC_MAX_SENTENCE_LEN,
    'targ_max_sentence_len': TARG_MAX_SENTENCE_LEN,
    'src_vocab_size': SRC_VOCAB_SIZE,
    'targ_vocab_size': TARG_VOCAB_SIZE,
    'num_layers': NUM_LAYERS,
    'enc_hidden_dim': ENC_HIDDEN_DIM,
    'dec_hidden_dim': DEC_HIDDEN_DIM,
    'teacher_forcing_ratio': TEACHER_FORCING_RATIO,
    'clip_grad_max_norm': CLIP_GRAD_MAX_NORM,
    'enc_dropout': ENC_DROPOUT,
    'dec_dropout': DEC_DROPOUT,
    'attention_type': ATTENTION_TYPE,
    'batch_size': BATCH_SIZE,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LR,
    'optimizer': OPTIMIZER,
    'lazy_train': LAZY_TRAIN,
}


# instantiate model
# NOTE(review): a CNN encoder / RNN decoder is always built here, whatever
# NETWORK_TYPE says -- confirm that is intended before setting it to 'rnn'.

# Pretrained embeddings for each side, looked up from the pickled vocab.
src_pretrained_emb = get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id'])
targ_pretrained_emb = get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id'])

# Sentence length and dropout come from the config constants (previously the
# literals 10 and 0) so the model cannot drift from what is recorded in `params`.
encoder = EncoderCNN(pretrained_word2vec=src_pretrained_emb,
                     src_max_sentence_len=SRC_MAX_SENTENCE_LEN, dropout=ENC_DROPOUT,
                     enc_hidden_dim=params['enc_hidden_dim'])

decoder = Decoder_RNN_from_CNN(dec_hidden_dim=params['dec_hidden_dim'], enc_hidden_dim=params['enc_hidden_dim'],
                               num_layers=NUM_LAYERS, targ_vocab_size=TARG_VOCAB_SIZE,
                               targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, batch_size=BATCH_SIZE,
                               pretrained_word2vec=targ_pretrained_emb)

# Combine encoder and decoder and move the model to the selected device.
model = CNN_RNN_EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id']).to(device)

In [2]:
# Load a saved state dict into the freshly built model.
MODEL_NAME_TO_RELOAD = 'vi-cnn'
checkpoint_path = 'model_checkpoints/{}.pth.tar'.format(MODEL_NAME_TO_RELOAD)
# map_location=device remaps the stored tensors onto the device selected earlier.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint)

In [None]:
# Load the log for this experiment and display the first row of its summary,
# restricted to the columns of interest.
experiment_results = load_experiment_log(experiment_name=EXPERIMENT_NAME)
summary_columns = ['model_name', 'best_val_loss', 'best_val_bleu', 'runtime',
                   'total_params', 'trainable_params', 'dt_created']
summarize_results(experiment_results)[summary_columns].head(1)

In [3]:
# check performance on validation set
# evaluate() hands back nine values; all are unpacked because the token lists
# are printed in a later cell.
(val_loss, val_bleu,
 val_hyp_idxs, val_ref_idxs, val_source_idxs,
 val_hyp_tokens, val_ref_tokens, val_source_tokens,
 val_attn) = evaluate(
    model=model,
    loader=loaders_full['dev'],
    src_id2token=vocab[SRC_LANG]['id2token'],
    targ_id2token=vocab[TARG_LANG]['id2token'],
)
print("Validation BLEU: {:.2f} | Validation Loss: {:.2f}".format(val_bleu, val_loss))



Validation BLEU: 9.69 | Validation Loss: 5.56


In [4]:
# print predictions on val data
# One SOURCE / REFERENCE / HYPOTHESIS triple per example, followed by a blank line.
for source, ref, hyp in zip(val_source_tokens, val_ref_tokens, val_hyp_tokens):
    labelled_rows = (("SOURCE: {}", source),
                     ("REFERENCE: {}", ref),
                     ("HYPOTHESIS: {}", hyp))
    for template, tokens in labelled_rows:
        print(template.format(' '.join(tokens)))
    print()

SOURCE: khi tôi còn nhỏ , tôi nghĩ rằng <UNK> tiên
REFERENCE: when i was little , i thought my country
HYPOTHESIS: <SOS> when i was a , , i to to

SOURCE: tôi đã rất tự_hào về đất_nước tôi . <EOS> <PAD>
REFERENCE: and i was very proud . <EOS> <PAD> <PAD>
HYPOTHESIS: <SOS> i was was very to to i . .

SOURCE: ở trường , chúng_tôi dành rất nhiều thời_gian để học
REFERENCE: in school , we spent a lot of time
HYPOTHESIS: <SOS> in the , we least a time of time

SOURCE: mặc_dù tôi đã từng tự_hỏi không biết thế_giới bên_ngoài kia
REFERENCE: although i often wondered about the outside world ,
HYPOTHESIS: <SOS> now gotten i been the , , , ,

SOURCE: khi tôi lên 7 , tôi chứng_kiến cảnh người_ta <UNK>
REFERENCE: when i was seven years old , i saw
HYPOTHESIS: <SOS> when i was a , , , i was

SOURCE: gia_đình của tôi không nghèo , và bản_thân tôi thì
REFERENCE: my family was not poor , and myself ,
HYPOTHESIS: <SOS> my ready was &apos;t , , i i &apos;m

SOURCE: nhưng vào một ngày của năm 1995 , mẹ tô


SOURCE: " trước_khi tôi chết , tôi muốn được hát cho
REFERENCE: &quot; before i die , i want to sing
HYPOTHESIS: <SOS> &quot; i i i , i i to call

SOURCE: " trước_khi tôi chết , tôi muốn trồng một cái
REFERENCE: &quot; before i die , i want to plant
HYPOTHESIS: <SOS> &quot; before i i me i i to to

SOURCE: " trước_khi tôi chết , tôi muốn sống ngoài khuôn_phép
REFERENCE: &quot; before i die , i want to live
HYPOTHESIS: <SOS> &quot; i i i to i &apos;m to to

SOURCE: " trước_khi tôi chết , tôi muốn được ôm cô
REFERENCE: &quot; before i die , i want to hold
HYPOTHESIS: <SOS> &quot; i i i to i i to be

SOURCE: " trước_khi tôi chết , tôi muốn thành kị_binh của
REFERENCE: &quot; before i die , i want to be
HYPOTHESIS: <SOS> &quot; i i i , i i to leave

SOURCE: " trước_khi tôi chết , tôi muốn hoàn_toàn là chính
REFERENCE: &quot; before i die , i want to be
HYPOTHESIS: <SOS> &quot; i i i to i want to know

SOURCE: vậy không_gian bị bỏ_bê này đã trở_thành một không_gian kiến_tạo
REFERENCE: so t

In [5]:
# check performance on test set
# Same evaluate() call as for validation, pointed at the 'test' loader.
(test_loss, test_bleu,
 test_hyp_idxs, test_ref_idxs, test_source_idxs,
 test_hyp_tokens, test_ref_tokens, test_source_tokens,
 test_attn) = evaluate(
    model=model,
    loader=loaders_full['test'],
    src_id2token=vocab[SRC_LANG]['id2token'],
    targ_id2token=vocab[TARG_LANG]['id2token'],
)
print("Test BLEU: {:.2f} | Test Loss: {:.2f}".format(test_bleu, test_loss))



Test BLEU: 10.81 | Test Loss: 5.36
