In [1]:
import numpy as np 
import pandas as pd 
import torch
from data_processing import generate_vocab, process_data, create_dataloaders
from model import get_pretrained_emb, EncoderRNN, DecoderRNN, DecoderAttnRNN, EncoderDecoder, EncoderDecoderAttn
from train_eval import train_and_eval, count_parameters, summarize_results, plot_single_learning_curve, load_experiment_log
import pickle as pkl 
from datetime import datetime
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Settings needed to build the data loaders.

# Source / target language codes.
SRC_LANG, TARG_LANG = 'vi', 'en'

# Maximum sentence length and vocabulary size, identical for both sides.
SRC_MAX_SENTENCE_LEN = TARG_MAX_SENTENCE_LEN = 10
SRC_VOCAB_SIZE = TARG_VOCAB_SIZE = 30000

# Batch size handed to create_dataloaders.
BATCH_SIZE = 64

In [3]:
# Generating the vocab takes a long time; run this once and save it to a pickle so later sessions can reload it.
#vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
#vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
#pkl.dump(vocab, open(vocab_filename, "wb"))

In [4]:
# Reload the vocabulary from the pickle file written by the (commented-out) generate_vocab cell.
# NOTE: unpickling can execute arbitrary code; only load vocab files you produced yourself.
vocab_filename = "{}-{}-vocab.p".format(SRC_LANG, TARG_LANG)
with open(vocab_filename, "rb") as vocab_file:  # context manager closes the handle after loading
    vocab = pkl.load(vocab_file)

# Three datasets built with identical settings; they differ only in sample_limit:
#   data           - no sample_limit passed
#   data_minibatch - sample_limit=BATCH_SIZE
#   data_minitrain - sample_limit=1000
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, filter_long=False)
data_minibatch = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=BATCH_SIZE, filter_long=False)
data_minitrain = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab, sample_limit=1000, filter_long=False)

In [5]:
# Build one set of dataloaders per dataset (full, single-batch, 1000-sample), all with the
# same length limits and batch size. The generator is consumed left to right, so the calls
# happen in the order full -> minibatch -> minitrain.
loaders_full, loaders_minibatch, loaders_minitrain = (
    create_dataloaders(dataset, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
    for dataset in (data, data_minibatch, data_minitrain)
)

In [6]:
# model architecture params
NETWORK_TYPE = 'cnn'  # selects the experiment-name branch below; must be 'rnn' or 'cnn'
RNN_CELL_TYPE = 'NA'
NUM_LAYERS = 1
ENC_HIDDEN_DIM = 512
DEC_HIDDEN_DIM = ENC_HIDDEN_DIM
TEACHER_FORCING_RATIO = 1
CLIP_GRAD_MAX_NORM = 1
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2
ATTENTION_TYPE = 'without'

# training params
NUM_EPOCHS = 15
LR = 0.0001
OPTIMIZER = 'Adam'
LAZY_TRAIN = False

# name the model and experiment
if NETWORK_TYPE == 'rnn':
    EXPERIMENT_NAME = '{}-rnn-{}-attn'.format(SRC_LANG, ATTENTION_TYPE)
elif NETWORK_TYPE == 'cnn':
    EXPERIMENT_NAME = '{}-cnn'.format(SRC_LANG)
else:
    # Fail fast: without this branch EXPERIMENT_NAME would be undefined on a fresh kernel,
    # or silently keep the value from an earlier run of this cell.
    raise ValueError("Unsupported NETWORK_TYPE {!r}; expected 'rnn' or 'cnn'".format(NETWORK_TYPE))
# The timestamp gives each run of this cell its own MODEL_NAME.
MODEL_NAME = '{}-{}'.format(EXPERIMENT_NAME, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

In [7]:
# Gather the run configuration into a single dict; it is later passed to train_and_eval
# as `params`. Keys are the lower-case names of the constants defined above.
params = dict(
    experiment_name=EXPERIMENT_NAME,
    model_name=MODEL_NAME,
    src_lang=SRC_LANG,
    targ_lang=TARG_LANG,
    rnn_cell_type=RNN_CELL_TYPE,
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    targ_max_sentence_len=TARG_MAX_SENTENCE_LEN,
    src_vocab_size=SRC_VOCAB_SIZE,
    targ_vocab_size=TARG_VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dec_hidden_dim=DEC_HIDDEN_DIM,
    teacher_forcing_ratio=TEACHER_FORCING_RATIO,
    clip_grad_max_norm=CLIP_GRAD_MAX_NORM,
    enc_dropout=ENC_DROPOUT,
    dec_dropout=DEC_DROPOUT,
    attention_type=ATTENTION_TYPE,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optimizer=OPTIMIZER,
    lazy_train=LAZY_TRAIN,
)

In [114]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderCNN(nn.Module):
    """Two-layer 1-D convolutional encoder over frozen pretrained word embeddings.

    Token embeddings pass through two Conv1d layers (kernel size 3, padding 1, so the
    sequence length is preserved), each followed by tanh, and are then projected back to
    the embedding width. A second linear layer maps the flattened projection to a single
    vector per sentence.

    Args:
        pretrained_word2vec: 2-D embedding matrix handed to nn.Embedding.from_pretrained
            (kept frozen). Its second dimension sets the embedding width.
        src_max_sentence_len: Number of tokens in every source sentence. forward() needs
            inputs of exactly this length because the sentence-vector layer is sized from it.
        enc_hidden_dim: Channel count of both convolutions and size of the sentence vector.
        dropout: Dropout probability applied to the embeddings while training.
    """

    def __init__(self, pretrained_word2vec, src_max_sentence_len=10, enc_hidden_dim=512, dropout=0.1):
        super(EncoderCNN, self).__init__()
        self.enc_hidden_dim = enc_hidden_dim
        self.dropout_val = dropout
        self.src_max_sentence_len = src_max_sentence_len
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True)
        # Take the width from the embedding matrix instead of assuming 300.
        self.enc_embed_dim = self.embedding.embedding_dim
        self.conv1_a = nn.Conv1d(self.enc_embed_dim, enc_hidden_dim, kernel_size=3, padding=1)
        self.conv2_a = nn.Conv1d(enc_hidden_dim, enc_hidden_dim, kernel_size=3, padding=1)
        self.linearout = nn.Linear(enc_hidden_dim, self.enc_embed_dim)
        # Sized from the sentence length argument rather than a hard-coded 10.
        self.linear_for_hidden = nn.Linear(self.enc_embed_dim * src_max_sentence_len, self.enc_hidden_dim)
        # Move every sub-layer to the notebook-level device, not just some of them.
        self.to(device)

    def forward(self, enc_input, enc_input_lens):
        """Encode a batch of token-id sentences.

        Args:
            enc_input: Integer tensor of token ids, shape (batch, src_max_sentence_len).
            enc_input_lens: Accepted for interface compatibility; not used by this encoder.

        Returns:
            Tuple (outputs, hidden): outputs has shape (batch, src_len, embed_dim) and
            hidden has shape (batch, 1, enc_hidden_dim).
        """
        # Follow the module's own parameters so the input lands wherever the model lives.
        enc_input = enc_input.to(self.embedding.weight.device)
        batch_size = enc_input.size(0)
        embedded = self.embedding(enc_input)
        # training=self.training switches dropout off under model.eval(); F.dropout is
        # always active when that flag is left at its default.
        embedded = F.dropout(embedded, self.dropout_val, training=self.training)

        # Conv1d expects (batch, channels, length), hence the transpose around the conv stack.
        hidden_1_a = torch.tanh(self.conv1_a(embedded.transpose(1, 2)))
        hidden_2_a = torch.tanh(self.conv2_a(hidden_1_a)).transpose(1, 2)
        hidden_2_a = self.linearout(hidden_2_a)
        # Flatten all positions into one row per sentence to form the sentence vector.
        dim_1_hidden = self.linear_for_hidden(hidden_2_a.reshape(batch_size, 1, -1))

        return hidden_2_a, dim_1_hidden

In [115]:
class EncoderCNN2(nn.Module):
    """CNN encoder variant that puts the whole embedded sentence on the channel axis.

    The embeddings are flattened to (batch, src_len * embed_dim, 1), so both Conv1d layers
    operate on a sequence of length one; with kernel size 3 and padding 1 each convolution
    yields a single output position. The result is projected back to a per-token
    representation and to one sentence vector.

    Args:
        pretrained_word2vec: 2-D embedding matrix handed to nn.Embedding.from_pretrained
            (kept frozen). Its second dimension sets the embedding width.
        src_max_sentence_len: Number of tokens in every source sentence. forward() needs
            inputs of exactly this length because the layer sizes are derived from it.
        enc_hidden_dim: Channel count of both convolutions and size of the sentence vector.
        dropout: Dropout probability applied to the embeddings while training.
    """

    def __init__(self, pretrained_word2vec, src_max_sentence_len=10, enc_hidden_dim=512, dropout=0.1):
        super(EncoderCNN2, self).__init__()
        self.enc_hidden_dim = enc_hidden_dim
        self.dropout_val = dropout
        self.src_max_sentence_len = src_max_sentence_len
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True)
        # Take the width from the embedding matrix instead of assuming 300.
        self.enc_embed_dim = self.embedding.embedding_dim
        # Width of one flattened sentence; computed from the constructor argument so the
        # class no longer depends on a notebook-level global or a hard-coded 3000.
        flat_dim = self.enc_embed_dim * src_max_sentence_len
        self.conv1_a = nn.Conv1d(flat_dim, enc_hidden_dim, kernel_size=3, padding=1, stride=300)
        self.conv2_a = nn.Conv1d(enc_hidden_dim, enc_hidden_dim, kernel_size=3, padding=1, stride=300)
        self.linearout = nn.Linear(enc_hidden_dim, flat_dim)
        self.linear_for_hidden = nn.Linear(flat_dim, self.enc_hidden_dim)
        # Move every sub-layer to the notebook-level device, not just some of them.
        self.to(device)

    def forward(self, enc_input, enc_input_lens):
        """Encode a batch of token-id sentences.

        Args:
            enc_input: Integer tensor of token ids, shape (batch, src_max_sentence_len).
            enc_input_lens: Accepted for interface compatibility; not used by this encoder.

        Returns:
            Tuple (outputs, hidden): outputs has shape (batch, src_len, embed_dim) and
            hidden has shape (batch, 1, enc_hidden_dim).
        """
        # Follow the module's own parameters so the input lands wherever the model lives.
        enc_input = enc_input.to(self.embedding.weight.device)
        batch_size = enc_input.size(0)
        embedded = self.embedding(enc_input)
        # training=self.training switches dropout off under model.eval(); F.dropout is
        # always active when that flag is left at its default.
        embedded = F.dropout(embedded, self.dropout_val, training=self.training)
        # Whole sentence becomes the channel axis; the conv "sequence" has length 1.
        embedded = embedded.reshape(batch_size, -1, 1)

        hidden_1_a = torch.tanh(self.conv1_a(embedded))
        hidden_2_a = torch.tanh(self.conv2_a(hidden_1_a))
        # (batch, hidden, 1) -> (batch, 1, hidden) -> (batch, 1, flat) -> (batch, src_len, embed_dim)
        hidden_2_a = self.linearout(hidden_2_a.transpose(1, 2)).reshape(batch_size, -1, self.enc_embed_dim)
        # Flatten again to derive one sentence vector.
        dim_1_hidden = self.linear_for_hidden(hidden_2_a.reshape(batch_size, 1, -1))

        return hidden_2_a, dim_1_hidden

In [116]:
class Decoder_RNN_from_CNN(nn.Module):
    """Single-step GRU decoder without attention.

    Each forward() call decodes one time step: the embedded input token is concatenated
    with a context vector, run through the GRU, and projected to log-probabilities over
    the target vocabulary.
    """

    def __init__(self, dec_hidden_dim, enc_hidden_dim, num_layers, targ_vocab_size, targ_max_sentence_len, pretrained_word2vec, batch_size):
        super(Decoder_RNN_from_CNN, self).__init__()
        # Plain configuration attributes (some are read by the wrapping encoder-decoder).
        self.dec_embed_dim = 300
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.enc_hidden_dim = enc_hidden_dim
        self.dec_hidden_dim = dec_hidden_dim
        self.targ_vocab_size = targ_vocab_size
        self.targ_max_sentence_len = targ_max_sentence_len

        # Layers: frozen pretrained embeddings -> GRU over [embedding ; context] -> vocab projection.
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True)
        gru_input_dim = self.dec_embed_dim + self.enc_hidden_dim
        self.gru = nn.GRU(gru_input_dim, self.dec_hidden_dim, num_layers=self.num_layers)
        self.out = nn.Linear(dec_hidden_dim, self.targ_vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, dec_input, context, dec_hidden, enc_outputs):
        """Decode one time step.

        Args:
            dec_input: Token ids for the current step, one per batch element.
            context: Tensor reshaped to (1, batch, -1) and concatenated with the embeddings.
            dec_hidden: Previous GRU hidden state, reshaped to (1, batch, -1).
            enc_outputs: Not used by this decoder.

        Returns:
            Tuple (log_probs, hidden): log_probs is the log-softmax over the target
            vocabulary for each batch element; hidden is the GRU's new hidden state.
        """
        n_batch = dec_input.size(0)
        prev_hidden = dec_hidden.view(1, n_batch, -1)
        token_vecs = self.embedding(dec_input).view(1, n_batch, -1)
        context_vecs = context.view(1, n_batch, -1)

        gru_in = torch.cat((token_vecs, context_vecs), dim=2)
        gru_out, next_hidden = self.gru(gru_in, prev_hidden)
        # gru_out has a single time step; index 0 drops that axis before the projection.
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, next_hidden

In [117]:
class CNN_RNN_EncoderDecoder(nn.Module):
    """Encoder-decoder without attention: wraps an encoder and a step-wise decoder.

    Args:
        encoder: Module called as encoder(src_idx, src_lens) that returns
            (enc_outputs, enc_hidden) and exposes `src_max_sentence_len`.
        decoder: Module called as decoder(dec_input, context, dec_hidden, enc_outputs) that
            returns (step_output, dec_hidden) and exposes `targ_vocab_size` and
            `targ_max_sentence_len`.
        decoder_token2id: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, encoder, decoder, decoder_token2id):
        super(CNN_RNN_EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.targ_vocab_size = self.decoder.targ_vocab_size
        self.src_max_sentence_len = self.encoder.src_max_sentence_len
        self.targ_max_sentence_len = self.decoder.targ_max_sentence_len

    def forward(self, src_idx, targ_idx, src_lens, targ_lens, teacher_forcing_ratio):
        """Run the encoder once, then decode step by step.

        Args:
            src_idx: Source token ids, batch first.
            targ_idx: Target token ids, batch first, with at least `targ_max_sentence_len`
                columns; column 0 is used as the first decoder input.
            src_lens: Forwarded to the encoder.
            targ_lens: Not used.
            teacher_forcing_ratio: Probability, drawn independently at each step, of feeding
                the ground-truth token instead of the decoder's own greedy prediction.

        Returns:
            Tuple (dec_outputs, hypotheses, attn_placeholder):
            dec_outputs is (targ_max_sentence_len, batch, targ_vocab_size) with row 0 left at
            zero; hypotheses is (batch, targ_max_sentence_len) holding the greedy token ids
            (column 0 left at zero); attn_placeholder is an all-zero
            (batch, targ_max_sentence_len, src_max_sentence_len) tensor.
        """
        batch_size = src_idx.size(0)
        enc_outputs, enc_hidden = self.encoder(src_idx, src_lens)
        dec_hidden = enc_hidden
        dec_outputs = torch.zeros(self.targ_max_sentence_len, batch_size, self.targ_vocab_size)
        hypotheses = torch.zeros(self.targ_max_sentence_len, batch_size)
        dec_input = targ_idx[:, 0]

        for di in range(1, self.targ_max_sentence_len):
            # NOTE(review): the evolving decoder hidden state is passed as the context too;
            # confirm whether the fixed encoder state was intended here.
            dec_output, dec_hidden = self.decoder(dec_input, dec_hidden, dec_hidden, enc_outputs)
            dec_outputs[di] = dec_output
            greedy_labels = dec_output.detach().max(1)[1]
            hypotheses[di] = greedy_labels
            # This step produced the prediction for position di, so the matching
            # ground-truth token is targ_idx[:, di]. Using di-1 would feed column 0 twice
            # and leave every later teacher-forced input one step behind.
            teacher_labels = targ_idx[:, di]
            dec_input = teacher_labels if random.random() < teacher_forcing_ratio else greedy_labels

        attn_placeholder = torch.zeros(batch_size, self.targ_max_sentence_len, self.src_max_sentence_len)

        return dec_outputs, hypotheses.transpose(0, 1), attn_placeholder

In [120]:
# define model
# Pass the notebook-level hyperparameters explicitly so the encoder is built with the same
# values that are recorded in `params` (its constructor defaults differ, e.g. dropout=0.1).
encoder = EncoderCNN2(
    pretrained_word2vec=get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id']),
    src_max_sentence_len=SRC_MAX_SENTENCE_LEN,
    enc_hidden_dim=ENC_HIDDEN_DIM,
    dropout=ENC_DROPOUT)

if ATTENTION_TYPE == 'without':
    # without attention
    # NOTE: Decoder_RNN_from_CNN takes no dropout argument, so DEC_DROPOUT is not applied here.
    decoder = Decoder_RNN_from_CNN(dec_hidden_dim=DEC_HIDDEN_DIM, enc_hidden_dim=ENC_HIDDEN_DIM, num_layers=NUM_LAYERS,
                         targ_vocab_size=TARG_VOCAB_SIZE, targ_max_sentence_len=TARG_MAX_SENTENCE_LEN, batch_size=BATCH_SIZE,
                         pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
    model = CNN_RNN_EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id']).to(device)
else:
    # Fail fast instead of leaving `model` undefined (or stale from an earlier cell run).
    raise ValueError("Unsupported ATTENTION_TYPE {!r} for the CNN encoder; expected 'without'".format(ATTENTION_TYPE))

In [None]:
# Launch training; every option is spelled out one per line so it is easy to toggle.
model, results = train_and_eval(
    model=model,
    loaders_full=loaders_full,
    loaders_minibatch=loaders_minibatch,
    loaders_minitrain=loaders_minitrain,
    params=params,
    vocab=vocab,
    print_intermediate=100,
    save_checkpoint=True,
    save_to_log=True,
    lazy_eval=True,
    print_attn=False,
    inspect_samples=1,
)

Epoch: 0.00, Train Loss: 0.00, Val Loss: 10.30, Train BLEU: 0.00, Val BLEU: 0.01, Minutes Elapsed: 0.08
Sampling from val predictions...
Source: bây_giờ tôi muốn giới_thiệu các bạn với những người em_trai
Reference: now i &apos;d like to introduce you to my
Model: <SOS> thump thump crimes camels ellipses ellipses citizenship citizenship citizenship



In [None]:
# Fetch the log entries recorded under this experiment's name.
experiment_results = load_experiment_log(experiment_name=EXPERIMENT_NAME)

In [None]:
# Plot the learning curve from the 'results' of the first entry in the loaded log.
plot_single_learning_curve(experiment_results[0]['results'])

In [None]:
# Restrict the experiment summary to the headline columns; the bare last expression
# is what the notebook displays.
summary_columns = [
    'best_val_loss',
    'best_val_bleu',
    'runtime',
    'total_params',
    'trainable_params',
    'dt_created',
]
summarize_results(experiment_results)[summary_columns]

In [None]:
# Reload the saved weights for this run's MODEL_NAME onto the current device and
# restore them into the model.
checkpoint_path = 'model_checkpoints/{}.pth.tar'.format(MODEL_NAME)
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint)