# Mixture of Softmaxes (RNN LM)

In [1]:
import os, sys
sys.path.append("./mos/")
import time
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# from torch.autograd import Variable

import gc

import mos_data as data
import model as m

from utils import batchify, get_batch, repackage_hidden, create_exp_dir, save_checkpoint

In [2]:
def logging(s, print_=True, log_=True):
    """Print a message and/or append it to the experiment log file.

    Args:
        s: Message text. A newline is appended when it is written to the log.
        print_: If true, echo the message to stdout.
        log_: If true, append the message to ``log.txt`` inside the directory
            named by the module-level ``exp_dir`` variable.
    """
    if print_:
        print(s)
    if log_:
        # This notebook has no ``args`` namespace; the experiment directory is
        # kept in the module-level ``exp_dir`` variable instead.
        with open(os.path.join(exp_dir, 'log.txt'), 'a') as f_log:
            f_log.write(s + '\n')

In [3]:
# Fix the seeds of every random number generator so runs are repeatable.
seed = 42
is_cuda = torch.cuda.is_available()

np.random.seed(seed)
torch.manual_seed(seed)
# When CUDA is present, seed the generators of all GPUs as well.
if is_cuda:
    torch.cuda.manual_seed_all(seed)

In [4]:
# Read the corpus from disk and batchify each split with its own batch size.
datafile = "./data/penn/"
train_batch_size, eval_batch_size, test_batch_size = 12, 10, 1

corpus = data.Corpus(datafile)
# Number of entries in the corpus dictionary.
ntokens = len(corpus.dictionary)

# ``is_cuda`` is forwarded to ``batchify`` for every split.
train_data = batchify(corpus.train, train_batch_size, is_cuda)
val_data = batchify(corpus.valid, eval_batch_size, is_cuda)
test_data = batchify(corpus.test, test_batch_size, is_cuda)

Size of generated data = torch.Size([77465, 12])
Size of generated data = torch.Size([7376, 10])
Size of generated data = torch.Size([82430, 1])


In [5]:
# Display the number of entries in the corpus dictionary.
ntokens

10000

In [6]:
# Build model

ntokens = len(corpus.dictionary)
# Set to True to resume from ``model.pt`` stored under ``path2saved_model``.
is_keep_training = False
path2saved_model = ""
# Use parameters from first example in original repository
# python main.py --data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12
# --lr 20.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB --single_gpu

# Type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, SRU)
model_type = "LSTM"
# Size of embedding dimension
emsize = 280
# Number of hidden units per every RNN layer except the last one
nhid = 960
# Number of hidden units for the last RNN layer
nhidlast = 620
# Number of RNN layers
nlayers = 3
# Dropout after the last RNN layer
dropout = 0.3  # default
# Dropout for RNN layers
dropouth = 0.225
# Dropout for input embedding layers
dropouti = 0.4
# Dropout to remove words from embedding layer
dropoute = 0.1  # default
# Dropout for latent representation, before decoding
dropoutl = 0.29
# Amount of weight dropout to apply to the RNN hidden-to-hidden matrix
wdrop = 0.5  # default
# Tie the word embedding and softmax weights
tied = False
# Number of softmaxes to mix
n_experts = 15

if is_keep_training:
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    model = torch.load(os.path.join(path2saved_model, 'model.pt'))
else:
    model = m.RNNModel(model_type, ntokens, emsize, nhid, nhidlast, nlayers,
                       dropout, dropouth, dropouti, dropoute, wdrop,
                       tied, dropoutl, n_experts)

# Same CUDA flag that was passed to ``batchify`` for the data tensors.
if is_cuda:
    model.cuda()

# Later cells train and evaluate through the name ``parallel_model``; with a
# single device (the ``--single_gpu`` setup quoted above) it is the model itself.
parallel_model = model

total_params = sum(x.data.nelement() for x in model.parameters())
logging('Model total parameters: {}'.format(total_params), log_=False)

Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Param size: 24300620
Model total parameters: 24300620


In [7]:
# Evaluate model
def evaluate(data_source, model, ntokens, batch_size, seq_lenght):
    """Return the average negative log-likelihood of ``model`` on ``data_source``.

    The data is walked in consecutive chunks of ``seq_lenght`` rows. Each
    chunk's mean loss is weighted by the chunk length, and the sum is divided
    by ``len(data_source)``.

    Args:
        data_source: Batchified data tensor; rows are read along dimension 0.
        model: Network exposing ``init_hidden(batch_size)`` and returning
            ``(log_prob, hidden)`` when called with ``(data, hidden)``.
        ntokens: Unused; kept for interface compatibility.
        batch_size: Batch size handed to ``model.init_hidden``.
        seq_lenght: Number of rows requested from ``get_batch`` per step.

    Returns:
        The accumulated loss divided by ``len(data_source)`` as a Python float.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    # A plain float accumulator keeps the result well-defined even when the
    # loop below never executes (data_source with a single row).
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)
    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_lenght):
            inputs, targets = get_batch(data_source, i, seq_lenght, evaluation=True)
            targets = targets.view(-1)

            log_prob, hidden = model(inputs, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets)

            # nll_loss averages over the chunk, so weight it by the chunk length.
            total_loss += loss.item() * len(inputs)

            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [8]:
# Training configuration.

# Number of sequences per optimizer update.
batch_size = 12
# Number of sequences per forward/backward pass. Gradients are computed on
# chunks of small_batch_size and accumulated until batch_size is reached,
# then a single update step is performed. batch_size should be divisible by
# small_batch_size.
small_batch_size = batch_size

# Base sequence length
bptt = 70  # default
# Maximum amount by which a sampled sequence length may exceed the base length
max_seq_len_delta = 40  # default

# Gradient clipping threshold
clip = 0.25  # default
# Regularization weight on RNN activations
alpha = 2  # default

# Number of batches between loss printouts
log_interval = 200  # default
# Whether messages are also written to the log file
is_logfile = False

In [9]:
# Train model for single epoch
def train(model, train_data, optimizer, ntokens, batch_size, small_batch_size, bptt0):
    """Run one pass over ``train_data``, updating ``model`` with ``optimizer``.

    Each step draws a random sequence length around ``bptt0``, temporarily
    rescales the learning rate of the first parameter group in proportion to
    that length, accumulates gradients over chunks of ``small_batch_size``
    columns, clips the gradient norm and applies one optimizer step.

    Reads the module-level settings ``alpha``, ``beta``, ``clip``,
    ``max_seq_len_delta``, ``log_interval``, ``is_logfile`` and ``epoch``.

    Args:
        model: Network exposing ``init_hidden`` and accepting ``return_h=True``.
        train_data: Batchified training tensor; rows are read along dimension 0.
        optimizer: Optimizer whose first parameter group holds the learning rate.
        ntokens: Unused; kept for interface compatibility.
        batch_size: Number of columns consumed per optimizer update.
        small_batch_size: Number of columns per forward/backward pass.
        bptt0: Base sequence length.

    Raises:
        AssertionError: If ``batch_size`` is not a multiple of ``small_batch_size``.
    """
    assert batch_size % small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Send inputs to whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = [model.init_hidden(small_batch_size) for _ in range(batch_size // small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        # Use the base length 95% of the time and half of it otherwise.
        bptt = bptt0 if np.random.random() < 0.95 else bptt0 / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM.
        # ``bptt`` may be a float after halving, so force the result back to an int.
        seq_len = int(min(seq_len, bptt + max_seq_len_delta))

        # Scale the learning rate by the sampled length; restored after the step.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / bptt
        data, targets = get_batch(train_data, i, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, small_batch_size, 0
        while start < batch_size:
            cur_data = data[:, start: end].to(device)
            cur_targets = targets[:, start: end].contiguous().view(-1).to(device)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])
            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = model(cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activation Regularization
            loss = loss + sum(alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal activation Regularization (slowness)
            loss = loss + sum(beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            # Weight each chunk so the accumulated gradient matches a full batch.
            loss *= small_batch_size / batch_size
            total_loss += raw_loss.detach() * small_batch_size / batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + small_batch_size

            gc.collect()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        optimizer.param_groups[0]['lr'] = lr2
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {}/{} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt0, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)), log_=is_logfile)
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len

In [10]:
# Optimizer and schedule settings.

# Learning rate
lr = 20
# Weight decay applied to all weights
wdecay = 1.2e-6
# Slowness regularization weight on RNN activations (beta = 0 means no regularization)
beta = 1

# Number of epochs
num_epoch = 100
# Epoch counter; training starts counting from here
epoch = 1

In [11]:
# Lowest validation loss seen so far; starts high so the first epoch always wins.
stored_loss = 100000000
# Validation loss recorded after every epoch.
best_val_loss = []

# Each run gets its own timestamped experiment directory.
exp_dir = '{}-{}'.format("PTB", time.strftime("%Y%m%d-%H%M%S"))
create_exp_dir(exp_dir)

Experiment dir : PTB-20180524-062133


In [None]:
# Train epoch by epoch, validating after each one and checkpointing whenever
# the validation loss improves. ``epoch`` is a module-level counter, so the
# loop continues from its current value. Ctrl-C stops training early.
try:
    optimizer = torch.optim.SGD(parallel_model.parameters(), lr=lr, weight_decay=wdecay)
    # ``epoch`` starts at 1, so the bound is inclusive to run ``num_epoch`` epochs.
    while epoch <= num_epoch:
        epoch_start_time = time.time()

        train(parallel_model, train_data, optimizer, ntokens, batch_size, small_batch_size, bptt)
        val_loss = evaluate(val_data, parallel_model, ntokens, eval_batch_size, bptt)
        logging('-' * 89, log_=is_logfile)
        logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)), log_=is_logfile)
        logging('-' * 89, log_=is_logfile)

        # Keep only the checkpoint with the lowest validation loss so far.
        if val_loss < stored_loss:
            save_checkpoint(parallel_model, optimizer, exp_dir)
            logging('Saving Normal!', log_=is_logfile)
            stored_loss = val_loss
        best_val_loss.append(val_loss)
        epoch += 1

except KeyboardInterrupt:
    logging('-' * 89, log_=is_logfile)
    logging('Exiting from training early', log_=is_logfile)

  result = self.forward(*input, **kwargs)


| epoch   1 | 200/1106 batches | lr 20.0000 | ms/batch 88.35 | loss  6.98 | ppl  1078.04
| epoch   1 | 400/1106 batches | lr 20.0000 | ms/batch 87.71 | loss  6.48 | ppl   650.80
| epoch   1 | 600/1106 batches | lr 20.0000 | ms/batch 87.83 | loss  6.24 | ppl   511.66
| epoch   1 | 800/1106 batches | lr 20.0000 | ms/batch 85.48 | loss  6.11 | ppl   449.99
| epoch   1 | 1000/1106 batches | lr 20.0000 | ms/batch 87.90 | loss  5.97 | ppl   390.08
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 102.67s | valid loss  5.75 | valid ppl   315.14
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch   2 | 200/1106 batches | lr 20.0000 | ms/batch 86.57 | loss  5.81 | ppl   332.66
| epoch   2 | 400/1106 batches | lr 20.0000 | ms/batch 88.90 | loss  5.66 | ppl   287.57
| epoch   2 | 600/1106 batches | lr 20.0000 | ms/batch 86.81 | loss  5.57 | ppl   261.23
| epoch   2 | 800

In [None]:
# Score the model on the test split and report loss and perplexity.
test_loss = evaluate(test_data, parallel_model, ntokens, test_batch_size, bptt)

logging('=' * 89, log_=is_logfile)
logging(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss, math.exp(test_loss)),
    log_=is_logfile,
)
logging('=' * 89, log_=is_logfile)