# Mixture of Softmaxes (RNN LM)

# TODO

* MoS: a straight port of https://github.com/zihangdai/mos (originally written for PyTorch 0.2.0) to a newer PyTorch version

In [9]:
import os, sys
sys.path.append("./mos/")
import time
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# from torch.autograd import Variable

import gc

import mos_data as data
import model as m

from utils import batchify, get_batch, repackage_hidden, create_exp_dir, save_checkpoint

In [12]:
def logging(s, print_=True, log_=True):
    """Report the message `s` on stdout and/or in the experiment log file.

    Args:
        s: Message text; it is concatenated with a newline when written
            to the log file.
        print_: When true, print `s`.
        log_: When true, append `s` plus a newline to `log.txt` inside the
            directory named by the module-level `args.save`.
    """
    if print_:
        print(s)
    if not log_:
        return
    log_path = os.path.join(args.save, 'log.txt')
    with open(log_path, 'a+') as log_file:
        log_file.write(s + '\n')

In [3]:
# Pin every random number generator to one fixed seed so runs are repeatable.
seed = 42
is_cuda = torch.cuda.is_available()

np.random.seed(seed)
torch.manual_seed(seed)
if is_cuda:
    # Only touch the CUDA generators when a GPU is actually present.
    torch.cuda.manual_seed_all(seed)

In [4]:
# Load the corpus and arrange each split into batch columns
datafile = "./data/penn/"
train_batch_size = 12
eval_batch_size = 10
test_batch_size = 1
corpus = data.Corpus(datafile)

# Splits are batchified in the order train -> valid -> test; the second
# dimension of each result equals the batch size passed in (see cell output).
train_data, val_data, test_data = [
    batchify(split, n_columns, is_cuda)
    for split, n_columns in (
        (corpus.train, train_batch_size),
        (corpus.valid, eval_batch_size),
        (corpus.test, test_batch_size),
    )
]

Size of generated data = torch.Size([77465, 12])
Size of generated data = torch.Size([7376, 10])
Size of generated data = torch.Size([82430, 1])


In [14]:
# Build model: either resume a saved checkpoint or construct a fresh RNNModel,
# move it to the GPU when one is available, and report its parameter count.

ntokens = len(corpus.dictionary)
# Set to True to resume from `<path2saved_model>/model.pt` instead of
# building a new model.
is_keep_training = False
path2saved_model = ""
# Use parameters from first example in original repository
# python main.py --data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12
# --lr 20.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB --single_gpu

# Type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, SRU)
model_type = "LSTM"
# Size of embedding dimension
emsize = 280
# Number of hidden units per every RNN layer except the last one
nhid = 960
# Number of hidden units for the last RNN layer
nhidlast = 620
# Number of RNN layers
nlayers = 3
# Dropout after the last RNN layer
dropout = 0.3  # default
# Dropout for RNN layers
dropouth = 0.225
# Dropout for input embedding layers
dropouti = 0.4
# Dropout to remove words from embedding layer
dropoute = 0.1  # default
# Dropout for latent representation, before decoding
dropoutl = 0.29
# Amount of weight dropout to apply to the RNN hidden to hidden matrix.
# NOTE: unlike the dropouts above, this one acts on weights, not activations.
wdrop = 0.5  # default
# Tie the word embedding and softmax weights
tied = False
# Number of softmaxes to mix
n_experts = 15

if is_keep_training:
    # Without CUDA, remap the checkpoint's tensors to the CPU; otherwise a
    # checkpoint saved from a GPU run cannot be deserialized on this host.
    # With CUDA the default placement (map_location=None) is kept.
    model = torch.load(os.path.join(path2saved_model, 'model.pt'),
                       map_location=None if is_cuda else 'cpu')
else:
    model = m.RNNModel(model_type, ntokens, emsize, nhid, nhidlast, nlayers,
                       dropout, dropouth, dropouti, dropoute, wdrop,
                       tied, dropoutl, n_experts)

if is_cuda:
    parallel_model = model.cuda()
else:
    parallel_model = model

# Total number of scalar entries over all model parameters.
total_params = sum(p.numel() for p in model.parameters())
# logging('Args: {}'.format(args))
# log_=False: only print, do not append to the log file.
logging('Model total parameters: {}'.format(total_params), log_=False)

# NOTE(review): evaluate() passes the model output straight to nll_loss, i.e.
# treats it as log-probabilities; confirm CrossEntropyLoss is what is wanted
# wherever `criterion` is used.
criterion = nn.CrossEntropyLoss()


Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
param size: 24300620
Model total parameters: 24300620


In [16]:
# Evaluate model
# NOTE: `seq_lenght` (sic) keeps its misspelling because it is part of the
# function signature; it is the chunk length used to step through the data.
def evaluate(data_source, model, corpus, batch_size, seq_lenght):
    """Return the average negative log-likelihood of `model` on `data_source`.

    The data is walked in chunks of `seq_lenght` rows. Each chunk's mean
    NLL is weighted by `len(chunk)` and the sum is divided by
    `len(data_source)`.

    Args:
        data_source: Batchified data; indexed along dimension 0
            (presumably shaped (num_steps, batch_size) -- confirm against
            `batchify`).
        model: Model providing `eval()`, `init_hidden(batch_size)` and a
            forward call returning `(log_prob, hidden)`; `log_prob` must
            have at least three dimensions, the third being the class axis.
        corpus: Unused; kept so existing callers keep working.
        batch_size: Batch size handed to `model.init_hidden`.
        seq_lenght: Number of rows requested from `get_batch` per chunk.

    Returns:
        The weighted average loss as a Python float (0.0 when
        `data_source` has fewer than two rows, so no chunk is evaluated).
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)
    # No gradients are needed here; skipping autograd bookkeeping avoids
    # building a graph for every chunk.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_lenght):
            inputs, targets = get_batch(data_source, i, seq_lenght, evaluation=True)
            targets = targets.view(-1)

            log_prob, hidden = model(inputs, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets)

            # .item() turns the 0-dim loss tensor into a float; indexing such
            # a tensor with [0] is an error on current PyTorch.
            total_loss += loss.item() * len(inputs)

            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)