In [3]:
import sys
sys.path.append('../code/')

In [4]:
import numpy as np
from rnn import RNN
from runner import Runner
import pandas as pd
from utils import invert_dict, load_lm_dataset, docs_to_indices, seqs_to_lmXY
from rnnmath import fraq_loss
import itertools

In [2]:
# Seed NumPy's global RNG up front so the cells below start from a fixed random state.
np.random.seed(2018)
# Directory containing the vocabulary and corpus files read further down.
data_folder = '../data/'

In [3]:
# Hyperparameter grid for the sweep; every combination of the three lists is trained.
lookbacks = [0, 2, 5]               # values passed to Runner.train as back_steps
hdims = [25, 50]                    # values passed to RNN as hidden_dims
learning_rates = [0.5, 0.1, 0.05]   # values passed to Runner.train as learning_rate

In [4]:
# Experiment sizes shared by the data-loading and training cells.
vocab_size = 2000   # vocabulary entries kept; also used as the RNN's vocab_size / out_vocab_size
train_size = 1000   # number of training sequences kept after slicing
dev_size = 1000     # number of dev sequences kept after slicing
epochs = 10         # number of epochs passed to Runner.train

In [5]:
# get the data set vocabulary
# The file is a whitespace-separated table without a header row: the first column
# becomes the index (the word) and the remaining columns are named 'count' and 'freq'.
# The separator is a raw string so that `\s` reaches the regex engine verbatim instead
# of being treated as an (invalid) string escape sequence.
vocab = pd.read_table(data_folder + "/vocab.wiki.txt", header=None, sep=r"\s+", index_col=0,
                      names=['count', 'freq'], )
# Keep only the first `vocab_size` rows and build both lookup directions.
num_to_word = dict(enumerate(vocab.index[:vocab_size]))
word_to_num = invert_dict(num_to_word)

# calculate loss vocabulary words due to vocab_size
fraction_lost = fraq_loss(vocab, word_to_num, vocab_size)
print(
    "Retained %d words from %d (%.02f%% of all tokens)\n" % (
    vocab_size, len(vocab), 100 * (1 - fraction_lost)))

# Load the training set and turn it into (input, target) sequences.
# NOTE(review): the two trailing `1` arguments are passed straight through to
# docs_to_indices; their meaning is defined in utils — confirm there.
docs = load_lm_dataset(data_folder + '/wiki-train.txt')
S_train = docs_to_indices(docs, word_to_num, 1, 1)
X_train, D_train = seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = load_lm_dataset(data_folder + '/wiki-dev.txt')
S_dev = docs_to_indices(docs, word_to_num, 1, 1)
X_dev, D_dev = seqs_to_lmXY(S_dev)

# Truncate both sets to the configured number of sequences.
X_train = X_train[:train_size]
D_train = D_train[:train_size]
X_dev = X_dev[:dev_size]
D_dev = D_dev[:dev_size]

# q = best unigram frequency from omitted vocab
# this is the best expected loss out of that set
# `vocab` is indexed by word, so the rows at position `vocab_size` and beyond must be
# selected positionally with `.iloc`; a bare integer key on a label-indexed Series
# relies on a deprecated fallback in pandas.
# NOTE(review): this treats row `vocab_size` as the most frequent omitted word, which
# presumes the vocabulary file is sorted by descending frequency — confirm.
q = vocab.freq.iloc[vocab_size] / sum(vocab.freq.iloc[vocab_size:])

Retained 2000 words from 9954 (88.35% of all tokens)



In [6]:
# Axes of the hyperparameter grid, in the order the training loop unpacks them:
# (learning rate, hidden size, lookback).
s = [
    learning_rates,
    hdims,
    lookbacks,
]

In [7]:
# Q2.a
# Grid search: train a fresh RNN for every (learning rate, hidden size, lookback)
# combination and let Runner.train report the dev-set loss after each epoch.

# NOTE (original author): Runner.train had to be modified so that its progress output
# displays in a Jupyter notebook; modifying it is not allowed.

# The grid is built directly from the named hyperparameter lists so this cell does not
# depend on an intermediate variable created in another cell.
for lr, hdim, lookback in itertools.product(learning_rates, hdims, lookbacks):
    # A new model and runner per configuration, so no weights carry over between runs.
    r = Runner(model=RNN(vocab_size=vocab_size, hidden_dims=hdim, out_vocab_size=vocab_size))
    r.train(
        X=X_train,
        D=D_train,
        X_dev=X_dev,
        D_dev=D_dev,
        epochs=epochs,
        learning_rate=lr,
        back_steps=lookback
    )

    # Visual separator between consecutive runs in the cell output.
    print('######################################################################')
    print('######################################################################')


Training model for 10 epochs
training set: 1000 sentences (batch size 100)
Optimizing loss on 1000 sentences
Vocab size: 2000
Hidden units: 25
Steps for back propagation: 0
Initial learning rate set to 0.5, annealing set to 5

calculating initial mean loss on dev set: 7.798662515757492

epoch 1, learning rate 0.5000	instance 1000	epoch done in 38.54 seconds	new loss: 8.15014653696414
epoch 2, learning rate 0.4167	instance 1000	epoch done in 38.02 seconds	new loss: 5.978210046559437
epoch 3, learning rate 0.3571	instance 1000	epoch done in 38.40 seconds	new loss: 5.746970466401593
epoch 4, learning rate 0.3125	instance 379

KeyboardInterrupt: 