## Learning Word Vectors with TensorFlow: Hyperparameter Tuning
Patrick Coady (pcoady@alum.mit.edu)

In [None]:
from wordvector import WordVector
from windowmodel import WindowModel
import docload
from plot_util import plot_results

import numpy as np
import sklearn.utils

In [None]:
# Source texts for the corpus (paths are relative to the notebook directory).
files = [
    '../data/adventures_of_sherlock_holmes.txt',
    '../data/hound_of_the_baskervilles.txt',
    '../data/sign_of_the_four.txt',
]

# Load the documents into a word array plus its dictionary, along with
# line/word counts for the summary below.
# NOTE(review): gutenberg=True presumably enables Project Gutenberg-specific
# preprocessing -- confirm in docload.build_word_array.
word_array, dictionary, num_lines, num_words = docload.build_word_array(
    files,
    vocab_size=50000,
    gutenberg=True,
)
print('Document loaded and processed: {} lines, {} words.'.format(
    num_lines, num_words))

In [None]:
# Build the (input, target) training arrays from the word array.
x, y = WindowModel.build_training_set(word_array)

# Shuffle once (fixed seed for reproducibility), then hold out the last 10%
# for validation.
x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
split = round(x_shuf.shape[0]*0.9)
x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
# Both partitions must be sliced from the *shuffled* arrays. Slicing the
# training set from the unshuffled x/y (as before) made it overlap with the
# validation rows, leaking validation examples into training.
x_train, y_train = (x_shuf[:split, :], y_shuf[:split, :])

## Embedding Weights Initialization
The embedding matrix is initialized from a zero-mean uniform distribution. Try uniform distribution widths of: {0.02, 0.2, 2}

***Result: Width of 2 needed to reliably avoid stalling on plateau***

In [None]:
# Sweep the 'embed_noise' setting of the embedding initializer. Each value is
# trained twice so that run-to-run variation shows up in the results.
# NOTE(review): the notes above quote distribution widths {0.02, 0.2, 2};
# presumably width = 2 * embed_noise -- confirm in WindowModel.
results_list = []
count = 0
vocab_size = np.max(x) + 1  # loop-invariant, so computed once up front
for embed_noise, dummy in [(noise, rep)
                           for noise in [0.01, 0.1, 1]
                           for rep in range(2)]:
    print('{}) embed noise = {}, run #{}'.format(count, embed_noise, dummy))
    count += 1
    graph_params = dict(batch_size=32,
                        vocab_size=vocab_size,
                        embed_size=128,
                        hid_size=128,
                        neg_samples=64,
                        learn_rate=0.002,
                        embed_noise=embed_noise,
                        optimizer='RMSProp')
    model = WindowModel(graph_params)
    results = model.train(x_train, y_train, x_val, y_val,
                          epochs=80, verbose=False)
    # Keep the parameters alongside the results for later plotting.
    results_list.append((graph_params, results))

In [None]:
# Plot the (graph_params, results) pairs collected by the embed-noise sweep.
plot_results(results_list)

## Weight Initialization to Output Softmax:
### Standard Normal or Truncated

***Result: Both distributions work equivalently ... use standard normal.***

In [None]:
# Compare the two settings of the 'trunc_norm' flag (truncated vs. standard
# normal, per the section heading above). Each setting is trained twice so
# that run-to-run variation shows up in the results.
results_list2 = []
count = 0
vocab_size = np.max(x) + 1  # loop-invariant, so computed once up front
for trunc_norm, dummy in [(flag, rep)
                          for flag in [True, False]
                          for rep in range(2)]:
    print('{}) truncated normal? {}, run #{}'.format(count, trunc_norm, dummy))
    count += 1
    graph_params = dict(batch_size=32,
                        vocab_size=vocab_size,
                        embed_size=128,
                        hid_size=128,
                        neg_samples=64,
                        learn_rate=0.002,
                        embed_noise=1.0,
                        trunc_norm=trunc_norm,
                        optimizer='RMSProp')
    model = WindowModel(graph_params)
    results = model.train(x_train, y_train, x_val, y_val,
                          epochs=80, verbose=False)
    # Keep the parameters alongside the results for later plotting.
    results_list2.append((graph_params, results))

In [None]:
# Plot the (graph_params, results) pairs collected by the trunc_norm sweep.
plot_results(results_list2)

## Weight Initialization to Hidden Layer

sigma = {0.1, 1, 10} * 1/sqrt(# of node inputs)

***Result: A factor of 10 is too large; 1 and 0.1 are both acceptable. Use 0.3 (a value between the two acceptable settings).***

In [None]:
# Sweep the 'hid_noise' factor for the hidden-layer weight initialization
# (per the notes above: sigma = factor * 1/sqrt(# of node inputs)). Each
# factor is trained twice so that run-to-run variation shows up.
# NOTE(review): this sweep trains for 50 epochs, whereas the other sweeps in
# this notebook use 80 -- confirm that the difference is intentional.
results_list3 = []
count = 0
vocab_size = np.max(x) + 1  # loop-invariant, so computed once up front
for hid_noise, dummy in [(factor, rep)
                         for factor in [0.1, 1, 10]
                         for rep in range(2)]:
    print('{}) hidden layer sigma {}, run #{}'.format(count, hid_noise, dummy))
    count += 1
    graph_params = dict(batch_size=32,
                        vocab_size=vocab_size,
                        embed_size=128,
                        hid_size=128,
                        neg_samples=64,
                        learn_rate=0.002,
                        embed_noise=1.0,
                        hid_noise=hid_noise,
                        optimizer='RMSProp')
    model = WindowModel(graph_params)
    results = model.train(x_train, y_train, x_val, y_val,
                          epochs=50, verbose=False)
    # Keep the parameters alongside the results for later plotting.
    results_list3.append((graph_params, results))

In [None]:
# Plot the (graph_params, results) pairs collected by the hid_noise sweep.
plot_results(results_list3)