In [14]:
import sys
import tensorflow as tf
import numpy as np
import re 

## Some Helper Functions

#### Read, Clean, and List Words to Use
I need to get a vocab size to use in my computational graph, so I need to have a list of words to use to get the length. I'll use this to preprocess the corpus. 

In [15]:
def read_in(file):
    with open(filepath, "r") as filetext:
        word_ls = filetext.read().replace("\n", "<eos>").split()
    # Clean the vocab from the list and return clean list
    random = re.compile(r'[.a-zA-Z0-9]')
    return [i for i in ls_words if (random.search(i) or i == '<eos>')]

#### Make Batch Sizes

An epoch is composed of one or more batches. Batch size is a hyperparameter of gradient descent that controls the number of training samples to work through before the model's internal parameters are updated, while epochs is also a hyperparameter of gradient descent that controls the number of complete passes through the dataset. 

In [16]:
def make_batch(data, batch_size, num_steps):
    x_data = []
    y_data = []
    for i in range(len(data)):
        if i > num_steps - 1:
            x_data.append(data[i - num_steps:i])
            y_data.append(data[i])
    batches = int(len(x_data) / batch_size)
    batch_out = list()
    for i in range(batches):
        # Per each batch
        start_i = batch_size
        end_i = start_i + batch_size
        x_values = x_data[start_i:end_i]
        y_values = y_data[start_i:end_i]
        batch_out.append([x_values, v_values])
    return batch_out

#### Split Up the Brown Corpus Into Workable Bites
For this, I'm actually going away from Bengio's implementation (might come back to bite me, but meh). Instead of 800,000 words in the training dataset, I'm only going to put in half of that. 

In [17]:
def split_brown():
    # retrieving the brown.txt file
    with open('data/brown.txt') as file:
        brown_list = file.read().split()
        
    # creating the training dataset
    training_file = open("data/b_train.txt", "w")
    training_brown = ' '.join(brown_list[:400000])
    training_file.write(training_brown)
    training_file.close()
    
    # creating the validation dataset
    validation_file = open("data/b_valid.txt", "w")
    validation_brown = ' '.join(brown_list[400000:500000])
    validation_file.write(training_brown)
    validation_file.close()
    
    # creating the testing dataset
    testing_file = open("data/b_test.txt", "w")
    testing_brown = ' '.join(brown_list[500000:600000])
    testing_file.write(testing_brown)
    testing_file.close()

## Creating a Configuration 

I found it easier to use `sys.argv[]` to create command lines than `argparse`, which I saw several others do. With sys.argv, it takes in all extra strings that come after the initial `python ____.py` as list items, and I can just assign variables to the item in the list.

For the configurations, I'm going off of what Bengio has listed on page 1149 for his comparative results. For this assignment, we were instructed to compare our own results from MLP1, MLP3, MLP5, MLP7 and MLP9 for both the Brown and WikiText corpora. I just made each configuration a dictionary of dictionaries, with each key being the name of the configuration, followed by the second key being the name of the variable, with the value being what Bengio assigned them.

In [18]:
mode = sys.argv[1]
corpus = sys.argv[2]
config = sys.argv[3]

potential_modes = ['train', 'restore']

potential_corpora = ['brown', 'wiki'] 

potential_configurations = {
    'MLP1': {'num_steps':5, 'hidden_units':50, 'word_features':60, 'direct': True},
    'MLP3': {'num_steps':5, 'hidden_units':0, 'word_features':60, 'direct': True},
    'MLP5': {'num_steps':5, 'hidden_units':50, 'word_features':30, 'direct': True},
    'MLP7': {'num_steps':3, 'hidden_units':50, 'word_features':30, 'direct': True},
    'MLP9': {'num_steps':5, 'hidden_units':100, 'word_features':30, 'direct': False}
}

IndexError: list index out of range

## Preprocessing the Corpus

## Vectorizing the Words

## Initializing the Parameters of the Graph

In [21]:
def __init__(self):
    self.batch_size = 128
    self.word_features = run_configuration['word_features']
    self.num_steps = run_configuration['num_steps']
    self.hidden_units = run_configuration['hidden_units']
    self.direct_connections = run_configuration['direct']
    
    # to use a GPU or CPU 
    if tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
        self.device = '/gpu:0'
        print('Currently using GPU device to run code')
    # need to figure out how to run this on a TPU
    else:
        self.device = '/cpu:0'
        print('There is no GPU available, using CPU device to run code')
    
    # for the stochastic gradient descent
    self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False)
    slr = 0.001
    self.learning_rate = tf.train.exponential_decay(slr, self.global_step, 100, 0.96, staircase=True)

In [22]:
def placeholders():
    self.training_inputs = tf.placeholder(tf.float32, shape=[self.batch_size, self.num_steps])
    self.training_labels = tf.placeholder(tf.float32, shape=[batch_size, [None]])

In [23]:
def embeddings(self):
    with tf.device(self.device):
        self.embedding_matrix = tf.Variable(tf.random_uniform([V, self.word_features], -1.0, 1.0))
        # might need to flatten training_inputs 
        self.embedding_inputs = tf.nn.embed_lookup(embedding_matrix, training_inputs)
        self.xt = tf.reshape(embed_inputs, (self.batch_size, self.z))

In [27]:
def loss(self):
    self.z = self.word_features * self.num_steps
    with tf.device(self.device):
        # output biases
        b = tf.Variable(tf.random_uniform([V]))
        # hidden layer biases
        d = tf.Variable(tf.random_uniform([self.hidden_units]))

        # W (word features to output weights)
        if direct_connections == True:
            W = tf.Variable(tf.random_normal([z, V], -1.0, 1.0))
        else:
            W = tf.Variable(np.zeros([z, V]), trainable=False)

        # H (hidden layer weights)
        H = tf.Variable(tf.random_uniform([z, self.hidden_units], -1.0, 1.0))

        # U (hidden-to-output weights)
        U = tf.Variable(tf.random_uniform([self.hidden_units, V]))

        # building the graph (i.e. just a set of matrix multiplications)
        hidden = tf.tanh(tf.matmul(self.xt, H) + d)
        hidden2out = tf.matmul(hidden, U) + b
        self.logits = tf.matmul(self.xt, W) + hidden2out
        self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.one_hot(self.training_labels, V))

In [28]:
# building the optimizer
def optimizer(self):
    with tf.device(self.device):
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

## Training w/ `tf.Session`

I can use the `feed_dict` object to feed values to the placeholders that I defined earlier in the graph. This will allow TF to compute the `loss` variable, but I'll also need to add an optimizer (very easy w/ TensorFlow). 

In [29]:
def summarize(self):
    # For tensorboard visualization, run main.py then in command line prompt type:
    # tensorboard --logdir="./graphs" --port 6006
    # then open browser to http://localhost:6006/
    tf.summary.scalar("loss", self.loss)
    summary_op = tf.summary.merge_all()

In [30]:
def build_graph(self):
    self.placeholders()
    self.embeddings()
    self.loss()
    self.optimizer()
    self.summarize()

## Restoring the Model 

## Testing the Model 

## Seeing the Results 

#### Creating a Plotting Visualization 

In [None]:
if __name__ == "__main__":
    if mode not in potential_modes or corpus not in potential_corpora or config not in potential_configurations:
        print("Please enter in a valid input as an argument")
        sys.exit()
    elif mode == 'train':
        if corpora == 'brown':
            split_brown()
            t_path = "data/b_train.txt"
            v_path = "data/b_valid.txt"
            s_path = "data/b_test.txt"
        elif corpora == 'wiki':
            t_path = "data/w_train.txt"
            v_path = "data/w_valid.txt"
            s_path = "data/w_test.txt"
            
        # get the configuration
        run_configuration = potential_configurations[config]
        # get the preprocessed training text
        pp_tt = Preprocessor(t_path)
        # get the size of the vocabulary
        V = pp_tt.V
        # training data
        training_data = pp_tt.generate_data(t_path)
        # validation data
        validation_data = pp_tt.generate_data(v_path)
        # testing data
        testing_data = pp_tt.generate(s_path)
        # creating the training_acc & training_cost
        t_acc, t_loss = [.1] * 10, [7] * 10
        # calling the model object
        model = Bengio()
        # calling the train_model() method 
        model.build_graph(training_data, validation_data)
        # visualize it!
        plot(t_acc[10:], t_loss[10:])
        
    elif mode == 'restore':
        if corpora == 'brown':
            split_brown()
            t_path = "data/b_train.txt"
            v_path = "data/b_valid.txt"
            s_path = "data/b_test.txt"
        elif corpora == 'wiki':
            t_path = "data/w_train.txt"
            v_path = "data/w_valid.txt"
            s_path = "data/w_test.txt"
        
        # get the configuration
        run_configuration = potential_configurations[config]
        # get the preprocessed training text
        pp_tt = Preprocessor(t_path)
        # get the size of the vocabulary
        V = pp_tt.V
        # testing data
        testing_data = pp_tt.generate(s_path)
        # calling the model object
        model = Bengio()
        # restore the model 
        model.restore_model('../models/' + corpus + '_' + config + '.ckpt')