In [None]:
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from nlpia.loaders import get_data
# Load the 'wv' resource through nlpia's get_data. tokenize_and_vectorize
# indexes this object by token and skips tokens that raise KeyError
# (presumably pretrained Word2vec embeddings — confirm against nlpia).
word_vectors = get_data(name='wv')

First:
- load the dataset
- grab the labels
- shuffle the examples. 

Then:
- tokenize it and vectorize it again using Word2vec. 
- get labels.
- split it 80/20 into the training and test sets

In [None]:
def preprocess(dirpath):
    '''
    Load positive and negative example files, combine, and shuffle.

    Reads every ``*.txt`` file in ``<dirpath>/pos`` and ``<dirpath>/neg``.

    Args:
        dirpath: directory containing the ``pos`` and ``neg`` subdirectories.

    Returns:
        A shuffled list of ``(label, text)`` tuples, where label is 1 for
        files from ``pos`` and 0 for files from ``neg``. The list is empty
        when no matching files are found.
    '''
    pos_label, neg_label = 1, 0
    data = []

    # Pair each subdirectory with its integer label explicitly. Building the
    # names with an f-string (f"{polarity}_path") only yields the literal text
    # "pos_path" / "pos_label", never the variables' values.
    for subdir, label in (("pos", pos_label), ("neg", neg_label)):
        pattern = os.path.join(dirpath, subdir, "*.txt")
        for filename in glob.glob(pattern):
            with open(filename, "r") as f:
                data.append((label, f.read()))

    # Shuffle in place so a later positional train/test split mixes classes.
    shuffle(data)
    return data

In [None]:
def tokenize_and_vectorize(data, return_expected=True):
    '''
    Convert labelled text samples into sequences of word vectors.

    For every sample, the text at index 1 is split with the Treebank
    tokenizer and each token is looked up in the module-level
    ``word_vectors``; tokens whose lookup raises KeyError are dropped.
    The value at index 0 of each sample is collected as its label, in the
    same order as the vectorized samples.

    Returns ``(vectorized_data, expected)`` when ``return_expected == True``,
    otherwise only ``vectorized_data``.
    '''
    treebank = TreebankWordTokenizer()
    sequences, labels = [], []

    def embed(text):
        # One vector per token that word_vectors knows about.
        vectors = []
        for word in treebank.tokenize(text):
            try:
                vector = word_vectors[word]
            except KeyError:
                continue  # out-of-vocabulary token: skip it
            vectors.append(vector)
        return vectors

    for record in data:
        sequences.append(embed(record[1]))
        labels.append(record[0])

    # Kept as an equality test so non-bool flag values behave as before.
    if return_expected == True:  # noqa: E712
        return sequences, labels
    return sequences

# TODO better OOV handling

In [None]:
# Load the shuffled (label, text) samples and turn them into word-vector
# sequences plus a parallel list of labels.
data = preprocess("./train")
vectorized_data, expected = tokenize_and_vectorize(data)

# Positional 80/20 split: the first 80% of samples train, the rest test.
split = int(len(vectorized_data)*0.8)
x_train, x_test = vectorized_data[:split], vectorized_data[split:]
y_train, y_test = expected[:split], expected[split:]

In [None]:
# Set hyperparameters
embedding_dims = 300  # last axis of the reshaped inputs and of the RNN input_shape
maxlen = 400          # sequence length passed to pad_trunc and used in the reshape
# The next two are not used in the cells shown here; presumably they are
# passed to model.fit later — confirm.
epochs = 2
batch_size = 32

In [None]:
import numpy as np 
# Bring every sample to maxlen entries before building fixed-shape arrays.
# NOTE(review): pad_trunc is not defined in the visible cells — confirm it is
# defined or imported before this cell runs.
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Inputs become (number of samples, maxlen, embedding_dims) arrays.
x_train = np.asarray(x_train).reshape((len(x_train), maxlen, embedding_dims))
x_test = np.asarray(x_test).reshape((len(x_test), maxlen, embedding_dims))

# Labels become 1-D arrays aligned with the inputs.
y_train = np.array(y_train)
y_test = np.array(y_test)

Initialize an empty Keras network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten

# Start from an empty Sequential model; layers are attached to it with
# model.add() in the following cells.
model = Sequential()
# Passed as the first argument to the recurrent layer when it is added.
num_neurons = 50

Keras SimpleRNN:

- pass each input into an RNN 
- for each token: gather the output into a vector

400 = length of each input sequence
x
50 = number of neurons

-->

400 = length of output vector
50 = length of each element in the output vector

Each element in the output vector is a vector containing 50 elements, with one output per neuron, representing the network value at each time step.

$[v_1[50], v_2[50], v_3[50],...,v_{400}[50]]$

If `return_sequences` were set to `False` (the Keras default), only a single 50-dimensional vector, the output at the last time step, would be returned.

>> A good rule of thumb is to try to make your model no more complex than the data you’re training on. Easier said than done, but that idea gives you a rationale for adjusting your parameters as you experiment with your dataset. A more complex model will overfit training data and not generalize well; a model that is too simple will underfit the data and also not have much interesting to say about novel data. You’ll see this discussion referred to as the bias versus variance trade-off. A model that’s overfit to the data is said to have high variance and low bias. And an underfit model is the opposite: low variance and high bias; it gets everything wrong in a consistent way.

In [None]:
# add a recurrent layer
# SimpleRNN is not among the keras.layers names imported earlier in the
# notebook, so bring it into scope here.
from keras.layers import SimpleRNN

# model.add() expects a constructed layer instance, so the layer's arguments
# go to the SimpleRNN constructor rather than to add() itself.
# return_sequences=True keeps the output of every time step, which the
# Flatten layer added afterwards relies on.
model.add(SimpleRNN(
    num_neurons, return_sequences=True,
    input_shape=(maxlen, embedding_dims)))

Usually you don't need to truncate and pad data with an RNN, which can handle variable-length inputs from time step to time step. 
If you need to pass the data into a layer that expects uniform lengths, though, you can't have outputs from the RNN that contain variable lengths.

To avoid overfitting, add a dropout layer that randomly zeroes out a percentage of the values in the full sequences produced with `return_sequences=True`.

Add a linear layer as a classifier: Positive sentiment/1 or Negative sentiment/0.
This layer is a dense layer with one neuron and a sigmoid activation function.
It expects a flat vector of $n$ float elements. 

A feed-forward neural network is agnostic to the order of elements, as long as you are consistent with the order. So, take the 400-element sequence of 50-dimensional outputs and pass it through a `Flatten()` layer, which transforms the 400x50 tensor into a vector of length 20,000.

In [None]:
# Regularize, flatten, then classify: the layers are added in this order.
for layer in (
    Dropout(0.2),  # zero out 20% of inputs randomly
    Flatten(),  # collapse each sample's output into one flat vector
    Dense(1, activation='sigmoid'),  # single sigmoid output unit
):
    model.add(layer)

Flattening the tensor before giving the data to the feed-forward network removes information about the order of the input.

The sequential learning itself happened in the RNN layer. The classifier aggregates errors via backpropagation, encoding the relationship in the network.