In [8]:
import csv
import  itertools 
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The maximum of ``x`` is subtracted before exponentiating so the largest
    exponent is 0, which keeps ``np.exp`` from overflowing on large inputs.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = np.sum(exponentials)
    return exponentials / normaliser

def save_model_parameters_theano(outfile, model):
    """Write the model's ``U``, ``V`` and ``W`` parameters to an ``.npz`` archive.

    Each parameter is read through its ``get_value()`` accessor and stored in
    ``outfile`` under the key of the same name using ``np.savez``.
    """
    parameters = {name: getattr(model, name).get_value() for name in ("U", "V", "W")}
    np.savez(outfile, **parameters)
    print("Saved model parameters to %s." % outfile)

def load_model_parameters_theano(path, model):
    """Load ``U``, ``V`` and ``W`` from an ``.npz`` archive into ``model``.

    Parameters
    ----------
    path : file path (or file-like object) accepted by ``np.load``.
    model : object whose ``U``, ``V`` and ``W`` attributes expose
        ``set_value(array)``.

    Side effects
    ------------
    Sets ``model.hidden_dim`` and ``model.word_dim`` from the shape of ``U``
    (rows and columns respectively) and pushes the three arrays into the
    model through ``set_value``.
    """
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager makes sure it is closed once the arrays have been read.
    with np.load(path) as npzfile:
        U, V, W = npzfile["U"], npzfile["V"], npzfile["W"]
    hidden_dim, word_dim = U.shape[0], U.shape[1]
    model.hidden_dim = hidden_dim
    model.word_dim = word_dim
    model.U.set_value(U)
    model.V.set_value(V)
    model.W.set_value(W)
    print("Loaded model parameters from %s. hidden_dim=%d word_dim=%d" % (path, hidden_dim, word_dim))

In [1]:
import csv
import nltk
nltk.download('punkt')
import itertools

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and wrap every sentence in SENTENCE_START / SENTENCE_END tokens.
print("Reading CSV file...")
# newline='' is what the csv module asks for so that line breaks embedded in
# quoted fields are handled by the csv parser; the encoding is given explicitly
# so decoding does not depend on the platform's default locale.
with open('mldata/reddit-comments-2015-08.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    # Split each lower-cased comment body into sentences and flatten the result.
    sentences = itertools.chain(*[nltk.sent_tokenize(row['body'].lower()) for row in reader])
    # Materialise the list inside the `with` block, while the file is still open.
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

# Keep the (vocabulary_size - 1) most common words; the last slot of the
# vocabulary is reserved for the unknown token.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = {w: i for i, w in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token.
# Built as a new list so no loop variables are left behind in the notebook namespace.
tokenized_sentences = [
    [w if w in word_to_index else unknown_token for w in sent]
    for sent in tokenized_sentences
]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

[nltk_data] Downloading package punkt to /home/markroxor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Reading CSV file...
Parsed 79170 sentences.
Found 65441 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'crank' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'


In [2]:
import numpy as np
# Create the training data: each input is a sentence without its last token and
# each target is the same sentence without its first token (shifted by one).
# Sentences have different lengths, so the outer arrays hold Python lists;
# dtype=object is required because NumPy >= 1.24 raises ValueError when asked
# to build an array from ragged nested sequences without it.
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

In [3]:
# Print one training example: the decoded words followed by the raw indices,
# first for the input sequence and then for the target sequence.
x_example, y_example = X_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in x_example), x_example))
print("\ny:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in y_example), y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 858, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 858, 54, 25, 34, 69, 1]


In [4]:
class RNNNumpy:
    """Parameter container for a recurrent network over word indices.

    Attributes set by the constructor:
      word_dim      -- value passed as ``word_dim``
      hidden_dim    -- value passed as ``hidden_dim``
      bptt_truncate -- value passed as ``bptt_truncate``
      U -- array of shape (hidden_dim, word_dim)
      V -- array of shape (word_dim, hidden_dim)
      W -- array of shape (hidden_dim, hidden_dim)
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Keep the configuration on the instance.
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Each matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n
        # is the size of the dimension the matrix reads from (its column count).
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

In [5]:
def forward_propagation(self, x):
    """Run the network over the word-index sequence ``x``.

    Returns ``[o, s]``:
      o -- array of shape (len(x), word_dim); row t is the softmax output at step t.
      s -- array of shape (len(x) + 1, hidden_dim); row t is the hidden state at
           step t. The extra last row stays all-zero and serves as the initial
           state, because step 0 reads ``s[-1]``.
    """
    num_steps = len(x)
    # All hidden states are kept because the caller may need them later
    # (e.g. for backpropagation). np.zeros already provides the zero initial state.
    hidden_states = np.zeros((num_steps + 1, self.hidden_dim))
    outputs = np.zeros((num_steps, self.word_dim))
    for step, word_index in enumerate(x):
        # Picking column word_index of U is the same as multiplying U by a one-hot vector.
        pre_activation = self.U[:, word_index] + self.W.dot(hidden_states[step - 1])
        hidden_states[step] = np.tanh(pre_activation)
        outputs[step] = softmax(self.V.dot(hidden_states[step]))
    return [outputs, hidden_states]

# Attach the function defined above to RNNNumpy so it can be called as a method.
RNNNumpy.forward_propagation = forward_propagation

In [6]:
def predict(self, x):
    """Return, for each time step of ``x``, the index of the highest output score.

    Runs ``self.forward_propagation(x)`` and takes the arg-max along axis 1 of
    the output matrix; the hidden states are ignored.
    """
    output_probs, _hidden_states = self.forward_propagation(x)
    best_indices = np.argmax(output_probs, axis=1)
    return best_indices

# Attach the function defined above to RNNNumpy so it can be called as a method.
RNNNumpy.predict = predict


In [12]:
# Try the forward pass on one training sentence with a freshly seeded model.
np.random.seed(10)
model = RNNNumpy(word_dim=vocabulary_size)
o, s = model.forward_propagation(x=X_train[10])

# Show the input indices, the shape of the output matrix, and the outputs themselves.
print(X_train[10], o.shape, o, sep="\n")

[0, 72, 63, 13, 124, 5, 26, 1128, 208, 5, 324, 3, 329, 4, 112, 32, 75, 7, 4745, 4, 8, 84, 52, 9, 7, 3155, 1021, 492, 7535, 8, 133, 48, 3096, 4, 10, 95, 51, 4, 128, 17, 37, 314, 577, 2, 40]
(45, 8000)
[[0.00012408 0.0001244  0.00012603 ... 0.00012515 0.00012488 0.00012508]
 [0.00012536 0.00012582 0.00012436 ... 0.00012482 0.00012456 0.00012451]
 [0.00012387 0.0001252  0.00012474 ... 0.00012559 0.00012588 0.00012551]
 ...
 [0.00012471 0.0001243  0.00012524 ... 0.00012475 0.00012522 0.00012623]
 [0.00012564 0.00012431 0.00012481 ... 0.0001244  0.00012609 0.00012486]
 [0.00012447 0.00012509 0.00012469 ... 0.00012473 0.00012506 0.00012641]]


In [17]:
# Length of the example sentence, followed by its word indices.
print(len(X_train[10]), X_train[10], sep="\n")

45
[0, 72, 63, 13, 124, 5, 26, 1128, 208, 5, 324, 3, 329, 4, 112, 32, 75, 7, 4745, 4, 8, 84, 52, 9, 7, 3155, 1021, 492, 7535, 8, 133, 48, 3096, 4, 10, 95, 51, 4, 128, 17, 37, 314, 577, 2, 40]


In [37]:
# Highest-scoring word index at every time step of the example sentence.
predictions = model.predict(x=X_train[10])
print(predictions.shape, predictions, sep="\n")

(45,)
[1284 5221 7653 7430 1013 3562 7366 1874  224 6601 7299 6722 6892 3198
 4480 5853 2926  261  489  760 1810 5376 4146  477 7051 5981 1549 3765
 2493 1835 1900 4323 2579 5879 4864 5132 6569 2800 2752 6821 4437 7021
 3943 6912 3922]


In [38]:
def calculate_total_loss(self, x, y):
    """Return the summed negative log-probability of the target words.

    ``x`` and ``y`` are parallel sequences of sentences: for every index in
    ``range(len(y))`` the network is run on ``x[i]`` and the output
    probability at each target index in ``y[i]`` is read, its log taken,
    and the negated sum added to the running total.
    """
    total_loss = 0
    for idx in range(len(y)):
        targets = y[idx]
        probabilities, _states = self.forward_propagation(x[idx])
        # Keep only the probability the model assigned to each correct word.
        picked = probabilities[np.arange(len(targets)), targets]
        total_loss -= np.sum(np.log(picked))
    return total_loss

def calculate_loss(self, x, y):
    """Return the average loss per target word.

    The total loss from ``self.calculate_total_loss(x, y)`` is divided by the
    total number of target words, i.e. the sum of ``len(y_i)`` over all
    sentences in ``y`` (not by the number of sentences).
    """
    # Use the builtin sum: passing a generator to np.sum is deprecated by NumPy.
    num_words = sum(len(y_i) for y_i in y)
    return self.calculate_total_loss(x, y) / num_words

# Attach both loss functions defined above to RNNNumpy so they can be called as methods.
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [39]:
# Limit to 1000 examples to save time
print ("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print ("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.987197
Actual loss: 8.987406


In [40]:
def bptt(self, x, y):
    """Compute parameter gradients for one sentence by backpropagation through time.

    ``x`` is the input index sequence and ``y`` the target index sequence.
    For every time step the error is propagated back through at most
    ``self.bptt_truncate`` earlier steps (plus the step itself).

    Returns ``[dLdU, dLdV, dLdW]``, arrays with the shapes of ``self.U``,
    ``self.V`` and ``self.W``.
    """
    num_steps = len(y)
    outputs, states = self.forward_propagation(x)
    # Gradient accumulators, one per parameter matrix.
    grad_U = np.zeros(self.U.shape)
    grad_V = np.zeros(self.V.shape)
    grad_W = np.zeros(self.W.shape)
    # Output error: the outputs with 1 subtracted at each target index.
    # This is done in place, so `outputs` itself becomes the error matrix.
    output_error = outputs
    output_error[np.arange(num_steps), y] -= 1.
    # Walk over the outputs from the last time step to the first.
    for t in range(num_steps - 1, -1, -1):
        grad_V += np.outer(output_error[t], states[t])
        # Error at the hidden layer of step t (tanh derivative is 1 - s**2).
        hidden_error = self.V.T.dot(output_error[t]) * (1 - (states[t] ** 2))
        # Go back from step t to the earliest step allowed by the truncation.
        earliest = max(0, t - self.bptt_truncate)
        for step in range(t, earliest - 1, -1):
            # For step 0 this is states[-1], the extra last row of the state matrix.
            previous_state = states[step - 1]
            grad_W += np.outer(hidden_error, previous_state)
            grad_U[:, x[step]] += hidden_error
            # Push the error one more step back in time.
            hidden_error = self.W.T.dot(hidden_error) * (1 - previous_state ** 2)
    return [grad_U, grad_V, grad_W]

# Attach the function defined above to RNNNumpy so it can be called as a method.
RNNNumpy.bptt = bptt

In [43]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    """Compare the gradients from ``self.bptt`` with finite-difference estimates.

    Parameters
    ----------
    x, y : input and target index sequences of a single sentence.
    h : perturbation used in the central difference (f(p+h) - f(p-h)) / (2h).
    error_threshold : largest accepted relative error
        |backprop - estimate| / (|backprop| + |estimate|).

    Prints progress for the parameters ``U``, ``V`` and ``W`` in turn. On the
    first element whose relative error exceeds the threshold it prints the
    details and returns immediately. Always returns ``None``.
    """
    # Gradients from backpropagation; these are the values being verified.
    # Everything below works on `self`, never on a notebook-level `model`,
    # so the check is valid for whichever instance it is called on.
    bptt_gradients = self.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter array from the instance, e.g. self.W
        parameter = getattr(self, pname)
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so it can be restored afterwards.
            original_value = parameter[ix]
            try:
                # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
                parameter[ix] = original_value + h
                gradplus = self.calculate_total_loss([x], [y])
                parameter[ix] = original_value - h
                gradminus = self.calculate_total_loss([x], [y])
            finally:
                # Restore the parameter even if the loss computation raised.
                parameter[ix] = original_value
            estimated_gradient = (gradplus - gradminus) / (2 * h)
            # The gradient for this element calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # Relative error: |x - y| / (|x| + |y|). When both gradients are
            # exactly zero they agree, so report 0 instead of computing 0/0.
            difference = np.abs(backprop_gradient - estimated_gradient)
            scale = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            relative_error = difference / scale if scale > 0 else 0.0
            # If the error is too large, fail the gradient check
            if relative_error > error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return
            it.iternext()
        print("Gradient check for parameter %s passed." % (pname))

# Attach the function defined above to RNNNumpy so it can be called as a method.
RNNNumpy.gradient_check = gradient_check

# The check evaluates the loss twice per parameter element, so it is run on a
# much smaller vocabulary to keep the number of evaluations manageable.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(word_dim=grad_check_vocab_size, hidden_dim=10, bptt_truncate=1000)
model.gradient_check(x=[0, 1, 2, 3], y=[1, 2, 3, 4])

Performing gradient check for parameter U with size 1000.




Gradient check for parameter U passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [44]:
def numpy_sdg_step(self, x, y, learning_rate):
    """Perform one step of SGD on a single (x, y) sentence pair.

    The gradients come from ``self.bptt(x, y)``; each of ``U``, ``V`` and
    ``W`` is then decreased in place by ``learning_rate`` times its gradient.
    """
    grad_U, grad_V, grad_W = self.bptt(x, y)
    for name, gradient in (("U", grad_U), ("V", grad_V), ("W", grad_W)):
        parameter = getattr(self, name)
        # In-place update followed by re-binding, exactly like `self.X -= step`.
        parameter -= learning_rate * gradient
        setattr(self, name, parameter)

# Expose the function defined above on RNNNumpy under the method name `sgd_step`.
RNNNumpy.sgd_step = numpy_sdg_step

In [46]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Outer SGD loop: train ``model`` one sentence at a time.

    Parameters
    ----------
    model : the RNN model instance; must provide ``calculate_loss(X, y)`` and
        ``sgd_step(x, y, learning_rate)``.
    X_train : the training data set (sequence of input sentences).
    y_train : the training data labels (sequence of target sentences).
    learning_rate : initial learning rate for SGD; it is halved whenever an
        evaluated loss is higher than the previously evaluated one.
    nepoch : number of times to iterate through the complete dataset.
    evaluate_loss_after : evaluate (and print) the loss every this many epochs.

    Returns
    -------
    list of ``(num_examples_seen, loss)`` tuples, one per loss evaluation, so
    the caller can plot or inspect the training curve.
    """
    # We keep track of the losses so they can be returned and plotted later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if epoch % evaluate_loss_after == 0:
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)
            # Flush so progress shows up immediately during long runs.
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    return losses

In [47]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

140 ms ± 5.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [67]:
# Train a fresh, seeded model on the first 100 sentences only, evaluating the
# loss after every epoch, to see how training behaves on a small subset.
np.random.seed(10)
model = RNNNumpy(word_dim=vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], evaluate_loss_after=1, nepoch=100)

2018-06-27 17:02:50: Loss after num_examples_seen=0 epoch=0: 8.987250
2018-06-27 17:02:56: Loss after num_examples_seen=100 epoch=1: 8.975955
2018-06-27 17:03:03: Loss after num_examples_seen=200 epoch=2: 8.959699
2018-06-27 17:03:09: Loss after num_examples_seen=300 epoch=3: 8.929399
2018-06-27 17:03:16: Loss after num_examples_seen=400 epoch=4: 8.845521
2018-06-27 17:03:23: Loss after num_examples_seen=500 epoch=5: 6.778843
2018-06-27 17:03:30: Loss after num_examples_seen=600 epoch=6: 6.263977
2018-06-27 17:03:36: Loss after num_examples_seen=700 epoch=7: 5.999944
2018-06-27 17:03:43: Loss after num_examples_seen=800 epoch=8: 5.829725
2018-06-27 17:03:49: Loss after num_examples_seen=900 epoch=9: 5.714151
2018-06-27 17:03:56: Loss after num_examples_seen=1000 epoch=10: 5.630104
2018-06-27 17:04:02: Loss after num_examples_seen=1100 epoch=11: 5.564262
2018-06-27 17:04:10: Loss after num_examples_seen=1200 epoch=12: 5.510284
2018-06-27 17:04:17: Loss after num_examples_seen=1300 epoch

In [62]:
# Print a few (input, target) index sequences side by side.
for pair in zip(X_train[100:105], y_train[100:105]):
    print(*pair)

[0, 50, 17, 154, 83, 7, 3413, 561, 4578, 4, 172, 453, 9, 5417, 13, 813, 473, 2] [50, 17, 154, 83, 7, 3413, 561, 4578, 4, 172, 453, 9, 5417, 13, 813, 473, 2, 1]
[0, 92, 5688, 7999, 17, 7999, 13, 7999, 5, 3, 2131, 2] [92, 5688, 7999, 17, 7999, 13, 7999, 5, 3, 2131, 2, 1]
[0, 25, 13, 4751, 33, 11, 13, 328, 515, 9, 7999, 2] [25, 13, 4751, 33, 11, 13, 328, 515, 9, 7999, 2, 1]
[0, 4751, 7999, 4, 4751, 4957, 2] [4751, 7999, 4, 4751, 4957, 2, 1]
[0, 25, 269, 87, 16, 1776, 7999, 3, 145, 111, 2] [25, 269, 87, 16, 1776, 7999, 3, 145, 111, 2, 1]


In [66]:
def generate_sentence(model):
    """Sample one sentence from ``model`` and return it as a list of words.

    Generation starts from the sentence-start token and repeatedly samples the
    next word from the model's output distribution for the last position,
    until the sentence-end token is drawn. The unknown token is never
    emitted: a draw that lands on it is simply repeated.

    The returned list excludes the start and end tokens. Relies on the
    notebook-level ``word_to_index`` / ``index_to_word`` vocabularies and the
    special-token constants.
    """
    # Look the special-token indices up once instead of on every iteration.
    start_index = word_to_index[sentence_start_token]
    end_index = word_to_index[sentence_end_token]
    unknown_index = word_to_index[unknown_token]

    new_sentence = [start_index]
    # Repeat until we get an end token
    while new_sentence[-1] != end_index:
        # A single forward pass per generated word; only the last row of the
        # output (the distribution for the next word) is used.
        next_word_probs, _states = model.forward_propagation(new_sentence)
        sampled_word = unknown_index
        # We don't want to sample unknown words
        while sampled_word == unknown_index:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    return [index_to_word[idx] for idx in new_sentence[1:-1]]
 
num_sentences = 10
senten_min_length = 7

for i in range(num_sentences):
    # We want long sentences, not sentences with one or two words:
    # keep sampling until one reaches the minimum length.
    sent = generate_sentence(model)
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))

play it not the have out notifications have and charles n't they all expanded the adventures to the ’ or http observe who . the the lean of it where there and to on the editor for it it
kickstarter valuable skyrim prison or if zone joe going so the .
finale the tears the and n't with .
detection all is to but new use uncle have on ’ is weapon of his and require .
point fighting shaped been serve fuel to than the the sound i of cause some the it consumers have did the the 'm is the and 2016 bikes reports hope reports the /r/askscience mediocre brisbane n't the be when would walls the `` amazingly .
eventually it combo like the we units our to aus to entry the them rick was annoying the
jacket have a not the number '' for puts do for
studied rumors there after would n't ) consequence them in overwhelmed or on not 300 than be of the include i obligation the
pen thursday not on and oblivious judge going the guitar & get had still ; them mod and crap a are presented the everywhere the film 

In [68]:
def generate_sentence(model):
    """Sample one sentence from ``model`` and return it as a list of words.

    NOTE: this re-defines the ``generate_sentence`` already defined in an
    earlier cell of this notebook; the later definition is the one in effect.

    Generation starts from the sentence-start token and repeatedly samples the
    next word from the model's output distribution for the last position,
    until the sentence-end token is drawn. The unknown token is never
    emitted: a draw that lands on it is simply repeated.

    The returned list excludes the start and end tokens. Relies on the
    notebook-level ``word_to_index`` / ``index_to_word`` vocabularies and the
    special-token constants.
    """
    # Look the special-token indices up once instead of on every iteration.
    start_index = word_to_index[sentence_start_token]
    end_index = word_to_index[sentence_end_token]
    unknown_index = word_to_index[unknown_token]

    new_sentence = [start_index]
    # Repeat until we get an end token
    while new_sentence[-1] != end_index:
        # A single forward pass per generated word; only the last row of the
        # output (the distribution for the next word) is used.
        next_word_probs, _states = model.forward_propagation(new_sentence)
        sampled_word = unknown_index
        # We don't want to sample unknown words
        while sampled_word == unknown_index:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    return [index_to_word[idx] for idx in new_sentence[1:-1]]
 
num_sentences = 10
senten_min_length = 7

for i in range(num_sentences):
    # Discard short samples: draw sentences until one has at least
    # senten_min_length words, then print it.
    while True:
        sent = generate_sentence(model)
        if len(sent) >= senten_min_length:
            break
    print(" ".join(sent))

the just wo a trigger her and rebates it .
it it is trapped require of vote sellers enter resolution , like year yards 3 young or n't .
ppr gt toss i making be when they do made at be on probably anyone no to otherwise that my .
no never years young utilize players find the and run gun , like even , and my had in but did logic she minimal ability did but i the n't to been year
fucking i out you back do this bloody 3-4 quad-core the would `` really if as waking 's : the whether to if ppr side .
reform gt used you have we the get for n't de of still other kicked more supporting of be rolls with i been defense but turn other , i nearly 'll that to solid the of a close be check this 300 the out .
bonuses person you ) ie anyone like be it and i and be them i regard i remember player other must but require n't mandatory n't not fact not ( existing who `` form , .
outside having ; take an also `` i qb in playing .
i 's judge yards for it before '' 's restricted .
a he the , like in attacking 

In [None]:
X_