In [2]:
import numpy as np
import nltk
import itertools
#nltk.download('all')
from nltk.tokenize import sent_tokenize

# Functions

In [3]:
def softmax_by_col(X):
    """Column-wise softmax.

    X: c x n np.ndarray; every COLUMN is treated as one score vector.
    Returns a new c x n ndarray whose columns each sum to 1; X is not modified.

    Caveat: a 1-D array of length n has only one axis, so the softmax is
    taken over that axis (i.e. over the whole vector).
    """
    # Shift by the per-column maximum so the largest exponent is exp(0) = 1.
    shifted = X - X.max(axis=0)
    exps = np.exp(shifted)
    col_totals = exps.sum(axis=0)
    return exps / col_totals

In [4]:
def softmax(X, by='col'):
    """Softmax over the columns or the rows of X.

    X:  np.ndarray of scores.
    by: 'col' -> each column is a score vector (delegates to softmax_by_col);
        'row' -> each row is a score vector (transpose, column softmax,
                 transpose back).
    Returns an array of the same shape as X.
    Raises ValueError for any other value of `by`.

    Caveat: for a 1-D array `.T` is a no-op, so both modes normalise over the
    single axis.
    """
    if by == 'col':
        return softmax_by_col(X)
    if by == 'row':
        return softmax_by_col(X.T).T
    # A bare `raise` here has no active exception to re-raise and only
    # produces an unrelated error; report the actual problem instead.
    raise ValueError("softmax: `by` must be 'col' or 'row', got %r" % (by,))

In [5]:
def sum_log_loss(y, y_pred):
    """Summed cross-entropy between true and predicted distributions.

    y:      nSamp x dim, each row is the real probability vector.
    y_pred: nSamp x dim, each row is the predicted probability vector.
    Returns the loss summed over all samples (a scalar).
    """
    # 1e-9 keeps log() finite when a predicted probability is exactly 0.
    log_pred = np.log(y_pred + 1e-9)
    per_sample = np.sum(y * log_pred, axis=1)
    return -np.sum(per_sample)

In [6]:
# Sanity check of sum_log_loss on two samples, printed next to -log(0.9),
# the contribution of the first sample alone.
# print(...) with a single argument behaves the same on Python 2 and 3.
y = np.asarray([[0., 1., 0., 0.], [0.3, 0.4, 0.2, 0.1]])
y_pred = np.asarray([[0.1, 0.9, 0., 0.], [0.3, 0.4, 0.2, 0.1]])
print(sum_log_loss(y, y_pred))
print(-np.log(0.9))

1.38521473638
0.105360515658


In [7]:
def one_hot_encode(x, dim):
    """One-hot encode a sequence of indices.

    x:   sequence/list of n integer indices, each in [0, dim).
    dim: length of every one-hot vector.
    Returns e, an n x dim float array with e[i, x[i]] == 1 and 0 elsewhere.
    An empty x yields an array of shape (0, dim).
    """
    n = len(x)
    e = np.zeros((n, dim))
    if n:
        # np.arange works on Python 2 and 3 (xrange is Python-2-only).
        # The guard avoids indexing with an empty list, which NumPy rejects
        # because an empty list converts to a float array.
        e[np.arange(n), x] = 1.
    return e

In [8]:
def one_hot_decode(e):
    """Return the index of the largest entry of e.

    e: a one-hot code, e.g. [0, 0, 1, 0] -> 2.
    np.argmax is called without an axis, so a multi-dimensional input is
    indexed as if flattened.
    """
    hot_index = np.argmax(e)
    return hot_index

In [9]:
def decode_to_sentence(x):
    """Turn a sequence of one-hot encoded words back into text.

    x: a list of encoded vectors (one-hot encoded words).
    Returns a single string with the decoded words joined by spaces.
    Reads the notebook-level `vocab` list, whose entries carry the word at
    position 0.
    """
    words = []
    for e in x:
        entry = vocab[one_hot_decode(e)]
        words.append(entry[0])
    return " ".join(words)

# Load and Prepare Training Data

In [10]:
# Upper bound on the vocabulary size; a later cell clamps it to the number of
# distinct tokens actually found.
voc_sz = 2000
# Presumably the placeholder for out-of-vocabulary words; no other cell shown
# here references it.
unknown = "#"
# Marker tokens wrapped around every sentence before tokenisation.
sent_start = "^"
sent_end = "@"

### Load text files

In [11]:
# Read the raw sayings, one per line (the start/end tokens are added later).
# Text mode gives `str` lines on Python 2 and 3; binary mode would hand back
# bytes on Python 3, which then format as "b'...'" when the tokens are added.
print("Reading training file...")
with open('sayings.txt', 'r') as f:
    sents = f.readlines()

Reading training file...


In [12]:
# Skip lines of at most 2 characters (e.g. bare newlines), then lowercase
# each remaining line and strip its trailing whitespace.
sents = [s.lower().rstrip() for s in sents if len(s)>2]

In [13]:
# Wrap every sentence in the start/end marker tokens.
# NOTE: this rebinds `sents` from itself, so re-running the cell wraps the
# sentences a second time; re-run the loading cells first.
sents = ["%s %s %s" % (sent_start, s, sent_end) for s in sents]

In [15]:
# Single-argument print(...) gives the same output on Python 2 and 3.
print("Parsed %d sentences." % len(sents))
sents[3]

Parsed 567 sentences.


'^ a cat may look at a king @'

In [17]:
# Tokenize the sentences into words.
# sents_t is a list with one token list (list of word strings) per sentence.
sents_t = [nltk.word_tokenize(s) for s in sents]  # a list of sent_t, i.e. list of words

In [18]:
# Show one raw sentence and one tokenized sentence.
# Single-argument print(...) gives the same output on Python 2 and 3.
print("\nExample sentence: %s" % sents[0])
print("\nExample tokenized sentence: %s" % (sents_t[546],))


Example sentence: ^ a bad penny always turns up @

Example tokenized sentence: ['^', 'work', 'expands', 'so', 'as', 'to', 'fill', 'the', 'time', 'available', '@']


### Build Vocabulary

In [19]:
# Count the word frequencies over all tokens of all sentences
# (itertools.chain flattens the per-sentence token lists into one stream).
word_freq = nltk.FreqDist(itertools.chain(*sents_t)) # a FreqDist object

In [20]:
# list(...) is needed before slicing: on Python 3 items() is a view and is
# not subscriptable. The printed text is unchanged on Python 2.
print("Example word_freq itesm:  %s" % (list(word_freq.items())[:3],))
print("Found %d unique words tokens." % len(word_freq))

Example word_freq itesm:  [('stones', 3), ('all', 32), ('forget', 1)]
Found 1196 unique words tokens.


In [21]:
# Never ask for more vocabulary entries than there are distinct tokens.
voc_sz = min(voc_sz, len(word_freq))
print("voc_sz =  %s" % voc_sz)

voc_sz =  1196


In [23]:
# Keep the voc_sz most common words.
# vocab is a list of (word, count) tuples, most frequent first; a word's
# position in this list is its index.
vocab = word_freq.most_common(voc_sz)
print(vocab[:10])
print(vocab[-10:])

[('^', 567), ('@', 567), ('the', 239), ('a', 183), ('is', 142), ('to', 84), (',', 84), ('you', 72), ("'s", 69), ('and', 68)]
[('inspiration', 1), ('land', 1), ('calls', 1), ('wife', 1), ('age', 1), ('together', 1), ('squeaking', 1), ('greeks', 1), ('accidents', 1), ('expands', 1)]


In [24]:
# Map each word to its position in vocab; indices lie in [0, voc_sz).
word_to_idx = {word: position for position, (word, _count) in enumerate(vocab)}

In [26]:
# Indices of the sentence start/end markers.
# Single-argument print(...) gives the same output on Python 2 and 3.
print("%s %s" % (word_to_idx['^'], word_to_idx['@']))

0 1


### Training Data

In [27]:
# Create the training data.
# X: each entry is one training sample, i.e. a sentence as a list of word
#    indices (0 .. voc_sz-1) without its last token.
# Y: each entry is the matching target, i.e. the same sentence shifted by one
#    word (without its first token).
def _as_ragged_array(rows):
    """Pack lists of differing length into a 1-D object array of lists.

    np.asarray on ragged nested lists raises ValueError on recent NumPy
    (it used to build this object array implicitly), so fill it explicitly.
    """
    packed = np.empty(len(rows), dtype=object)
    for i, row in enumerate(rows):
        packed[i] = row
    return packed

X_train = _as_ragged_array([[word_to_idx[w] for w in sent_t[:-1]] for sent_t in sents_t])
Y_train = _as_ragged_array([[word_to_idx[w] for w in sent_t[1:]] for sent_t in sents_t])

In [28]:
# First three input sequences and their shifted-by-one targets.
print(X_train[:3])
print(Y_train[:3])

[[0, 3, 92, 136, 47, 511, 135] [0, 3, 436, 91, 19, 617]
 [0, 3, 182, 14, 2, 160, 4, 68, 69, 14, 2, 853]]
[[3, 92, 136, 47, 511, 135, 1] [3, 436, 91, 19, 617, 1]
 [3, 182, 14, 2, 160, 4, 68, 69, 14, 2, 853, 1]]


In [29]:
# One-hot encode the word indices: every index sequence of length T becomes
# a T x voc_sz array.
X_train_encode = [one_hot_encode(x, voc_sz) for x in X_train]
Y_train_encode = [one_hot_encode(y, voc_sz) for y in Y_train]

In [30]:
# First two encoded inputs and targets.
# Single-argument print(...) gives the same output on Python 2 and 3.
print(X_train_encode[:2])
print(Y_train_encode[:2])

[array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])]
[array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.]]), array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]

# Vanilla RNN Class

In [31]:
class RNN:
    """Parameter container for a single-hidden-layer recurrent network.

    Attributes
        dim:           size of each input/output vector.
        dim_h:         size of the hidden state.
        bptt_truncate: stored for use by the back-propagation routine.
        U: dim x dim_h    input -> hidden weights   (used as dot(x, U)).
        V: dim_h x dim    hidden -> output weights  (used as dot(s, V)).
        W: dim_h x dim_h  previous hidden -> next hidden weights.
    """

    def __init__(self, dim, dim_h=100, bptt_truncate=4):
        def scaled_normal(n_rows, n_cols):
            # Standard-normal draws scaled by sqrt(2 / (n_rows * n_cols)).
            return np.random.randn(n_rows, n_cols) * np.sqrt(2. / (n_rows * n_cols))

        self.dim = dim
        self.dim_h = dim_h
        self.bptt_truncate = bptt_truncate
        # Draw order U, V, W is kept so a seeded run gives the same weights.
        self.U = scaled_normal(dim, dim_h)
        self.V = scaled_normal(dim_h, dim)
        self.W = scaled_normal(dim_h, dim_h)

In [32]:
def forwardProp(self, x):
    """Run the network forward over one sequence.

    x: T x dim, an input sequence of vectors [x_1, ..., x_T], one per row.
    Returns [o, s]:
        o: T x dim, softmax output scores for every time step.
        s: (T+1) x dim_h, hidden states. Rows 0..T-1 are the states after
           each step; the extra last row stays zero and serves as the initial
           state, because s[t-1] at t == 0 is s[-1].
    """
    T = len(x)
    s = np.zeros((T + 1, self.dim_h))  # all hidden states are kept for bptt
    o = np.zeros((T, self.dim))
    for t in range(T):  # range works on Python 2 and 3 (xrange does not)
        s[t] = np.tanh(np.dot(x[t], self.U) + np.dot(s[t - 1], self.W))  # 1 * dim_h
        o[t] = softmax(np.dot(s[t], self.V), by='row')  # 1 * dim
    return [o, s]
RNN.forwardProp = forwardProp

In [33]:
def predict(self, x):
    """Most likely output index at every time step.

    x: T x dim, an input sequence of vectors, one per row.
    Returns labels of length T: for each step t, the index of the dimension
    with the largest value in the output softmax scores.
    """
    scores, _hidden = self.forwardProp(x)
    labels = np.argmax(scores, axis=1)
    return labels
RNN.predict = predict

In [34]:
def total_loss(self, X, Y):
    """Summed log loss over a batch of sequences.

    X: a batch of n sequences, each a T x dim array; T may vary by sequence.
    Y: the n matching target sequences, each a T x dim array.
    Returns the log loss summed over every vector of every sequence.
    """
    sum_loss = 0.
    for i in range(len(X)):  # range works on Python 2 and 3 (xrange does not)
        o, _s = self.forwardProp(X[i])      # o: T * dim
        sum_loss += sum_log_loss(Y[i], o)   # summed loss of this sequence
    return sum_loss
RNN.total_loss = total_loss

In [35]:
def loss(self, X, Y):
    """Average log loss per vector over a batch of sequences.

    X: a batch of n sequences, each a T x dim array; T may vary by sequence.
    Y: the n matching target sequences, each a T x dim array.
    Returns self.total_loss(X, Y) divided by the total number of target
    vectors in Y.
    Raises ValueError when Y contains no vectors (the average is undefined).
    """
    n_vec = sum(len(y) for y in Y)
    if n_vec == 0:
        raise ValueError("loss: Y contains no vectors to average over")
    # The summation itself lives in total_loss; only the normalisation is
    # specific to this method.
    return self.total_loss(X, Y) / n_vec
RNN.loss = loss

In [36]:
def bptt(self, x, y):
    """Gradients of the summed log loss for one sequence (truncated BPTT).

    x: a sequence of vectors, T x dim.
    y: a sequence of vectors, T x dim.
    Returns [dU, dV, dW], shaped like self.U, self.V and self.W.

    Only argmax(x) selects the row of dU that is updated and only argmax(y)
    is subtracted from the outputs, so this looks valid for one-hot x and y
    only.
    """
    T = len(y)
    o, s = self.forwardProp(x)  # o: T * dim,  s: (T+1) * dim_h
    dU = np.zeros_like(self.U)  # dU is actually dL/dU
    dV = np.zeros_like(self.V)
    dW = np.zeros_like(self.W)
    idx_x = np.argmax(x, axis=1)  # index of the active input per step
    idx_y = np.argmax(y, axis=1)  # index of the target per step
    # Output error: softmax scores minus the one-hot targets (o is reused).
    delta_o = o
    delta_o[np.arange(len(idx_y)), idx_y] -= 1.
    # Walk the outputs from the last time step to the first.
    for t in reversed(range(T)):
        out_err = delta_o[t]
        dV += np.outer(s[t].T, out_err)
        # Error arriving at the hidden state of step t.
        delta_t = np.dot(out_err, self.V.T) * (1 - (s[t] ** 2))
        # Push it back through time for at most self.bptt_truncate extra steps.
        oldest = max(0, t - self.bptt_truncate)
        for bptt_step in range(t, oldest - 1, -1):
            prev_state = s[bptt_step - 1]
            dW += np.outer(prev_state.T, delta_t)
            dU[idx_x[bptt_step], :] += delta_t
            # Error for the previous step.
            delta_t = np.dot(delta_t, self.W.T) * (1 - prev_state ** 2)
    return [dU, dV, dW]
RNN.bptt = bptt

In [37]:
import operator
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    """Compare the bptt gradients with central finite differences.

    x: a sequence of vectors, T x dim.
    y: a sequence of vectors, T x dim.
    h: perturbation added to / subtracted from each parameter entry.
    error_threshold: relative error at or above which the check fails.

    Prints progress, prints the details of the first failing entry and stops
    there. Returns None. Every perturbed entry is restored before moving on.
    """
    # Gradients from backpropagation; these are what is being verified.
    bptt_gradients = self.bptt(x, y)
    model_parameters = ['U', 'V', 'W']
    for pidx, pname in enumerate(model_parameters):
        parameter = getattr(self, pname)  # the live array, e.g. self.W
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Visit every entry of the parameter matrix: (0,0), (0,1), ...
        for ix in np.ndindex(*parameter.shape):
            original_value = parameter[ix]
            # Central difference: (f(p+h) - f(p-h)) / (2*h)
            parameter[ix] = original_value + h
            gradplus = self.total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = self.total_loss([x], [y])
            parameter[ix] = original_value  # restore
            estimated_gradient = (gradplus - gradminus) / (2 * h)
            backprop_gradient = bptt_gradients[pidx][ix]
            # Relative error |a - b| / (|a| + |b|). When both gradients are
            # exactly zero the ratio would be 0/0 (NaN plus a warning); the
            # two values agree, so the error is zero.
            denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if denominator == 0:
                relative_error = 0.
            else:
                relative_error = np.abs(backprop_gradient - estimated_gradient) / denominator
            if relative_error >= error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return
        print("Gradient check for parameter %s passed." % (pname))
RNN.gradient_check = gradient_check

In [38]:
# Optional gradient check. To avoid performing millions of expensive
# calculations it uses a much smaller vocabulary size than the real model.
# Flip the flag to True to run it.
RUN_GRADIENT_CHECK = False
if RUN_GRADIENT_CHECK:
    grad_check_vocab_size = 10
    np.random.seed(10)
    model = RNN(grad_check_vocab_size, 5, bptt_truncate=1000)
    model.gradient_check(one_hot_encode([0, 1, 2, 3], grad_check_vocab_size),
                         one_hot_encode([1, 2, 3, 4], grad_check_vocab_size))

In [39]:
def sgd_update(self, x, y, step):
    """Perform one SGD step on a single training sequence.

    x: a sequence of vectors, T x dim.
    y: a sequence of vectors, T x dim.
    step: learning rate.
    Updates self.U, self.V and self.W in place; returns None.
    """
    grad_U, grad_V, grad_W = self.bptt(x, y)
    # Move each parameter against its gradient, scaled by the learning rate.
    self.U -= step * grad_U
    self.V -= step * grad_V
    self.W -= step * grad_W
RNN.sgd_update = sgd_update

In [40]:
from datetime import datetime
def train(self, X_train_encode, Y_train_encode, step=0.005, nepoch=100):
    """Train with per-sample SGD.

    X_train_encode: the training inputs, a batch of encoded sequences.
    Y_train_encode: the matching training targets.
    step:   learning rate for SGD.
    nepoch: number of passes over the complete dataset.

    Every 5th epoch (starting with epoch 0) the average loss is computed and
    printed; if it did not decrease since the previous evaluation, the
    learning rate is halved. The halving is local to this call.
    Returns None.
    """
    losses = []        # (samples seen, loss) at every evaluation
    n_sample_seen = 0  # sequences processed so far
    n = len(Y_train_encode)
    for epoch in range(nepoch):
        if epoch % 5 == 0:
            loss = self.loss(X_train_encode, Y_train_encode)
            losses.append((n_sample_seen, loss))
            stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (stamp, n_sample_seen, epoch, loss))
            # Adjust the learning rate if the loss did not go down.
            if len(losses) >= 2 and losses[-1][1] >= losses[-2][1]:
                step *= 0.5
                print("Setting learning rate to %f" % step)

        for i in range(n):  # train sample by sample, i.e. batch = 1
            self.sgd_update(X_train_encode[i], Y_train_encode[i], step)
            n_sample_seen += 1
RNN.train = train

In [41]:
def generate(self, max_len=30):
    """Sample a sentence from the model.

    Starts from the sentence-start token and repeatedly runs forwardProp on
    the whole sequence so far, drawing the next word from the output scores
    of the last step with np.random.multinomial.
    max_len: maximum number of words to sample (default 30, the previous
             hard-coded cap); sampling stops earlier once the sentence-end
             token has been drawn.
    Returns the decoded sentence as a string, start token included.
    """
    x = one_hot_encode([word_to_idx[sent_start]], self.dim)  # encoded sentence so far
    end_idx = word_to_idx[sent_end]
    for _step in range(max_len):
        if one_hot_decode(x[-1]) == end_idx:
            break
        o, _s = self.forwardProp(x)  # o: T * dim
        # One draw from the multinomial, e.g. [0, 0, 1, 0] for 4 choices.
        e = np.random.multinomial(1, o[-1])
        x = np.append(x, [e], axis=0)
    return decode_to_sentence(x)
RNN.generate = generate

# Train RNN Model

In [44]:
# Seed before constructing the model so the random initial weights are
# reproducible.
np.random.seed(10)
print(voc_sz)
rnn = RNN(dim=voc_sz)

1196


### A test sgd step

In [45]:
%timeit rnn.sgd_update(X_train_encode[10], Y_train_encode[10], step=0.005)

100 loops, best of 3: 12.1 ms per loop


### Train (can repeat)

In [46]:
# Each run continues from the current weights of `rnn`. train() reports the
# loss on epochs divisible by 5, i.e. at epochs 0 and 5 for nepoch=10.
rnn.train(X_train_encode, Y_train_encode, step=0.005, nepoch=10)

2016-11-12 16:03:20: Loss after num_examples_seen=0 epoch=0: 7.080235
2016-11-12 16:03:58: Loss after num_examples_seen=2835 epoch=5: 5.418642


# Save Model

In [None]:
# Disabled by default; change the condition to run it.
if 0:
    # pickle is otherwise only imported inside the (disabled) load cell, so
    # import it here to let this cell run on its own.
    import pickle
    # The file name is time-stamped; the load cell reads 'rnn.pkl', so rename
    # or adjust the path there. The ':' characters in the stamp are not valid
    # in file names on every platform.
    timestamp = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    with open('rnn_%s.pkl' % timestamp, 'wb') as f:
        pickle.dump(rnn, f, pickle.HIGHEST_PROTOCOL)

# Load Model

In [50]:
# Disabled by default; change the condition to restore a previously saved
# model into `rnn`.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles you created yourself.
if 0:
    import pickle
    with open('rnn.pkl', 'rb') as f:
        rnn = pickle.load(f)

# Test the RNN Model

In [51]:
# Sample a few sentences from the trained model.
# Single-argument print(...) gives the same output on Python 2 and 3.
num_sents = 20
for i in range(num_sents):
    print(rnn.generate())

^ attack is the best form @
^ saturday 's child works hard for its living , @
^ the labourer is worthy of his hire @
^ never judge a book by its evil @
^ distance wise enchantment to the view @
^ for want of a nail the shoe was lost ; for want of a shoe the man was lost ; and worth want of up with a shoe in everything
^ a person is known by the company he keeps @
^ god vessels those who most themselves @
^ parsley half goes nine @
^ flattery will get you nowhere @
^ ask a silly question and you 'll get a sow answer @
^ no man can serve two masters @
^ every little helps two us to suck @
^ the course of sin is a season forever @
^ as you make your bed , so you must lie upon it @
^ give a dog a bad head and hang him @
^ it never easy but it the same to sings no pin in the going @
^ one hand washes the other @
^ all 's fair that love and suspicion @
^ you ca n't make with the hare and red sky @
