In [30]:
from __future__ import print_function, division

import os
import sys
import re
import pdb
import time

import numpy as np
import scipy as sp
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

# Report the interpreter, working environment and library versions used for this run,
# so that results can be tied to a concrete software stack.
for label, value in (
    ('sys_version:', sys.version.replace('\n', '')),
    ('virtual_env', os.environ.get('VIRTUAL_ENV', 'None')),
    ('pwd', os.getcwd()),
    ('np ', np.__version__),
    ('tf ', tf.__version__),
):
    print(label, value)

sys_version: 3.5.2 (default, Nov 17 2016, 17:05:23) [GCC 5.4.0 20160609]
virtual_env None
pwd /home/marko/Projects/faks/DU/DU3
np  1.11.1
tf  0.10.0


# dataset.py

In [162]:
from collections import Counter


class Dataset:
    """Character-level text dataset served as fixed-length minibatches.

    Typical use: ``preprocess(path)`` builds the vocabulary and encodes the
    text, ``create_minibatches()`` slices the encoded text into batches, and
    ``next_minibatch()`` then yields ``(new_epoch, x, y)`` triples where ``y``
    is ``x`` shifted one character to the right.
    """

    def __init__(self, batch_size, sequence_length):
        """Store the batch geometry.

        batch_size      - number of sequences per minibatch
        sequence_length - number of characters per sequence (inputs; targets
                          are the same window shifted by one)
        """
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.batch_index = 0

    def preprocess(self, input_file):
        """Read `input_file`, build the vocabulary and encode the text as ids.

        Sets `sorted_chars` (characters ordered by descending frequency),
        `vocab_size`, `char2id`, `id2char` and `x` (the encoded text).
        """
        with open(input_file, "r") as f:
            data = f.read()

        # Count occurrences of every distinct character. `return_counts` gives
        # frequencies (`return_index` would give first-occurrence positions).
        chars, counts = np.unique(list(data), return_counts=True)
        # Stable sort so that characters with equal counts keep a deterministic
        # (alphabetical) relative order.
        order = np.argsort(-counts, kind='mergesort')
        self.sorted_chars = chars[order]
        self.vocab_size = len(self.sorted_chars)

        # The most frequent character gets id 0. Keys/values are plain `str`
        # so that decode() does not leak numpy scalar types.
        self.char2id = {str(ch): idx for idx, ch in enumerate(self.sorted_chars)}
        self.id2char = {idx: ch for ch, idx in self.char2id.items()}
        self.x = np.array([self.char2id[ch] for ch in data])

    def encode(self, sequence):
        """Map an iterable of characters to a list of integer ids."""
        return [self.char2id[c] for c in sequence]

    def decode(self, encoded_sequence):
        """Map an iterable of integer ids back to a list of characters."""
        return [self.id2char[c] for c in encoded_sequence]

    def create_minibatches(self):
        """Slice the encoded text into `num_batches` minibatches.

        `batches` has shape (num_batches, batch_size, sequence_length + 1); the
        extra trailing element is what lets next_minibatch() derive the shifted
        targets. Row `s` of consecutive batches walks through one contiguous
        stretch of the text, so hidden state can be carried between batches.
        """
        data_len = len(self.x)
        chars_per_batch = self.batch_size * self.sequence_length
        # One character is reserved for the final target, hence data_len - 1.
        self.num_batches = (data_len - 1) // chars_per_batch

        self.batches = np.zeros(
            [self.num_batches, self.batch_size, self.sequence_length + 1], dtype=np.int32)
        chars_per_row = self.num_batches * self.sequence_length
        for b in range(self.num_batches):
            for s in range(self.batch_size):
                start = s * chars_per_row + b * self.sequence_length
                end = start + self.sequence_length + 1
                self.batches[b, s, :] = self.x[start:end]

        self.batch_index = 0

    def next_minibatch(self):
        """Return `(new_epoch, batch_x, batch_y)` for the next minibatch.

        `new_epoch` is True when the previous call returned the last batch and
        this call wrapped around to the first one again.
        """
        new_epoch = self.batch_index == self.num_batches
        if new_epoch:
            self.batch_index = 0

        batch = self.batches[self.batch_index, :, :]
        self.batch_index += 1

        batch_x = batch[:, :-1]
        batch_y = batch[:, 1:]
        return new_epoch, batch_x, batch_y

    def _as_one_hot(self, x, vocab):
        """One-hot encode a 1-D sequence of ids into an (len(x), vocab) float array."""
        n = len(x)
        Yoh = np.zeros((n, vocab))
        Yoh[np.arange(n), x] = 1
        return Yoh

    def one_hot(self, batch):
        """One-hot encode ids using the dataset vocabulary.

        Accepts any array-like: a scalar id gives shape (1, vocab_size), a 1-D
        sequence gives (n, vocab_size) and a 2-D batch gives
        (batch, n, vocab_size).
        """
        # asarray lets plain lists through; atleast_1d promotes a scalar id to
        # a length-1 sequence instead of failing on iteration.
        batch = np.atleast_1d(np.asarray(batch))
        if batch.ndim == 1:
            return self._as_one_hot(batch, self.vocab_size)
        return np.array([self._as_one_hot(s, self.vocab_size) for s in batch])

In [163]:
# test 1
dat = Dataset(3, 3)
dat.preprocess("test.txt")
txt = "hjdhasjdhjasdhja"
# decode() returns a list of characters, so the round trip has to be compared
# against list(txt); comparing the str itself to a list is never equal.
assert list(txt) == dat.decode(dat.encode(txt))


dat.create_minibatches()
for i in range(dat.num_batches):
    print("Batch:", i)
    new_epoch, batch_x, batch_y = dat.next_minibatch()
    print(new_epoch)
    print("X", list(map(dat.decode, batch_x)))
    print("Y", list(map(dat.decode, batch_y)))
    print("\n\n")

Batch: 0
False
X [['a', 'b', 'c'], ['g', 'h', 'i'], ['m', 'n', 'o']]
Y [['b', 'c', 'd'], ['h', 'i', 'j'], ['n', 'o', 'p']]



Batch: 1
False
X [['d', 'e', 'f'], ['j', 'k', 'l'], ['p', 'q', 'r']]
Y [['e', 'f', 'g'], ['k', 'l', 'm'], ['q', 'r', 's']]





In [None]:
class RNN:
    """Vanilla recurrent network with tanh hidden units, a softmax output layer
    and Adagrad parameter updates, implemented in numpy.

    Shapes used throughout: B = minibatch size, T = sequence_length,
    H = hidden_size, K = vocab_size.
    """

    def __init__(self, hidden_size, sequence_length, vocab_size, learning_rate):
        """Create randomly initialised parameters and zeroed Adagrad memories."""
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.learning_rate = learning_rate
        # Size of the most recent minibatch; filled in by step().
        self.batch_size = None

        # Zero-mean Gaussian initialisation scaled by the layer width.
        self.U = np.random.normal(size=[vocab_size, hidden_size], scale=1.0 / np.sqrt(hidden_size))  # input projection (K x H)
        self.W = np.random.normal(size=[hidden_size, hidden_size], scale=1.0 / np.sqrt(hidden_size))  # hidden-to-hidden projection (H x H)
        self.b = np.zeros([1, hidden_size])  # hidden bias (1 x H)

        self.V = np.random.normal(size=[hidden_size, vocab_size], scale=1.0 / np.sqrt(vocab_size))  # output projection (H x K)
        self.c = np.zeros([1, vocab_size])  # output bias (1 x K)

        # memory of past gradients - running sum of squares for Adagrad
        self.memory_U, self.memory_W, self.memory_V = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        self.memory_b, self.memory_c = np.zeros_like(self.b), np.zeros_like(self.c)

    def rnn_step_forward(self, x, h_prev, U, W, b):
        """A single forward time step with a hyperbolic tangent nonlinearity.

        x      - input data (B x K)
        h_prev - previous hidden state (B x H)
        U      - input projection matrix (K x H)
        W      - hidden-to-hidden projection matrix (H x H)
        b      - bias (1 x H)

        Returns the new hidden state (B x H) and the cache needed by
        rnn_step_backward.
        """
        h_current = np.tanh(np.dot(h_prev, W) + np.dot(x, U) + b)
        cache = (W, x, h_prev, h_current)
        return h_current, cache

    def rnn_forward(self, x, h0, U, W, b):
        """Full forward unroll over `sequence_length` time steps.

        x  - input data for the whole series (B x T x K)
        h0 - initial hidden state (B x H)
        U, W, b - parameters, as in rnn_step_forward

        Returns `(h, cache)`: `h` is a list of T + 1 hidden states (h0 first)
        and `cache` is a list of T per-step caches for the backward pass.
        """
        h, cache = [h0], []
        for t in range(self.sequence_length):
            current_h, current_cache = self.rnn_step_forward(x[:, t, :], h[-1], U, W, b)
            h.append(current_h)
            cache.append(current_cache)
        return h, cache

    def rnn_step_backward(self, grad_next, cache):
        """A single backward time step.

        grad_next - gradient of the loss w.r.t. this step's hidden state (the
                    sum of the contribution from this step's output and the
                    one flowing back from the next time step) (B x H)
        cache     - tuple produced by rnn_step_forward for the same step

        Returns `(dh_prev, dU, dW, db)`.
        """
        W, x, h_prev, h_curr = cache
        # derivative of tanh expressed through its output
        dz = grad_next * (1 - h_curr**2)

        dh_prev = np.dot(dz, W.T)
        dU = np.dot(x.T, dz)
        dW = np.dot(h_prev.T, dz)
        db = np.sum(dz, axis=0)

        return dh_prev, dU, dW, db

    def rnn_backward(self, dh, cache):
        """Full backward unroll (backpropagation through time).

        dh    - list of T gradients of the loss w.r.t. each hidden state coming
                from the output layer (each B x H)
        cache - list of T caches returned by rnn_forward

        Returns `(dU, dW, db)` summed over all time steps and clipped
        element-wise to [-5, 5].
        """
        dU, dW, db = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.b)

        upstream_grad = np.zeros_like(dh[0])
        # zip objects are not reversible on Python 3, so reverse each list.
        for dh_t, cache_t in zip(reversed(dh), reversed(cache)):
            upstream_grad, dU_t, dW_t, db_t = self.rnn_step_backward(dh_t + upstream_grad, cache_t)
            dU += dU_t
            dW += dW_t
            db += db_t

        clip = lambda g: np.clip(g, -5, 5)
        return clip(dU), clip(dW), clip(db)

    def output(self, h, V, c):
        """Return the output logits `h . V + c` (B x K) for hidden states h (B x H)."""
        return np.dot(h, V) + c

    def _log_softmax(self, o):
        """Row-wise log-softmax of logits `o`, shifted by the row maximum so
        that np.exp cannot overflow."""
        shifted = o - np.max(o, axis=1, keepdims=True)
        return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    def softmax(self, o):
        """Row-wise softmax of logits `o` (B x K), computed in a numerically
        stable way."""
        return np.exp(self._log_softmax(o))

    def output_loss_and_grads(self, h, V, c, y):
        """Cross-entropy loss of the outputs and gradients of the output layer.

        h - hidden states for every time step, excluding the initial state
            (B x T x H)
        V - output projection matrix (H x K)
        c - output bias (1 x K)
        y - one-hot targets (B x T x K)

        Returns `(loss, dh, dV, dc)`: `loss` is the cross-entropy summed over
        time steps and averaged over the batch; `dh` is a list of T gradients
        w.r.t. the hidden states (each B x H); `dV` and `dc` are summed over
        the batch and time steps.
        """
        batch_size = y.shape[0]
        loss, dh, dV, dc = 0.0, [], np.zeros_like(self.V), np.zeros_like(self.c)
        for t in range(self.sequence_length):
            yp = y[:, t, :]
            h_t = h[:, t, :]

            log_s = self._log_softmax(self.output(h_t, V, c))
            # gradient of softmax + cross-entropy w.r.t. the logits
            dO = np.exp(log_s) - yp

            dV += np.dot(h_t.T, dO)
            dc += np.sum(dO, axis=0)
            dh.append(np.dot(dO, V.T))
            loss += -np.sum(log_s * yp)

        loss /= batch_size
        return loss, dh, dV, dc

    def update(self, dU, dW, db, dV, dc, batch_size=None):
        """Adagrad update of all parameters.

        The gradients are expected to be summed over the minibatch; they are
        divided by `batch_size` before use. When `batch_size` is omitted the
        size of the last minibatch seen by step() is used.

        Raises ValueError if no batch size is available.
        """
        eps = 1e-7
        if batch_size is None:
            batch_size = self.batch_size
        if not batch_size:
            raise ValueError("batch_size is unknown: pass it explicitly or call step() first")

        # The arrays are updated in place, so iterating over the references is enough.
        for param, memory, grad in (
            (self.U, self.memory_U, dU),
            (self.W, self.memory_W, dW),
            (self.b, self.memory_b, db),
            (self.V, self.memory_V, dV),
            (self.c, self.memory_c, dc),
        ):
            grad = grad / batch_size
            memory += np.square(grad)
            param -= self.learning_rate * grad / np.sqrt(memory + eps)

    def step(self, h, x, y):
        """Run one training step on a minibatch.

        h - initial hidden state (B x H)
        x - one-hot inputs (B x T x K)
        y - one-hot targets (B x T x K)

        Returns `(loss, h_last)` where `h_last` (B x H) is the hidden state
        after the final time step, suitable as the initial state of the next
        minibatch.
        """
        self.batch_size = x.shape[0]
        states, cache = self.rnn_forward(x, h, self.U, self.W, self.b)
        # Drop the initial state and put time on axis 1: (B x T x H).
        hidden = np.stack(states[1:], axis=1)
        loss, dh, dV, dc = self.output_loss_and_grads(hidden, self.V, self.c, y)
        dU, dW, db = self.rnn_backward(dh, cache)
        self.update(dU, dW, db, dV, dc, batch_size=self.batch_size)
        return loss, hidden[:, -1, :]


In [None]:
def sample(rnn, seed, n_sample, dataset):
    """Generate text greedily from `rnn`, starting from the string `seed`.

    rnn      - trained model exposing `hidden_size`, `U`, `W`, `b`, `V`, `c`
               and `rnn_step_forward`
    seed     - non-empty string used to warm up the hidden state
    n_sample - total number of characters to return, seed included; if it is
               not larger than len(seed) the seed is returned unchanged
    dataset  - provides `encode`, `decode` and `one_hot`

    Returns a list of characters. At every step the most probable next
    character (argmax of the output logits) is chosen.

    Raises ValueError if `seed` is empty.
    """
    seed_ids = list(dataset.encode(seed))
    if not seed_ids:
        raise ValueError("seed must contain at least one character")

    def advance(char_id, hidden):
        # one_hot of a length-1 id array has shape (1, vocab), i.e. a batch of one
        char_oh = dataset.one_hot(np.array([char_id]))
        hidden, _ = rnn.rnn_step_forward(char_oh, hidden, rnn.U, rnn.W, rnn.b)
        return hidden

    h = np.zeros([1, rnn.hidden_size])
    # Warm up on all seed characters but the last; the last one is consumed by
    # the first generation step below, so it is not fed to the network twice.
    for char_id in seed_ids[:-1]:
        h = advance(char_id, h)

    sampled = list(seed_ids)
    while len(sampled) < n_sample:
        h = advance(sampled[-1], h)
        logits = np.dot(h, rnn.V) + rnn.c
        sampled.append(int(np.argmax(logits)))

    return dataset.decode(sampled)


def run_language_model(dataset, max_epochs, hidden_size=100, sequence_length=30, learning_rate=1e-1, sample_every=100,
                       sample_seed=None, n_sample=200):
    """Train a character-level RNN language model on `dataset`.

    dataset         - a preprocessed Dataset on which create_minibatches() has
                      already been called
    max_epochs      - number of passes over the data
    hidden_size     - size of the RNN hidden state
    sequence_length - number of time steps the RNN unrolls; should match
                      dataset.sequence_length
    learning_rate   - Adagrad learning rate
    sample_every    - period, in minibatches, between printed text samples
    sample_seed     - seed string for the printed samples; sampling is skipped
                      while this is None (the default)
    n_sample        - length of each printed sample, seed included

    Progress is printed every 100 minibatches. Returns the trained RNN.
    """
    vocab_size = len(dataset.sorted_chars)
    rnn = RNN(hidden_size, sequence_length, vocab_size, learning_rate)

    current_epoch = 0
    batch = 0
    cum_loss = 0.0

    h0 = np.zeros((dataset.batch_size, hidden_size))

    while current_epoch < max_epochs:
        e, x, y = dataset.next_minibatch()

        if e:
            current_epoch += 1
            if current_epoch >= max_epochs:
                # the requested number of epochs is done; do not start another
                break
            # The text restarts from the beginning, so the carried-over hidden
            # state is no longer meaningful; reset it with the same shape.
            h0 = np.zeros((dataset.batch_size, hidden_size))

        # One-hot transform the x and y batches
        x_oh, y_oh = dataset.one_hot(x), dataset.one_hot(y)

        loss, h0 = rnn.step(h0, x_oh, y_oh)
        cum_loss += loss

        if sample_seed is not None and batch % sample_every == 0:
            print("".join(sample(rnn, sample_seed, n_sample, dataset)))

        if batch % 100 == 0:
            current_batch = batch % dataset.num_batches
            print("epoch: %06d:\tbatch: %4d/%d\t" % (current_epoch, current_batch, dataset.num_batches), end="")
            # `loss` is already averaged over the minibatch, so only divide by
            # the number of minibatches processed so far (batch is 0-based).
            print("Average_loss: %.4f" % (cum_loss / (batch + 1)))

        batch += 1

    return rnn


In [None]:
# Train the character-level language model on the conversations corpus:
# minibatches of 30 sequences, each 15 characters long.
dataset = Dataset(batch_size=30, sequence_length=15)
dataset.preprocess("dataset/selected_conversations.txt")
dataset.create_minibatches()
run_language_model(dataset, max_epochs=100000, sequence_length=dataset.sequence_length)