In [1]:
from __future__ import print_function, division

import os
import sys
import re
import pdb
import time

import numpy as np
import scipy as sp
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

print('sys_version:', sys.version.replace('\n', ''))
print('virtual_env', os.environ.get('VIRTUAL_ENV', 'None'))
print('pwd', os.getcwd())
print('np ', np.__version__)
print('tf ', tf.__version__)


from IPython.core.debugger import Tracer
BREAK_POINT = lambda: Tracer()()

sys_version: 3.5.2 (default, Nov 17 2016, 17:05:23) [GCC 5.4.0 20160609]
virtual_env None
pwd /home/marko/Projects/faks/DU/DU3
np  1.11.1
tf  0.10.0


# dataset.py

In [2]:
from dataset import Dataset

In [3]:
# test 1
dat = Dataset(3, 3)
dat.preprocess("test.txt")
txt = "hjdhasjdhjasdhja"
assert txt != dat.decode(dat.encode(txt))


dat.create_minibatches()
for i in range(dat.num_batches):
    print("Batch:", i)
    f, s, t = dat.next_minibatch()
    print(f)
    print("X", list(map(dat.decode, s)))
    print("Y", list(map(dat.decode, t)))
    print("\n\n")

Batch: 0
False
X [['a', 'b', 'c'], ['g', 'h', 'i'], ['m', 'n', 'o']]
Y [['b', 'c', 'd'], ['h', 'i', 'j'], ['n', 'o', 'p']]



Batch: 1
False
X [['d', 'e', 'f'], ['j', 'k', 'l'], ['p', 'q', 'r']]
Y [['e', 'f', 'g'], ['k', 'l', 'm'], ['q', 'r', 's']]





In [43]:
from __future__ import print_function, division

import os
import sys
import re
import pdb
import time

import numpy as np
import scipy as sp
import tensorflow as tf

class RNN:
    
    def __init__(self, hidden_size, sequence_length, vocab_size, learning_rate):
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.learning_rate = learning_rate
        
        # Xavier
        self.U = np.random.normal(size=[vocab_size, hidden_size], scale=1.0 / np.sqrt(hidden_size))  # ... input projection
        self.W = np.random.normal(size=[hidden_size, hidden_size], scale=1.0 / np.sqrt(hidden_size))  # ... hidden-to-hidden projection
        self.b = np.zeros([1, hidden_size])
        
        
        self.V = np.random.normal(size=[hidden_size, vocab_size], scale=1.0 / np.sqrt(vocab_size))  # ... output projection
        self.c = np.zeros([1, vocab_size]) # ... output bias

        
        # memory of past gradients - rolling sum of squares for Adagrad
        self.memory_U, self.memory_W, self.memory_V = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        self.memory_b, self.memory_c = np.zeros_like(self.b), np.zeros_like(self.c)
    
    
    def rnn_step_forward(self, x, h_prev,  U, W, b):
        # A single time step forward of a recurrent neural network with a 
        # hyperbolic tangent nonlinearity.

        # x - input data (minibatch size x input dimension)
        # h_prev - previous hidden state (minibatch size x hidden size)
        # U - input projection matrix (input dimension x hidden size)
        # W - hidden to hidden projection matrix (hidden size x hidden size)
        # b - bias of shape (hidden size x 1).T
        
        h_current = np.tanh(np.dot(h_prev, W) + np.dot(x, U) + b)
        cache = (W, x, h_prev, h_current)
        return h_current, cache
    
    def rnn_forward(self, x, h0,  U, W, b):
        # Full unroll forward of the recurrent neural network with a 
        # hyperbolic tangent nonlinearity

        # x - input data for the whole time-series (minibatch size x sequence_length x input dimension)
        # h0 - initial hidden state (minibatch size x hidden size)
        # U - input projection matrix (input dimension x hidden size)
        # W - hidden to hidden projection matrix (hidden size x hidden size)
        # b - bias of shape (hidden size x 1).T
        
        h, cache = [h0], []
        for t in range(self.sequence_length):
            data = x[:, t, :] #t-th entry
            current_h, current_cache = self.rnn_step_forward(data, h[-1], U, W, b)
            h.append(current_h)
            cache.append(current_cache)


        # return the hidden states for the whole time series (T+1) and a tuple of values needed for the backward step
        h = np.array(h[1:]).transpose((1, 0, 2)) # skip initial state
        return h, cache

    
    
    def rnn_step_backward(self, grad_next, cache):
        # A single time step backward of a recurrent neural network with a 
        # hyperbolic tangent nonlinearity.

        # grad_next - upstream gradient of the loss with respect to the next hidden state and current output
        # cache - cached information from the forward pass
        
        W, x, h_prev, h_curr = cache
        dz = grad_next * (1 - h_curr**2)
        
        dh_prev = np.dot(dz, W.T)
        dU = np.dot(x.T, dz)
        dW = np.dot(h_prev.T, dz)
        db = np.sum(dz, axis=0)
        
        return dh_prev, dU, dW, db


    def rnn_backward(self, dh, cache):
        # Full unroll forward of the recurrent neural network with a 
        # hyperbolic tangent nonlinearity
        dU, dW, db = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.b)
        

        # compute and return gradients with respect to each parameter
        # for the whole time series.
        upstream_grad = np.zeros_like(dh[0])
        for dh_t, cache_t in reversed(list(zip(dh, cache))):
            upstream_grad, dU_t, dW_t, db_t = self.rnn_step_backward(dh_t + upstream_grad, cache_t)
            dU += dU_t
            dW += dW_t
            db += db_t; 

        clip = lambda x: np.clip(x, -5, 5)
        return clip(dU), clip(dW), clip(db)
    
    
    def output(self, h, V, c):
        # Calculate the output probabilities of the network
        return np.dot(h, V) + c
    
    def output_probas(self, h, V, c):
        return self.softmax(self.output(h, V, c))
    
    def softmax(self, o):
        exp = np.exp(o)
        s = exp / np.sum(exp, axis=1, keepdims=True)
        return s
    
    def output_loss_and_grads(self, h, V, c, y):
        # Calculate the loss of the network for each of the outputs

        # h - hidden states of the network for each timestep. 
        #     the dimensionality of h is (batch size x sequence length x hidden size (the initial state is irrelevant for the output)
        # V - the output projection matrix of dimension hidden size x vocabulary size
        # c - the output bias of dimension vocabulary size x 1
        # y - the true class distribution - a one-hot vector of dimension 
        #     vocabulary size x 1 - you need to do this conversion prior to
        #     passing the argument. A fast way to create a one-hot vector from
        #     an id could be something like the following code:

        #   y[timestep] = np.zeros((vocabulary_size, 1))
        #   y[timestep][batch_y[timestep]] = 1

        #     where y might be a dictionary.

        loss, dh, dV, dc = 0.0, [], np.zeros_like(self.V), np.zeros_like(self.c)
        batch_size = len(h)
        
        for t in range(self.sequence_length):
            yp = y[:, t, :]
            h_t = h[:, t, :]
            
            o = self.output(h_t, V, c)
            s = self.softmax(o)
            
            loss += -np.sum(np.log(s)*yp) / batch_size
            dO = (s - yp) / batch_size
            
            dV += np.dot(h_t.T, dO)
            dc += np.sum(dO, axis=0)
            
            dh_t = np.dot(dO, V.T)
            dh.append(dh_t)
            
        return loss, dh, dV, dc
    
    
    
    def update(self, batch_size, dU, dW, db, dV, dc):
        eps = 1e-7
        
        # perform the Adagrad update of parameters
        params = [self.U, self.W, self.b, self.V, self.c]
        ders = [dU, dW, db, dV, dc]
        mems = [self.memory_U, self.memory_W, self.memory_b, self.memory_V, self.memory_c]
        
        for x, dx, mem_x in zip(params, ders, mems):
            mem_x += np.square(dx)
            x -= self.learning_rate * dx / np.sqrt(mem_x + eps)

        
    def step(self, h, x, y):
        h, cache = self.rnn_forward(x, h, self.U, self.W, self.b)
        loss, dh, dV, dc = self.output_loss_and_grads(h, self.V, self.c, y)
        dU, dW, db = self.rnn_backward(dh, cache)
        self.update(len(x), dU, dW, db, dV, dc)
        return loss, h[:, -1, :]



In [44]:
from __future__ import print_function, division

import os
import sys
import re
import pdb
import time

import numpy as np
import scipy as sp
import tensorflow as tf
from dataset import Dataset
#from rnn import RNN


def sample(rnn, seed, n_sample, dataset):
    h0 = np.zeros([1, rnn.hidden_size])
    seed_oh = dataset.one_hot(dataset.encode(seed))
    
    sampled = []
    
    for c_oh in seed_oh:
        h0, _ = rnn.rnn_step_forward(c_oh.reshape([1, -1]), h0, rnn.U, rnn.W, rnn.b)
        sampled.append(np.argmax(c_oh))
        
    for i in range(len(seed), n_sample):
        prev_out = np.array([sampled[-1]])
        in_oh = dataset.one_hot(prev_out)
        h0, _ = rnn.rnn_step_forward(in_oh, h0, rnn.U, rnn.W, rnn.b)
        
        probas = rnn.output_probas(h0, rnn.V, rnn.c)
        out_char_oh = np.random.choice(range(dataset.vocab_size), p=probas.ravel()) 
        sampled.append(out_char_oh)
  
    return dataset.decode(sampled)

import pickle

def run_language_model(dataset, max_epochs, hidden_size=100, sequence_length=30, learning_rate=1e-1, sample_every=1000, dump_path='./model'):
    
    vocab_size = len(dataset.sorted_chars)
    rnn = RNN(hidden_size, sequence_length, vocab_size, learning_rate)

    current_epoch = 0 
    batch = 0

    h0 = np.zeros((dataset.batch_size, hidden_size))
    cum_loss = 0

    while current_epoch < max_epochs: 
        e, x, y = dataset.next_minibatch()
        
        if e: 
            current_epoch += 1
            h0 = np.zeros((dataset.batch_size, hidden_size))

        # One-hot transform the x and y batches
        x_oh, y_oh = dataset.one_hot(x), dataset.one_hot(y)


        loss, h0 = rnn.step(h0, x_oh, y_oh)
        cum_loss += loss
        
        if batch % sample_every == 0: 
            seed = "HAN:\nIs that good or bad?\n\n"
            n_sample = 300
            sampled = sample(rnn, seed, n_sample, dataset)
            print(''.join(sampled))
            print()
            with open(dump_path, "wb") as f:
                pickle.dump(rnn, f)
                print('> Dumped to:', dump_path)
            
           
        
        if batch % 1000 == 0:
            current_batch = batch % dataset.num_batches
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            print("epoch: %06d:\tbatch: %4d/%d\t" % (current_epoch, current_batch, dataset.num_batches), end="")
            print("Average_loss: %.4f; Last batch loss: %.4f" % (cum_loss/batch, loss))
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            
        batch += 1



In [None]:
dataset = Dataset(batch_size=5, sequence_length=30)
dataset.preprocess("dataset/selected_conversations.txt")
dataset.create_minibatches()
rnn = run_language_model(dataset, 100000, sequence_length=dataset.sequence_length)

HAN:
Is that good or bad?

svv.
mh9ANt:h0sst.oh,ks cgh ffe`h'h.ttthmHftdtohvftKohhaNt0whIvk t gIf:to,oAndhmohft,Ntts,et. monketGwsH1NRTs.juNhcmInd.gwsA1.tThh4tC0hTcio 2hgf4thT9Inn. hBssfN:hh9si0t0gssn.hch!uUY'o`fvtMchHHnnKwmIsLt?gwssueWhv6tnco gsnt8hw fttoghhHetogFsneN
yont VdMIHOcwhh se.ohKtst.ghUw

> Dumped to: ./model
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000000:	batch:    0/3947	Average_loss: inf; Last batch loss: 128.1473
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>




HAN:
Is that good or bad?

cYFTOIR.

JEZMAFDR5FFSINKFFRECaAIs tasl

OeANewte thanwaiIlPthams dhot'sklly Fit?

JEEYET:
Yok. TINan's at 'lin
Mhat gnh g iwsy Jant lin yon?
J wIton.on 6ing ton?

JEFuS IA yone sonoy youPs o svint.o ton that doalald .onlik n6stokcank win I yon ghiilll nang to y at. he.

D

> Dumped to: ./model
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000000:	batch: 1000/3947	Average_loss: 89.4941; Last batch loss: 76.7048
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HAN:
Is that good or bad?

EEON:
.

OMACKRIWhighat ctherbee lome, .

JOACB:
Deay.b3y t diyd!!

DoANKALDl?.

DECp'sddive ball aikdasher kibl

DEN:
You bescseml, bot got?

DEH:
Art avim. Cn'emes buy yaan,be.


JEAND:
Dor you wholahlers gore thasmy?

FAED4AKD:
You thaqd..y bustar mfalme wavo  ig blorip

> Dumped to: ./model
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000000:	batch: 2000/3947	Average_loss: 79.2179; Last batch loss: 65.4104
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HAN:
Is that good or bad?

EELTOOR:
Fon't gek'm she Taue. we cow tell.

SO

In [21]:
path = './best_model'
with open(path, 'rb') as f:
    rnn = pickle.load(f)
print("Loaded...")


seed = "HAN:\nYes?\n\n"
n_sample = 300
sampled = sample(rnn, seed, n_sample, dataset)
print(''.join(sampled))


Loaded...
HAN:
Yes?

DETE L ANINICK RIAN:
Every, you was know.

DUKE:
Ag I'm  that hahpu that he sce, this boge gorned. Just fell they hupp home now ever very Luke you? We loke fing and are you get to heperis. But guy being my thoued your to cen yiellr more and tell be thathart time work.

MCKOR WALDAN:
All t


In [20]:
with open("best_model", "wb") as f:
    pickle.dump(rnn, f)
    print('> Dumped')

> Dumped
