In [1]:
import tensorflow as tf
import numpy as np
import random
import pandas as pd
# Seed every random number generator the notebook touches, not only Python's
# `random`: the batching (tf.random_crop / tf.random_shuffle) and the sampling
# (Multinomial) are TensorFlow random ops and ignore `random.seed`.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
# Graph-level seed; set before any random op is added to the default graph.
tf.set_random_seed(SEED)

# Interactive session: installs itself as the default session so that
# `.eval()` / `.run()` can be called without passing a session explicitly.
sess = tf.InteractiveSession()

In [2]:
# Read the 'content' column of the CSV and split every entry on CRLF,
# producing a Series whose elements are lists of lines.
contents = pd.read_csv('./all.csv')['content']
names = contents.apply(lambda text: text.split('\r\n'))

In [3]:
names = list(names)

In [4]:
# Flatten the per-entry line lists into one list of lines, then strip
# surrounding spaces, lower-case, and drop lines that end up empty.
#
# `sum(names, [])` was replaced by a nested comprehension: repeated list
# concatenation copies the accumulated list for every entry (quadratic),
# while this is a single linear pass.
#
# Guard against re-running the cell on already-flattened data: strings are
# iterable, so without the check the comprehension would silently explode
# every line into single characters.
if any(isinstance(entry, str) for entry in names):
    raise TypeError('names is already flattened; re-run the loading cells first')

names = [
    line.strip(' ').lower()
    for lines in names
    for line in lines
]
names = [line for line in names if len(line) > 0]

In [5]:
len(names)

13651

In [6]:
# names = np.loadtxt('/home/nikita/tmp/Practical_RL/week7_[recap]_rnn/names', dtype=str, delimiter=';').tolist()

In [7]:
random.shuffle(names)

In [8]:
names[:5]

['in the gloom. . .',
 'that in that paleness beautys white we see;',
 'and of the priest eftsoons gan to inquire,',
 'the lining purple silk, with gilt stars drawn;',
 'as i see it hardening;']

In [9]:
len(names)

13651

In [10]:
# Vocabulary: every distinct character found in `names` plus the padding
# symbol '_', in sorted order; `dic` maps each character to its position.
chars = sorted(set('_').union(*names))
dic = {char: index for index, char in enumerate(chars)}

In [11]:
class DistLSTM:
    """Autoregressive distribution over one-hot symbol sequences (stacked LSTM).

    The first symbol is scored against a learned logit vector (`init_dist`);
    every following symbol is scored against the logits produced by feeding
    the preceding symbols through a `MultiRNNCell` of LSTM layers and a dense
    projection to `dim` logits.

    Args:
        dim: number of symbols, i.e. the width of the one-hot encoding and of
            the output logits.
        name: variable scope under which all variables are created.
        reuse: `reuse` argument forwarded to every `tf.variable_scope` the
            class opens. Calling `logdens`/`sample` more than once re-enters
            the same scope and re-requests the same variables, so pass
            `tf.AUTO_REUSE` when building several graphs from one instance.
        num_units: hidden size of every LSTM layer (default 128).
        num_layers: number of stacked LSTM layers (default 3).
    """

    def __init__(self, dim, name='DistLSTM', reuse=None, num_units=128, num_layers=3):
        self.dim = dim
        self.name = name
        self.reuse = reuse
        self.num_units = num_units
        self.num_layers = num_layers

        with tf.variable_scope(self.name, reuse=reuse):
            cells = [tf.nn.rnn_cell.LSTMCell(num_units,
                                             name='cell_{}'.format(i),
                                             activation=tf.nn.tanh)
                     for i in range(num_layers)]
            self.cell = tf.nn.rnn_cell.MultiRNNCell(cells)
            # Projection from the LSTM output to `dim` logits; variables live
            # in the 'd1' sub-scope of whatever scope is active at call time.
            self.post_cell = lambda x: self.dense(x, dim, name='d1')
            # Learned logits for the very first symbol of a sequence.
            self.init_dist = tf.get_variable('init_dist', [1, dim], trainable=True,
                                             initializer=tf.random_normal_initializer(stddev=0.01, mean=0.2))

    def forward_string_lookup(self, strings, dic):
        """Encode a 1-D string tensor as a float32 one-hot tensor.

        Args:
            strings: 1-D tensor of strings.
            dic: dict mapping each character to its integer index; must
                contain the padding symbol '_'.

        Returns:
            float32 tensor of shape [len(strings), max_len, len(dic)], where
            shorter strings are right-padded with '_'.
        """
        # Take the one-hot depth from the dictionary that produces the indices
        # rather than from a notebook-level global.
        n_symbols = len(dic)

        indices = tf.py_func(lambda x: self._convert_to_ix(x, dic), [strings], tf.int64)
        # py_func loses the static shape; flatten, encode, then restore
        # [batch, time, n_symbols] so the last dimension is statically known.
        indices = tf.reshape(indices, (-1,))
        one_hot = tf.one_hot(indices, n_symbols, dtype=tf.float32)
        return tf.reshape(one_hot, [tf.shape(strings)[0], -1, n_symbols])

    def dense(self, inp, dim, name='dense'):
        """Affine layer `inp @ W + b` with variables created under scope `name`.

        Args:
            inp: 2-D tensor whose last dimension is statically known.
            dim: output width.
            name: variable scope holding 'W' and 'b'.

        Returns:
            Tensor of shape [rows of inp, dim].
        """
        with tf.variable_scope(name, initializer=tf.random_normal_initializer(stddev=0.01)):
            W = tf.get_variable('W', [inp.shape[-1], dim])
            b = tf.get_variable('b', [1, dim])
            out = tf.matmul(inp, W) + b
        return out

    def logdens(self, seq):
        """Per-position log-probability of each symbol in `seq`.

        Args:
            seq: tensor of shape [batch, time, dim] holding one-hot rows.

        Returns:
            Tensor of shape [batch, time]. Column 0 is the log-probability of
            the first symbol under `init_dist`; column t (t >= 1) is the
            log-probability of symbol t given symbols 0..t-1. Padding
            positions are scored like any other symbol (no masking).
        """
        with tf.variable_scope(self.name, reuse=self.reuse):
            batch_size, s_len = tf.shape(seq)[0], tf.shape(seq)[1]

            cell = self.cell

            # tf.scan iterates over the leading axis, so go time-major.
            s_t = tf.transpose(seq, [1, 0, 2])
            init_state = cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            # The scan accumulator is (cell output, cell state); the initial
            # output only fixes shape/dtype, hence zeros of the cell's output
            # width.
            init = (tf.zeros([batch_size, cell.output_size]), init_state)
            out, _ = tf.scan(lambda prev, x: cell(x, prev[1]), s_t, initializer=init)
            out = tf.transpose(out, [1, 0, 2])

            out_dim = out.shape

            # Apply the dense projection to all time steps at once.
            out = tf.reshape(out, [-1, out_dim[-1]])
            out = self.post_cell(out)
            out = tf.reshape(out, [batch_size, s_len, self.dim])

            # Output at step t predicts symbol t+1.
            preds = out[:, :-1]
            target = seq[:, 1:]

            init_logits = tf.tile(self.init_dist, [batch_size, 1])
            init_nll = tf.nn.softmax_cross_entropy_with_logits_v2(labels=seq[:, 0], logits=init_logits)
            init_nll = init_nll[:, tf.newaxis]

            nll = tf.nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=preds)
            nll = tf.concat([init_nll, nll], axis=1)
            return -nll

    def sample(self, length=10):
        """Draw one sequence from the model.

        Args:
            length: number of recurrent steps taken after the initial symbol
                (default 10).

        Returns:
            Tensor of shape [1, length + 1, dim] of one-hot rows: the symbol
            drawn from `init_dist` followed by `length` symbols, each drawn
            from the logits computed from the previous symbol and LSTM state.
        """
        with tf.variable_scope(self.name, reuse=self.reuse):
            init_sample = tf.distributions.Multinomial(total_count=1., logits=self.init_dist).sample()

            cell = self.cell

            init_state = cell.zero_state(batch_size=1, dtype=tf.float32)

            init = (init_sample, init_state)

            def step(prev):
                # prev = (previously sampled one-hot symbol, LSTM state)
                x = prev[0]
                state = prev[1]
                cell_step = cell(x, state)
                post_step = self.post_cell(cell_step[0])
                post_step = tf.distributions.Multinomial(total_count=1., logits=post_step).sample()
                return post_step, cell_step[1]

            # The scanned elements are only a step counter; their values are ignored.
            out, _ = tf.scan(lambda prev, _: step(prev), tf.range(length), initializer=init)
            out = tf.transpose(out, [1, 0, 2])
            out = tf.concat([init_sample[:, tf.newaxis, :], out], axis=1)
            return out

    def backward_string_lookup(self, encs, dic):
        """Decode a [batch, time, dim] one-hot tensor back into a string tensor.

        Args:
            encs: one-hot tensor; any non-zero entry counts as "set".
            dic: the character -> index dict used for encoding.

        Returns:
            1-D string tensor with one decoded string per batch row.
        """
        encs = tf.cast(encs, tf.bool)
        strs = tf.py_func(lambda x: self._convert_from_enc(x, dic), [encs], tf.string)
        return strs

    @staticmethod
    def _convert_to_ix(names, dic):
        """Map strings to a right-padded int64 index matrix.

        Args:
            names: sequence (list or numpy array) of `str` or UTF-8 `bytes`.
            dic: character -> index dict containing the padding symbol '_'.

        Returns:
            int64 array of shape [len(names), max_len]; shorter strings are
            padded with `dic['_']`. Empty input yields shape (0, 0).

        Raises:
            KeyError: if a character (or '_') is missing from `dic`.
        """
        # Decode per element: py_func delivers bytes, but direct callers may
        # pass str or numpy unicode scalars, which have no `.decode`.
        decoded = [x.decode('utf-8') if isinstance(x, bytes) else x for x in names]
        if not decoded:
            return np.zeros((0, 0), dtype=np.int64)

        max_len = max(len(x) for x in decoded)
        filler = dic['_']
        rows = [[dic[s] for s in name] + [filler] * (max_len - len(name))
                for name in decoded]
        # Explicit int64: py_func declares tf.int64 and numpy's default integer
        # width is platform dependent.
        return np.array(rows, dtype=np.int64)

    @staticmethod
    def _convert_from_enc(encs, dic):
        """Turn boolean one-hot encodings back into Python strings.

        Args:
            encs: boolean array of shape [batch, time, len(dic)].
            dic: character -> index dict.

        Returns:
            numpy array with one string per batch row; every set entry
            contributes the character of its column, in row-major order.
        """
        rev_dic = {i: x for x, i in dic.items()}
        ret = []
        for row in encs:
            # Column index of every set entry, scanned row by row.
            ixs = np.nonzero(row)[1]
            ret.append(''.join(rev_dic[ix] for ix in ixs))
        return np.array(ret)

In [12]:
len(names)

13651

In [13]:
# The first 13000 entries of `names` form the training set; everything after
# that index is held out for testing.
split_at = 13000
train, test = names[:split_at], names[split_at:]

In [14]:
# Keep both splits inside the graph as non-trainable string variables.
train_data = tf.Variable(np.array(train), name='train_data', trainable=False)
test_data = tf.Variable(np.array(test), name='test_data', trainable=False)

# Mini-batch: a random crop of 30 entries taken from train_data.
train_crop = tf.random_crop(train_data, size=[30])
# Running this op overwrites train_data with a shuffled copy of itself.
shuffle_op = tf.assign(train_data, tf.random_shuffle(train_data))
# test_data = tf.random_crop(train_data, [0])

In [15]:
dlstm = DistLSTM(len(chars), reuse=tf.AUTO_REUSE)

In [16]:
# Training branch: one-hot encode the random crop, then average the negative
# log-density over all positions of all sequences in the crop.
fwd_lk = dlstm.forward_string_lookup(train_crop, dic)
train_loss = tf.negative(tf.reduce_mean(dlstm.logdens(fwd_lk)))

# Test branch: the same computation over the full held-out set.
fwd_lk_test = dlstm.forward_string_lookup(test_data, dic)
test_loss = tf.negative(tf.reduce_mean(dlstm.logdens(fwd_lk_test)))

In [17]:
sample = dlstm.sample()
reconstr = dlstm.backward_string_lookup(sample, dic)

In [18]:
!rm -R /tmp/tfdbg

In [19]:
# Scalar summary for the training loss alone, so it can be logged frequently
# without also evaluating the test loss.
train_sum = tf.summary.scalar('train_loss', train_loss)
# The test-loss summary is only reached through the merged op below.
tf.summary.scalar('test_loss', test_loss)
# Must come after both scalar summaries are defined: merge_all only collects
# summaries that already exist in the graph at this point.
summary = tf.summary.merge_all()
# Shell command (IPython `!` syntax): create the log directory.
!mkdir /tmp/tfdbg
# Event files for this run are written under /tmp/tfdbg/0.
writer = tf.summary.FileWriter('/tmp/tfdbg/0')

In [20]:
opt = tf.train.AdamOptimizer(0.001).minimize(train_loss)

In [21]:
tf.global_variables_initializer().run()

In [22]:
reconstr.eval()

array([b'j0&bu,xxiwo'], dtype=object)

In [None]:
# Training: 1000 epochs of 200 optimizer steps, each step on a fresh random
# crop of the training data.
n_epochs = 1000
batches_per_epoch = 200
log_every = 20

for epoch in range(n_epochs):
    for batch in range(batches_per_epoch):
        # Global step index, so that TensorBoard can place every event on the
        # x-axis (summaries written without a step all share the same position).
        step = epoch * batches_per_epoch + batch
        opt.run()
        if batch % log_every == 0:
            writer.add_summary(train_sum.eval(), global_step=step)

    # End-of-epoch report: training loss on a new crop, merged train/test
    # summaries, and one sampled string.
    print(train_loss.eval())
    writer.add_summary(summary.eval(), global_step=step)
    # Run the assign op without fetching its result: `.eval()` would copy the
    # whole shuffled training array back into Python just to discard it.
    shuffle_op.op.run()
    print(reconstr.eval())
    # Push buffered events to disk so TensorBoard can follow the run live.
    writer.flush()

2.1157277
[b'/jf_lh_ o  ']
0.5734027
[b"'ts ho toy_"]
1.9748956
[b"c3ddy_' oh "]
2.006459
[b'rpa,aeef th']
1.5927196
[b'}fd hes not']
1.0691086
[b'weee efich ']
1.6127661
[b'wtibh sorke']
1.5318669
[b'fid logo?__']
1.4753472
[b'thagh in te']
1.3890625
[b'to beunt il']
1.5561079
[b'-dsere nle ']
0.7121343
[b'yow it the ']
1.037643
[b'6oud this t']
0.95067656
[b'-jnes mume ']
1.3855321
[b"sa'rling sh"]
1.2619938
[b'and the nom']
1.0225868
[b'but for kea']
1.3813763
[b"y've dlest "]
0.35452494
[b"and ull'r: "]
0.39608467
[b'igtart cont']
1.2318597
[b'sgeed of pr']
1.1482967
[b'that we whe']
1.3252058
[b'whithe a ra']
