In [1]:
from HM_LSTM_Cell import HM_LSTM_Cell
from Multi_HM_LSTM_Cell import Multi_HM_LSTM_Cell
import tensorflow as tf
import pandas as pd
import csv
import numpy as np
from collections import Counter
from IPython.display import clear_output
import configparser

In [2]:
####### HYPERPARAMS #######
# Number of sequences per batch (first dimension of the model's input placeholders).
batch_size = 40
# Number of time steps per sequence (second dimension of the input placeholders).
J = 100
# Width of the character embedding vectors.
emb_dim = 128
# Number of units in each HM-LSTM layer.
hidden_dim = 350
# Number of stacked HM-LSTM layers.
num_layers = 3

# Step size handed to the Adam optimizer.
learning_rate = 0.002
# Gradients are clipped to this global norm before being applied.
grad_clip = 5.0

# Slope-annealing schedule used by the training loop: the slope starts at
# init_slope_value, grows by slope_annealing_increase_per_epoch each epoch,
# and is capped at max_slope.
init_slope_value = 1.0
max_slope = 5.0
slope_annealing_increase_per_epoch = 0.04

In [3]:
def hosh(x):
    """Fold the text form of ``x`` into one integer by XOR-ing its code points.

    Returns 0 when ``str(x)`` is empty.
    """
    digest = 0
    for ch in str(x):
        digest ^= ord(ch)
    return digest

In [4]:
####### DATASET PREP #######
# Dataset location and column names are read from a local '.config' file
# (section 'default') instead of being hard-coded in the notebook.
config = configparser.ConfigParser()
# ConfigParser.read silently skips files it cannot open and returns the list of
# files it did parse, so fail loudly here instead of with a KeyError below.
if not config.read('.config'):
    raise FileNotFoundError("could not read '.config' in the working directory")

fp = config['default']['fp']
label_colname = config['default']['label_colname']
text_colname = config['default']['text_colname']

provider_column_names = [label_colname, text_colname]
# Rows are selected by the hosh() digest of their label rather than by the raw
# label text; only labels whose digest is listed here are kept.
provider_label_values_hashed = [8, 70]

# Tab-separated file; the first row is skipped and the two columns are named explicitly.
df = pd.read_csv(
    fp,
    sep='\t',
    names=provider_column_names,
    skiprows=1,
    quoting=csv.QUOTE_NONE,
    quotechar='|',
    escapechar='\\',
)
# .copy() makes the filtered frame independent of the one it was sliced from, so
# the column assignments in later cells do not trigger SettingWithCopyWarning.
df = df[df[label_colname].apply(lambda x: hosh(x) in provider_label_values_hashed)].copy()

In [5]:
def clean(x):
    """Return ``x`` with every character whose code point is 128 or above removed."""
    return ''.join(filter(lambda ch: ord(ch) < 128, x))

# Name of the derived column that holds the ASCII-only version of each text.
clean_text_colname = 'clean_' + text_colname
df[clean_text_colname] = df[text_colname].apply(clean)

In [6]:
def filter_cond(x):
    """Tell whether ``x`` plus three extra tokens fits within ``J`` positions."""
    return len(x) + 3 <= J

# Keep only the comments that pass filter_cond. The deep copy is taken AFTER the
# boolean filter so that df_filtered is an independent frame: later cells add
# columns to it, which pandas flags (SettingWithCopyWarning) on a filtered slice.
df_filtered = df[df[clean_text_colname].apply(filter_cond)].copy(deep=True)

In [7]:
comments = df_filtered[clean_text_colname].tolist()

# Tally how often each character occurs across all kept comments.
char_counter = Counter()
for comment_text in comments:
    char_counter.update(comment_text)

In [8]:
# Display every character with its occurrence count, most frequent first.
char_counter.most_common()

[(' ', 47490),
 ('e', 22918),
 ('t', 16863),
 ('a', 15616),
 ('o', 15240),
 ('i', 14160),
 ('s', 13762),
 ('n', 13133),
 ('r', 10814),
 ('h', 10299),
 ('l', 9264),
 ('u', 6812),
 ('d', 6590),
 ('m', 5254),
 ('.', 4949),
 ('c', 4777),
 ('y', 4611),
 ('p', 4214),
 ('g', 4094),
 ('w', 4011),
 ('f', 3261),
 ('b', 3033),
 ('k', 2387),
 ('T', 2040),
 ('v', 1936),
 ('!', 1740),
 ('I', 1692),
 ('S', 1421),
 ("'", 1345),
 ('A', 1332),
 ('L', 1116),
 ('N', 1104),
 ('O', 1087),
 (',', 1085),
 ('E', 991),
 ('H', 958),
 ('C', 848),
 ('?', 809),
 ('R', 801),
 ('D', 799),
 ('M', 746),
 ('P', 744),
 ('W', 721),
 ('B', 492),
 ('G', 457),
 ('j', 446),
 ('F', 406),
 ('Y', 392),
 ('U', 364),
 ('"', 328),
 ('K', 320),
 ('x', 267),
 ('0', 255),
 ('-', 214),
 ('z', 212),
 ('J', 203),
 ('2', 190),
 (':', 171),
 ('V', 135),
 ('1', 128),
 (')', 93),
 ('3', 76),
 ('4', 66),
 ('q', 64),
 ('5', 56),
 ('#', 56),
 ('(', 49),
 ('/', 45),
 ('7', 40),
 ('9', 37),
 ('6', 36),
 ('8', 36),
 ('=', 34),
 ('%', 31),
 ('X', 2

In [9]:
# Reserved control symbols, encoded as the five lowest code points.
go = '\x00'
end_of_text = '\x01'
pad = '\x02'
end_of_padded_comment = '\x03'
unk = '\x04'

# Vocabulary = characters seen more than 20 times plus the control symbols,
# ordered by code point.
frequent_chars = [ch for ch, count in char_counter.items() if count > 20]
vocab = sorted(
    frequent_chars + [go, end_of_text, pad, end_of_padded_comment, unk],
    key=ord,
)

In [10]:
# Two-way lookup between vocabulary characters and their integer ids
# (an id is the character's position in the sorted vocabulary).
int2char = dict(enumerate(vocab))
char2int = {ch: idx for idx, ch in int2char.items()}

In [11]:
# Display the character -> id mapping.
char2int

{'\x00': 0,
 '\x01': 1,
 '\x02': 2,
 '\x03': 3,
 '\x04': 4,
 ' ': 5,
 '!': 6,
 '"': 7,
 '#': 8,
 '%': 9,
 "'": 10,
 '(': 11,
 ')': 12,
 '*': 13,
 ',': 14,
 '-': 15,
 '.': 16,
 '/': 17,
 '0': 18,
 '1': 19,
 '2': 20,
 '3': 21,
 '4': 22,
 '5': 23,
 '6': 24,
 '7': 25,
 '8': 26,
 '9': 27,
 ':': 28,
 '=': 29,
 '?': 30,
 'A': 31,
 'B': 32,
 'C': 33,
 'D': 34,
 'E': 35,
 'F': 36,
 'G': 37,
 'H': 38,
 'I': 39,
 'J': 40,
 'K': 41,
 'L': 42,
 'M': 43,
 'N': 44,
 'O': 45,
 'P': 46,
 'Q': 47,
 'R': 48,
 'S': 49,
 'T': 50,
 'U': 51,
 'V': 52,
 'W': 53,
 'X': 54,
 'Y': 55,
 'a': 56,
 'b': 57,
 'c': 58,
 'd': 59,
 'e': 60,
 'f': 61,
 'g': 62,
 'h': 63,
 'i': 64,
 'j': 65,
 'k': 66,
 'l': 67,
 'm': 68,
 'n': 69,
 'o': 70,
 'p': 71,
 'q': 72,
 'r': 73,
 's': 74,
 't': 75,
 'u': 76,
 'v': 77,
 'w': 78,
 'x': 79,
 'y': 80,
 'z': 81}

In [12]:
def standardize(x):
    """Frame ``x`` with control symbols and return its first ``J + 1`` characters.

    Layout: ``go``, the characters of ``x``, ``end_of_text``, then
    ``max(0, J - len(x) - 2)`` copies of ``pad``, then ``end_of_padded_comment``.
    The joined string is cut to at most ``J + 1`` characters.
    """
    n_pad = max(0, J - len(x) - 2)
    framed = [go, *x, end_of_text] + [pad] * n_pad + [end_of_padded_comment]
    return ''.join(framed)[:J + 1]

# Name of the derived column holding the framed, fixed-length form of each comment.
standardized_text_colname = 'standardized_' + text_colname
df_filtered[standardized_text_colname] = df_filtered[clean_text_colname].apply(standardize)

In [13]:
def token2int(x):
    """Map each character of ``x`` to its id, using the id of ``unk`` for unknown characters."""
    ids = []
    for ch in x:
        key = ch if ch in char2int else unk
        ids.append(char2int[key])
    return ids

# Column holding each standardized comment as a list of integer ids.
comment_int_colname = 'comment_ints'
df_filtered[comment_int_colname] = df_filtered[standardized_text_colname].apply(token2int)

# Record count rounded down to a whole number of batches; records beyond
# dataset_size are not covered by it.
nr_filtered_provider_records = df_filtered.shape[0]
dataset_size = batch_size * (nr_filtered_provider_records // batch_size)

In [14]:
# Vocabulary size: number of distinct entries in char2int, control symbols included.
V = len(char2int)

In [15]:
# Show the vocabulary size.
print(V)

82


In [16]:
####### Dataset format - inspect #######
# Take the first batch_size encoded comments and show how one row is split into
# model inputs (positions 0..J-1) and prediction targets (positions 1..J).
comment_ints_batch = np.array(
    df_filtered[comment_int_colname].iloc[:batch_size].tolist(),
    dtype=np.int32,
)
xs_batch = comment_ints_batch[:, :J]
ys_batch = comment_ints_batch[:, 1:J + 1]
print("Example:\n")
print("input chars:")
print(xs_batch[0])
print("\npredict chars:")
print(ys_batch[0])

Example:

input chars:
[ 0 38 64 67 67 56 73 80  5 33 67 64 69 75 70 69 10 74  5 60 68 56 64 67
 74  5 71 73 70 77 60  5 74 63 60  5 64 74  5 56  5 58 70 73 73 76 71 75
  5 62 67 70 57 56 67 64 74 75  5 73 64 74 70 75 75 70 15 60 56 75 60 73
  6  6  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2]

predict chars:
[38 64 67 67 56 73 80  5 33 67 64 69 75 70 69 10 74  5 60 68 56 64 67 74
  5 71 73 70 77 60  5 74 63 60  5 64 74  5 56  5 58 70 73 73 76 71 75  5
 62 67 70 57 56 67 64 74 75  5 73 64 74 70 75 75 70 15 60 56 75 60 73  6
  6  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  3]


In [17]:
###### HM-LSTM #######
class Model:
    """Character-level language model built on a stack of HM-LSTM cells.

    Constructing an instance resets the default TensorFlow graph and builds the
    whole training graph into it.

    Args:
        batch_size: number of sequences per batch (ignored when ``sampling``).
        J: number of time steps per sequence (ignored when ``sampling``).
        V: vocabulary size; also the width of the logits.
        emb_dim: width of the character embedding.
        hidden_dim: number of units per HM-LSTM layer.
        num_layers: number of stacked HM-LSTM layers.
        learning_rate: step size for the Adam optimizer.
        grad_clip: global-norm threshold used to clip gradients.
        sampling: when truthy, the placeholders are built for a single
            character (batch size 1, one time step).

    Attributes:
        xs, ys: int32 placeholders of shape ``[batch_size, J]`` (inputs / targets).
        slope_annealing_placeholder: scalar float32 placeholder handed to every cell.
        initial_state, state: recurrent state before / after the sequence.
        outputs: per-layer outputs returned by ``tf.nn.dynamic_rnn``.
        output_emb: gated, ReLU-projected combination of the layer outputs
            (built for inspection; it does not feed the logits).
        logits, probabilities: one row per (sequence, time step), ``V`` columns.
        loss: sparse softmax cross-entropy between ``logits`` and ``ys``.
        train_op: Adam update with gradients clipped by global norm.
    """

    def __init__(self, batch_size, J, V, emb_dim, hidden_dim, num_layers, 
                 learning_rate, grad_clip, sampling):
        
        tf.reset_default_graph()
        if sampling:
            batch_size, J = 1, 1
        
        self.V = V
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
            
        self.slope_annealing_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
        self.xs = tf.placeholder(tf.int32, [batch_size, J])
        self.ys = tf.placeholder(tf.int32, [batch_size, J])
        
        self.emb_mat = tf.get_variable(
            name='emb_mat', dtype=tf.float32, shape=[self.V, self.emb_dim])
        
        self.emb_xs = tf.nn.embedding_lookup(self.emb_mat, self.xs)

        self.multi_cell = Multi_HM_LSTM_Cell(
            [HM_LSTM_Cell(
                num_units=self.hidden_dim, 
                slope_annealing_placeholder=self.slope_annealing_placeholder,
                forget_bias=1.0)
             for _ in range(0, self.num_layers)]
        )
        
        self.initial_state = self.multi_cell.zero_state(batch_size, tf.float32)
        self.outputs, self.state = tf.nn.dynamic_rnn(
            cell=self.multi_cell, inputs=self.emb_xs, initial_state=self.initial_state)

        # One output tensor per layer; works for any num_layers instead of
        # unpacking exactly three.
        h_layers = list(self.outputs)

        print("each h_layer output should have shape [batch_size, timesteps, hidden dim]")
        for h_layer in h_layers:
            print(h_layer.get_shape().as_list())

        # Flatten every layer to one row per (sequence, time step) ...
        h_layers_per_char = [
            tf.reshape(h_layer, [-1, self.hidden_dim]) for h_layer in h_layers]

        # ... and put the layers side by side, so row r holds all layers'
        # features for the same character position.
        h_out_per_char = tf.concat(h_layers_per_char, 1)

        # One scalar sigmoid gate per layer, computed from all layers' outputs.
        gates = [
            tf.layers.dense(
                h_out_per_char, units=1, use_bias=False, activation=tf.nn.sigmoid)
            for _ in h_layers_per_char]

        self.output_emb = tf.layers.dense(
            tf.concat(
                [gate * h_layer for gate, h_layer in zip(gates, h_layers_per_char)], 1), 
            units=self.hidden_dim, 
            use_bias=False, 
            activation=tf.nn.relu
        )
        very_fancy_dim = self.num_layers * self.hidden_dim

        # Use the per-character concatenation as the logit input. Reshaping
        # self.outputs directly would first stack the per-layer tensors into
        # [num_layers, batch, time, hidden]; flattening that to rows of
        # num_layers * hidden values mixes different time steps and sequences
        # into one row, so the rows would not line up with self.ys.
        very_fancy_output = h_out_per_char
        
        with tf.variable_scope('logit_layer'):
            logits_kernel = tf.get_variable(name='logits_kernel', 
                shape=[very_fancy_dim, self.V])
    
        self.logits = tf.matmul(very_fancy_output, logits_kernel)
        self.probabilities = tf.nn.softmax(self.logits)

        self.loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.ys,
            logits=tf.reshape(self.logits, self.ys.get_shape().as_list() + [self.V])
        )

        # Adam with global-norm gradient clipping.
        tvars = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients, _ = zip(*optimizer.compute_gradients(self.loss, tvars))
        gradients, _ = tf.clip_by_global_norm(gradients, self.grad_clip)
        self.train_op = optimizer.apply_gradients(zip(gradients, tvars))

In [None]:
# Number of passes over the training data.
nr_epochs = 40

# Build the training graph; sampling=False keeps the batch_size x J input shape.
model = Model(batch_size, J, V, emb_dim, hidden_dim, num_layers, 
    learning_rate, grad_clip, sampling=False)

# Initialise all variables in a new session; `sess` is left open for the cells below.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[40, 100, 350]
[40, 100, 350]
[40, 100, 350]


In [None]:
n_batches = dataset_size // batch_size

for epoch in range(nr_epochs):
    # Slope grows linearly with the epoch number and is capped at max_slope.
    slope_value = min(max_slope, init_slope_value + slope_annealing_increase_per_epoch * epoch)
    # Reshuffle the rows at the start of every epoch.
    df_filtered = df_filtered.sample(frac=1).reset_index(drop=True)

    for i in range(n_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size

        comment_ints_batch = np.array(
            df_filtered.iloc[start_idx:end_idx][comment_int_colname].tolist(),
            dtype=np.int32,
        )

        # Targets are the inputs shifted one position to the left.
        xs_batch = comment_ints_batch[:, :J]
        ys_batch = comment_ints_batch[:, 1:J + 1]

        feed_dict = {
            model.xs: xs_batch,
            model.ys: ys_batch,
            model.slope_annealing_placeholder: slope_value,
        }

        _, loss_batch = sess.run([model.train_op, model.loss], feed_dict=feed_dict)

        print("Epoch {} / {}... step {}...\t training loss: {}".format(
            epoch, nr_epochs, i, loss_batch))

    # Wipe the accumulated log after every second epoch (except epoch 0).
    if epoch > 0 and epoch % 2 == 0:
        clear_output(wait=True)

Epoch 0 / 40... step 0...	 training loss: 4.407299041748047
Epoch 0 / 40... step 1...	 training loss: 4.388210296630859
Epoch 0 / 40... step 2...	 training loss: 4.308659553527832
Epoch 0 / 40... step 3...	 training loss: 3.6272614002227783
Epoch 0 / 40... step 4...	 training loss: 2.741917371749878
Epoch 0 / 40... step 5...	 training loss: 2.9585046768188477
Epoch 0 / 40... step 6...	 training loss: 2.7560641765594482
Epoch 0 / 40... step 7...	 training loss: 2.686645746231079
Epoch 0 / 40... step 8...	 training loss: 2.7063076496124268
Epoch 0 / 40... step 9...	 training loss: 2.605485200881958
Epoch 0 / 40... step 10...	 training loss: 2.3965840339660645
Epoch 0 / 40... step 11...	 training loss: 2.3593287467956543
Epoch 0 / 40... step 12...	 training loss: 2.2483716011047363
Epoch 0 / 40... step 13...	 training loss: 2.3453991413116455
Epoch 0 / 40... step 14...	 training loss: 2.552450656890869
Epoch 0 / 40... step 15...	 training loss: 2.607212543487549
Epoch 0 / 40... step 16...

Epoch 1 / 40... step 0...	 training loss: 2.342819929122925
Epoch 1 / 40... step 1...	 training loss: 2.4158496856689453
Epoch 1 / 40... step 2...	 training loss: 2.5047764778137207
Epoch 1 / 40... step 3...	 training loss: 2.6267623901367188
Epoch 1 / 40... step 4...	 training loss: 2.39003586769104
Epoch 1 / 40... step 5...	 training loss: 2.4722237586975098
Epoch 1 / 40... step 6...	 training loss: 2.4351279735565186
Epoch 1 / 40... step 7...	 training loss: 2.3943026065826416
Epoch 1 / 40... step 8...	 training loss: 2.515941858291626
Epoch 1 / 40... step 9...	 training loss: 2.3855631351470947
Epoch 1 / 40... step 10...	 training loss: 2.3944480419158936
Epoch 1 / 40... step 11...	 training loss: 2.3388748168945312
Epoch 1 / 40... step 12...	 training loss: 2.034548759460449
Epoch 1 / 40... step 13...	 training loss: 2.328373670578003
Epoch 1 / 40... step 14...	 training loss: 2.650632381439209
Epoch 1 / 40... step 15...	 training loss: 2.4293878078460693
Epoch 1 / 40... step 16..

Epoch 2 / 40... step 0...	 training loss: 2.172239303588867


In [None]:
import os
# Directory used for model checkpoint files.
checkpoint_dir = 'checkpoints'

In [None]:
import os

# Saver.save does not create a missing parent directory, so make sure the
# checkpoint directory exists before writing into it.
os.makedirs(checkpoint_dir, exist_ok=True)

# Keep at most five checkpoints; the file name records the layer count,
# hidden size and number of epochs.
saver = tf.train.Saver(max_to_keep=5)
saver.save(sess, os.path.join(checkpoint_dir, "hm_lstm_L{}_h{}_e{}.ckpt".format(
                    num_layers, hidden_dim, nr_epochs)))

In [None]:
# Release the training session now that the checkpoint cell has run.
sess.close()

In [15]:
def pick_top_n(preds, nr_chars, top_n=4):
    """Draw one index at random from the ``top_n`` most probable entries.

    Args:
        preds: probabilities for ``nr_chars`` classes; singleton dimensions are
            squeezed away. The array is not modified.
        nr_chars: number of classes to draw from (must equal the number of
            entries in ``preds`` after squeezing).
        top_n: how many of the highest-probability entries stay eligible.

    Returns:
        The drawn index. All entries outside the top ``top_n`` get probability
        zero and the rest are renormalized before drawing.
    """
    # Work on a private float64 copy: np.squeeze returns a view, so zeroing
    # entries in place would otherwise overwrite the caller's array.
    p = np.array(np.squeeze(preds), dtype=np.float64)
    # argsort is ascending, so everything except the last top_n indices is dropped.
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(nr_chars, 1, p=p)[0]
    return c

# Symbols that end generation when the model emits them.
CONTROL_CHARS = [go, end_of_text, pad, end_of_padded_comment]

def sample(checkpoint, J, hidden_dim, V, prime="The "):
    """Generate text from a saved model, starting from ``prime``.

    A full-size graph is rebuilt (batch size and the remaining hyperparameters
    come from the notebook globals) and restored from ``checkpoint``. Every
    prediction re-runs the whole sequence buffer from the zero state and reads
    the distribution at the current position of the first sequence.

    Args:
        checkpoint: path of the checkpoint to restore.
        J: number of time steps of the model; also bounds the generated length.
        hidden_dim: units per HM-LSTM layer; must match the checkpoint.
        V: vocabulary size.
        prime: text the generation starts from. Characters that are not in the
            vocabulary are fed as ``unk``, as in ``token2int``.

    Returns:
        ``prime`` followed by the generated characters. Generation stops at the
        first control symbol or when ``J`` positions have been filled.

    Raises:
        ValueError: if ``prime`` plus the leading ``go`` symbol exceeds ``J``.
    """
    samples = list(prime)
    primed = [go] + list(prime)
    if len(primed) > J:
        raise ValueError(
            "prime has {} characters but at most {} fit in a sequence of length {}".format(
                len(prime), J - 1, J))

    model = Model(batch_size, J, V, emb_dim, hidden_dim, num_layers, 
                  learning_rate, grad_clip, sampling=False)
    
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        zero_state = sess.run(model.initial_state)

        # int32 to match the model's input placeholder; every row of the batch
        # carries the same sequence and only row 0 is read back.
        x = np.zeros((batch_size, J), dtype=np.int32)

        def next_char_id(position, top_n):
            """Run the model on ``x`` and draw the character that follows ``position``."""
            feed = {model.xs: x,
                    model.slope_annealing_placeholder: 5.0,
                    model.initial_state: zero_state}
            preds = sess.run(model.probabilities, feed_dict=feed)
            preds = np.reshape(preds, [batch_size, J, V])[0, position, :]
            return pick_top_n(preds, nr_chars=V, top_n=top_n)

        # Write the whole prime into the buffer first; only the prediction
        # after its last character is needed, so one forward pass suffices.
        for i, c in enumerate(primed):
            x[:, i] = char2int.get(c, char2int[unk])

        char_id = next_char_id(len(primed) - 1, top_n=4)
        if int2char[char_id] in CONTROL_CHARS:
            return ''.join(samples)
        samples.append(int2char[char_id])

        # Feed each drawn character back in at the next position.
        for i in range(len(primed), J):
            x[:, i] = char_id
            char_id = next_char_id(i, top_n=2)
            if int2char[char_id] in CONTROL_CHARS:
                break
            samples.append(int2char[char_id])
    
    return ''.join(samples)

In [16]:
# Look up the most recent checkpoint stored under checkpoint_dir.
checkpoint = tf.train.latest_checkpoint(checkpoint_dir)

In [17]:
# Display the checkpoint path that will be restored.
checkpoint

'checkpoints/hm_lstm_L3_h300_e10.ckpt'

In [18]:
# Generate one comment from an empty prime with a sequence length of 100.
# NOTE(review): the recorded output reports a hidden size of 300 while the
# hyperparameter cell sets hidden_dim = 350 — confirm hidden_dim matches the
# checkpoint before re-running.
samp = sample(checkpoint, 100, hidden_dim, V, prime="")

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[40, 100, 300]
[40, 100, 300]
[40, 100, 300]
INFO:tensorflow:Restoring parameters from checkpoints/hm_lstm_L3_h300_e10.ckpt


In [19]:
# Display the generated text one character per entry.
list(samp)

['x',
 'o',
 '!',
 '.',
 '.',
 '.',
 '!',
 '!',
 '!',
 '.',
 '.',
 '.',
 '!',
 '!',
 '.',
 '!',
 '.',
 '!',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '!',
 '.',
 '!',
 '!',
 '.']

In [48]:
# Draw another sample with the same arguments; the draw is random, so the text differs between runs.
samp = sample(checkpoint, 100, hidden_dim, V, prime="")

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[40, 100, 300]
[40, 100, 300]
[40, 100, 300]
INFO:tensorflow:Restoring parameters from checkpoints/hm_lstm_L3_h300_e10.ckpt


In [49]:
# Display the generated text.
samp

'Thee   e e e      e e ee  ee '