In [1]:
from HM_LSTM_Cell import HM_LSTM_Cell
from Multi_HM_LSTM_Cell import Multi_HM_LSTM_Cell
from utils import HM_LSTM_StateTuple
import tensorflow as tf

import configparser
import os
import csv
import pandas as pd

from collections import Counter
import numpy as np

from IPython.display import clear_output
import uuid

In [2]:
# Root directory for checkpoints; each run later creates its own sub-directory
# (named by a UUID) underneath it.
checkpoint_dir = 'checkpoints'

In [3]:
####### HYPERPARAMS #######
# Batch geometry and layer sizes; all of these are passed to Model(...).
batch_size = 64
J = 100  # number of time steps per sequence (inputs are J ids, targets the next J ids)
emb_dim = 128
hidden_dim = 512
num_layers = 3
output_emb_dim = 512

num_epochs = 125

# Learning-rate schedule used by the training loop: every
# `learning_rate_epochs_per_annealing` epochs the rate is multiplied by
# `learning_rate_annealing_const`, but never drops below `minimum_learning_rate`.
init_learning_rate = 0.002
minimum_learning_rate = 0.0001
learning_rate_annealing_const = 0.8
learning_rate_epochs_per_annealing = 25
grad_clip = 1.0  # threshold passed to tf.clip_by_global_norm

# Slope value fed to the HM-LSTM cells through `slope_annealing_placeholder`;
# the training loop raises it once per epoch until it reaches `max_slope`.
init_slope_value = 1.0
max_slope = 5.0
slope_annealing_increase_per_epoch = 0.04  # slope = min(5.0, 1.0 + 0.04 * epoch)

In [4]:
def hosh(x):
    """XOR together the code points of every character in ``str(x)``.

    Returns 0 for an empty string. The result is order-independent and
    collision-prone; it is only used to match label values against a short
    list of precomputed integers.
    """
    digest = 0
    for code_point in map(ord, str(x)):
        digest ^= code_point
    return digest

In [5]:
####### DATASET PREP #######
# The dataset path and the label/text column names come from a local '.config'
# file (section [default]) so that they are not hard-coded in the notebook.
config = configparser.ConfigParser()
if not config.read('.config'):
    # ConfigParser.read() silently skips unreadable files; fail with a clear
    # message instead of a later, opaque KeyError: 'default'.
    raise FileNotFoundError("Could not read '.config' in the working directory")

fp = config['default']['fp']
label_colname = config['default']['label_colname']
text_colname = config['default']['text_colname']

provider_column_names = [label_colname, text_colname]
# Rows are kept only when hosh(label) equals one of these values.
provider_label_values_hashed = [8, 70]

# Tab-separated file; the first line is skipped and the two columns are named explicitly.
df = pd.read_csv(fp, sep='\t', names=provider_column_names, skiprows=1, quoting=csv.QUOTE_NONE, quotechar='|', escapechar='\\')
# .copy() detaches the filtered rows from the frame returned by read_csv, so that
# adding columns to `df` afterwards does not raise SettingWithCopyWarning.
df = df[df[label_colname].apply(lambda x: hosh(x) in provider_label_values_hashed)].copy()

In [6]:
def clean(x):
    """Return ``x`` with every character whose code point is 128 or above removed."""
    ascii_only = filter(lambda ch: ord(ch) < 128, x)
    return ''.join(ascii_only)

# Store the ASCII-only version of each text in a new 'clean_<text column>' column.
clean_text_colname = 'clean_' + text_colname
df[clean_text_colname] = df[text_colname].apply(clean)

In [7]:
def filter_cond(x):
    """Tell whether ``x`` is short enough to be kept: ``len(x) + 3 <= J``.

    ``J`` is the module-level sequence length.
    """
    return len(x) + 3 <= J

# Keep only the rows whose cleaned text satisfies filter_cond.
# The copy is taken *after* filtering: the result then owns its data (so adding
# columns to it later does not raise SettingWithCopyWarning) and rows that are
# about to be dropped are never copied.
df_filtered = df[df[clean_text_colname].apply(filter_cond)].copy(deep=True)

In [8]:
# Tally how often each character occurs across all retained, cleaned comments.
comments = df_filtered[clean_text_colname].tolist()
char_counter = Counter()

for comment_text in comments:
    # Counter.update iterates the string, counting one entry per character.
    char_counter.update(comment_text)

In [9]:
char_counter.most_common()

[(' ', 47490),
 ('e', 22918),
 ('t', 16863),
 ('a', 15616),
 ('o', 15240),
 ('i', 14160),
 ('s', 13762),
 ('n', 13133),
 ('r', 10814),
 ('h', 10299),
 ('l', 9264),
 ('u', 6812),
 ('d', 6590),
 ('m', 5254),
 ('.', 4949),
 ('c', 4777),
 ('y', 4611),
 ('p', 4214),
 ('g', 4094),
 ('w', 4011),
 ('f', 3261),
 ('b', 3033),
 ('k', 2387),
 ('T', 2040),
 ('v', 1936),
 ('!', 1740),
 ('I', 1692),
 ('S', 1421),
 ("'", 1345),
 ('A', 1332),
 ('L', 1116),
 ('N', 1104),
 ('O', 1087),
 (',', 1085),
 ('E', 991),
 ('H', 958),
 ('C', 848),
 ('?', 809),
 ('R', 801),
 ('D', 799),
 ('M', 746),
 ('P', 744),
 ('W', 721),
 ('B', 492),
 ('G', 457),
 ('j', 446),
 ('F', 406),
 ('Y', 392),
 ('U', 364),
 ('"', 328),
 ('K', 320),
 ('x', 267),
 ('0', 255),
 ('-', 214),
 ('z', 212),
 ('J', 203),
 ('2', 190),
 (':', 171),
 ('V', 135),
 ('1', 128),
 (')', 93),
 ('3', 76),
 ('4', 66),
 ('q', 64),
 ('#', 56),
 ('5', 56),
 ('(', 49),
 ('/', 45),
 ('7', 40),
 ('9', 37),
 ('6', 36),
 ('8', 36),
 ('=', 34),
 ('%', 31),
 ('X', 2

In [10]:
# Characters seen fewer than 40 times are left out of the vocabulary
# (token2int later maps anything missing to `unk`).
vocab = [ch for ch, count in char_counter.items() if count >= 40]

# Special tokens, each represented by a single control character.
go = '\x00'
end_of_text = '\x01'
pad = '\x02'
end_of_padded_comment = '\x03'
unk = '\x04'

vocab.extend([go, end_of_text, pad, end_of_padded_comment, unk])

# Order by code point so that ids are assigned deterministically.
vocab = sorted(vocab, key=ord)

In [11]:
# Lookup tables in both directions; a character's id is its position in `vocab`.
int2char = dict(enumerate(vocab))
char2int = {c: i for i, c in int2char.items()}

In [12]:
char2int

{'\x00': 0,
 '\x01': 1,
 '\x02': 2,
 '\x03': 3,
 '\x04': 4,
 ' ': 5,
 '!': 6,
 '"': 7,
 '#': 8,
 "'": 9,
 '(': 10,
 ')': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '7': 22,
 ':': 23,
 '?': 24,
 'A': 25,
 'B': 26,
 'C': 27,
 'D': 28,
 'E': 29,
 'F': 30,
 'G': 31,
 'H': 32,
 'I': 33,
 'J': 34,
 'K': 35,
 'L': 36,
 'M': 37,
 'N': 38,
 'O': 39,
 'P': 40,
 'R': 41,
 'S': 42,
 'T': 43,
 'U': 44,
 'V': 45,
 'W': 46,
 'Y': 47,
 'a': 48,
 'b': 49,
 'c': 50,
 'd': 51,
 'e': 52,
 'f': 53,
 'g': 54,
 'h': 55,
 'i': 56,
 'j': 57,
 'k': 58,
 'l': 59,
 'm': 60,
 'n': 61,
 'o': 62,
 'p': 63,
 'q': 64,
 'r': 65,
 's': 66,
 't': 67,
 'u': 68,
 'v': 69,
 'w': 70,
 'x': 71,
 'y': 72,
 'z': 73}

In [13]:
def standardize(x):
    """Frame ``x`` with the special tokens and pad it to ``J + 1`` characters.

    Layout: ``go`` + text + ``end_of_text`` + ``pad`` * max(0, J - len(x) - 2)
    + ``end_of_padded_comment``, then cut to the first ``J + 1`` characters.
    For texts longer than ``J - 2`` no padding is added and the cut removes
    trailing markers.
    """
    n_pad = max(0, J - len(x) - 2)
    framed = go + ''.join(x) + end_of_text + pad * n_pad + end_of_padded_comment
    return framed[:J + 1]

# Add a 'standardized_<text column>' column holding the framed and padded text.
standardized_text_colname = 'standardized_' + text_colname
df_filtered[standardized_text_colname] = df_filtered[clean_text_colname].apply(standardize)

In [14]:
def token2int(x):
    """Map each character of ``x`` to its id, using the ``unk`` id for unknown characters."""
    ids = []
    for ch in x:
        if ch not in char2int:
            ch = unk
        ids.append(char2int[ch])
    return ids

# Encode every standardized comment as a list of integer token ids.
comment_int_colname = 'comment_ints'
df_filtered[comment_int_colname] = df_filtered[standardized_text_colname].apply(token2int)

# Round the number of records down to a whole number of batches; the training
# loop iterates over dataset_size // batch_size batches per epoch.
nr_filtered_provider_records = df_filtered.shape[0]
dataset_size = batch_size * (nr_filtered_provider_records // batch_size)

In [15]:
# Vocabulary size: number of token ids (special tokens included). Model uses it
# as the number of embedding rows and as the width of the logits.
V = len(char2int)

In [16]:
print(V)

74


In [17]:
####### Dataset format - inspect #######
# Build one batch the same way the training loop does and print its first row.
comment_ints_batch = df_filtered[comment_int_colname].iloc[0:batch_size].tolist()
comment_ints_batch = np.array(comment_ints_batch, dtype=np.int32)
# Inputs are positions 0..J-1; targets are the same sequence shifted left by one.
xs_batch = comment_ints_batch[:,0:J]
ys_batch = comment_ints_batch[:,1:(J+1)]
print("Example:\n")
print("input chars:")
print(xs_batch[0,:])
print("\npredict chars:")
print(ys_batch[0,:])

Example:

input chars:
[ 0 32 56 59 59 48 65 72  5 27 59 56 61 67 62 61  9 66  5 52 60 48 56 59
 66  5 63 65 62 69 52  5 66 55 52  5 56 66  5 48  5 50 62 65 65 68 63 67
  5 54 59 62 49 48 59 56 66 67  5 65 56 66 62 67 67 62 13 52 48 67 52 65
  6  6  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2]

predict chars:
[32 56 59 59 48 65 72  5 27 59 56 61 67 62 61  9 66  5 52 60 48 56 59 66
  5 63 65 62 69 52  5 66 55 52  5 56 66  5 48  5 50 62 65 65 68 63 67  5
 54 59 62 49 48 59 56 66 67  5 65 56 66 62 67 67 62 13 52 48 67 52 65  6
  6  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  3]


In [18]:
###### HM-LSTM #######
class Model:
    """Graph-mode (TF1) character-level language model built on a multi-layer HM-LSTM.

    Constructing an instance resets the default TensorFlow graph and then builds the
    placeholders, the embedding, the recurrent stack, a gated output layer, the loss
    and the training op. Tensors that callers feed or fetch are stored as attributes:

    * feeds: ``xs``, ``ys``, ``learning_rate``, ``slope_annealing_placeholder``,
      ``state_placeholder`` (``is_train`` is created but not used inside this class)
    * fetches: ``zero_state``, ``outputs``, ``state``, ``logits``, ``probabilities``,
      ``loss``, ``train_op``

    For sampling, build the model with ``batch_size=1`` and ``J=1`` and carry the
    recurrent state between calls through ``state_placeholder``.
    """

    def __init__(self, batch_size, J, V, emb_dim, hidden_dim, output_emb_dim, num_layers, 
                 grad_clip):
        """Build the graph.

        Args:
            batch_size: number of sequences per batch (first dim of ``xs`` / ``ys``).
            J: number of time steps per sequence (second dim of ``xs`` / ``ys``).
            V: vocabulary size (embedding rows and logit width).
            emb_dim: width of the input character embedding.
            hidden_dim: ``num_units`` of every HM-LSTM layer.
            output_emb_dim: width of the gated output embedding that feeds the logits.
            num_layers: number of stacked HM-LSTM layers.
            grad_clip: threshold for ``tf.clip_by_global_norm``.
        """
        # NOTE: this discards whatever graph was the default before the call.
        tf.reset_default_graph()

        self.V = V
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.output_emb_dim = output_emb_dim
        self.num_layers = num_layers
        self.grad_clip = grad_clip

        self.learning_rate = tf.placeholder(dtype=tf.float32, shape=[])
        self.slope_annealing_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
        self.xs = tf.placeholder(tf.int32, [batch_size, J])
        self.ys = tf.placeholder(tf.int32, [batch_size, J])
        self.is_train = tf.placeholder(tf.bool)

        self.emb_initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
        self.emb_mat = tf.get_variable(
            name='emb_mat', dtype=tf.float32, shape=[self.V, self.emb_dim],
            initializer=self.emb_initializer
        )
        self.emb_xs = tf.nn.embedding_lookup(self.emb_mat, self.xs)

        self.multi_cell = Multi_HM_LSTM_Cell([
            HM_LSTM_Cell(
                num_units=self.hidden_dim,
                slope_annealing_placeholder=self.slope_annealing_placeholder,
                forget_bias=1.0)
            for _ in range(0, self.num_layers)
        ])

        self.zero_state = self.multi_cell.zero_state(batch_size, tf.float32)

        # During sampling the initial state changes from step to step, so the state
        # is supplied through a placeholder. State tuples cannot be fed directly;
        # the caller packs each layer's (c, h, z) into one row block of a single
        # array and it is unpacked into HM_LSTM_StateTuples below.
        #
        # During training the zero state is still used as the initial state: the
        # caller evaluates `zero_state` once with sess.run, packs it, and feeds it
        # here on every step.
        self.state_placeholder = tf.placeholder(
            tf.float32, [num_layers, batch_size, 2 * self.hidden_dim + 1])

        # Per layer: columns [0, hidden_dim) are c, [hidden_dim, 2*hidden_dim) are h,
        # and the remaining single column is z.
        layer_states = tf.unstack(self.state_placeholder, axis=0)
        tuple_of_layer_states = tuple([
            HM_LSTM_StateTuple(
                c=layer_states[idx][:, 0:self.hidden_dim],
                h=layer_states[idx][:, self.hidden_dim:(2*self.hidden_dim)],
                z=layer_states[idx][:, (2*self.hidden_dim):]
            )
            for idx in range(0, num_layers)
        ])

        self.outputs, self.state = tf.nn.dynamic_rnn(
            cell=self.multi_cell, inputs=self.emb_xs, initial_state=tuple_of_layer_states)

        # `outputs` is unpacked into one hidden-state tensor per layer. Iterating
        # instead of unpacking into exactly three names lets num_layers take any value.
        h_layers_per_char = [
            tf.reshape(h_layer, [-1, self.hidden_dim]) for h_layer in self.outputs
        ]

        h_out_per_char = tf.concat(h_layers_per_char, 1)

        # One scalar sigmoid gate per layer, each computed from the concatenation of
        # all layers' hidden states. The gates are created in layer order, before the
        # output projection, so the automatically assigned variable names match the
        # previous three-layer graph and existing checkpoints still load.
        gates = [
            tf.layers.dense(
                h_out_per_char, units=1, use_bias=False, activation=tf.nn.sigmoid)
            for _ in h_layers_per_char
        ]

        self.output_emb = tf.layers.dense(
            tf.concat(
                [gate * h_layer for gate, h_layer in zip(gates, h_layers_per_char)], 1),
            units=self.output_emb_dim,
            use_bias=False,
            activation=None
        )
        # Leaky ReLU with negative slope 0.10.
        self.output_emb = tf.maximum(0.10 * self.output_emb, self.output_emb)

        with tf.variable_scope('logit_layer'):
            self.logits_kernel = tf.get_variable(name='logits_kernel',
                shape=[self.output_emb_dim, self.V])

        # Rows of `logits` / `probabilities` correspond to flattened (batch, time) positions.
        self.logits = tf.matmul(self.output_emb, self.logits_kernel)
        self.probabilities = tf.nn.softmax(self.logits)

        # The logits are reshaped back to [batch_size, J, V] to line up with `ys`.
        self.loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.ys,
            logits=tf.reshape(self.logits, self.ys.get_shape().as_list() + [V])
        )

        # Adam with global-norm gradient clipping.
        tvars = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients, _ = zip(*optimizer.compute_gradients(self.loss, tvars))
        gradients, _ = tf.clip_by_global_norm(gradients, grad_clip)
        self.train_op = optimizer.apply_gradients(zip(gradients, tvars))

In [19]:
##### RUN EXPERIMENT #####

In [20]:
# Every run writes its checkpoints into a fresh, uniquely named sub-directory.
trial_uuid = str(uuid.uuid4())
trial_dir = os.path.join(checkpoint_dir, trial_uuid)
# makedirs also creates `checkpoint_dir` itself when it does not exist yet;
# os.mkdir would raise FileNotFoundError in that case.
os.makedirs(trial_dir)

In [21]:
print(trial_dir)

checkpoints/b6e2019e-bb61-4468-9d9f-a2b94529eadd


In [22]:
# Build the training graph. Model.__init__ resets the default TF graph first.
model = Model(batch_size, J, V, emb_dim, hidden_dim, output_emb_dim, num_layers, grad_clip)

# Open a session and initialize all variables.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

# Checkpoint writer; max_to_keep limits how many recent checkpoints are retained.
saver = tf.train.Saver(max_to_keep=5)

In [23]:
# Evaluate the cell's zero state to concrete values so that it can be packed
# into an array and fed through model.state_placeholder.
zero_state = sess.run(model.zero_state)

In [24]:
def convert_multilayer_statetuple_to_array(multilayer_statetuple):
    """Pack per-layer ``(c, h, z)`` states into a single array.

    For every layer state the ``c``, ``h`` and ``z`` arrays are concatenated along
    the last axis; the per-layer results are then stacked along a new leading axis.
    """
    per_layer_arrays = []
    for layer_state in multilayer_statetuple:
        packed = np.concatenate(
            (layer_state.c, layer_state.h, layer_state.z), axis=-1)
        per_layer_arrays.append(packed)
    return np.stack(per_layer_arrays)

In [25]:
# Packed zero state; the training loop feeds it as the initial state for every batch.
zero_state_array = convert_multilayer_statetuple_to_array(zero_state)

In [26]:
zero_state_array.shape

(3, 64, 1025)

In [None]:
# Training loop: for each epoch, update the learning-rate and slope schedules,
# reshuffle the data, run one optimizer step per batch, and save a checkpoint
# on every even-numbered epoch after the first.
learning_rate = init_learning_rate
slope_value = init_slope_value

for epoch in range(0, num_epochs):
    
    # Multiply the learning rate by the annealing constant every
    # `learning_rate_epochs_per_annealing` epochs, bounded below by the minimum.
    if epoch > 0 and (epoch % learning_rate_epochs_per_annealing == 0):
        learning_rate = max(
            learning_rate_annealing_const * learning_rate, minimum_learning_rate)
    
    # Raise the slope by a fixed increment per epoch until it reaches max_slope.
    if epoch > 0:
        slope_value = min(
            slope_value + slope_annealing_increase_per_epoch, max_slope)
        
    # Reshuffle all rows; no random_state is passed, so the order differs between runs.
    df_filtered = df_filtered.sample(frac=1).reset_index(drop=True)
    
    for i in range(0, dataset_size // batch_size):
        batch_idx = i * batch_size
        
        start_idx = batch_idx
        end_idx = start_idx + batch_size

        df_batch = df_filtered.iloc[start_idx:end_idx]
        comment_ints_batch = df_batch[comment_int_colname].tolist()
        comment_ints_batch = np.array(comment_ints_batch, dtype=np.int32)

        # Inputs are positions 0..J-1; targets are the next-character ids (shifted by one).
        xs_batch = comment_ints_batch[:,0:J]
        ys_batch = comment_ints_batch[:,1:(J+1)]

        # Every batch starts from the packed zero state.
        feed_dict = {model.xs: xs_batch, 
                     model.ys: ys_batch, 
                     model.slope_annealing_placeholder: slope_value, 
                     model.learning_rate: learning_rate, 
                     model.state_placeholder: zero_state_array
                    }

        _, loss_batch = sess.run(
            [model.train_op, model.loss], 
            feed_dict=feed_dict
        )
        
        print("Epoch {} / {}... step {}...\t training loss: {}".format(
                epoch, num_epochs, i, loss_batch))

    # On even epochs (except epoch 0): clear the accumulated cell output and save.
    if epoch > 0 and (epoch % 2 == 0):
        clear_output(wait=True)
        
        model_fp = os.path.join(
            trial_dir, "hm_lstm_L{}_e{}_h{}_o{}_epoch{}.ckpt".format(
                num_layers, emb_dim, hidden_dim, output_emb_dim, epoch))

        saver.save(sess, model_fp)

Epoch 17 / 125... step 0...	 training loss: 0.556446373462677
Epoch 17 / 125... step 1...	 training loss: 0.5849212408065796
Epoch 17 / 125... step 2...	 training loss: 0.5369722843170166
Epoch 17 / 125... step 3...	 training loss: 0.5312057137489319
Epoch 17 / 125... step 4...	 training loss: 0.5473288893699646
Epoch 17 / 125... step 5...	 training loss: 0.47616544365882874
Epoch 17 / 125... step 6...	 training loss: 0.582590639591217
Epoch 17 / 125... step 7...	 training loss: 0.5158740878105164
Epoch 17 / 125... step 8...	 training loss: 0.46368148922920227
Epoch 17 / 125... step 9...	 training loss: 0.49126532673835754
Epoch 17 / 125... step 10...	 training loss: 0.5119099020957947
Epoch 17 / 125... step 11...	 training loss: 0.5184198617935181
Epoch 17 / 125... step 12...	 training loss: 0.4928800165653229
Epoch 17 / 125... step 13...	 training loss: 0.5215612649917603
Epoch 17 / 125... step 14...	 training loss: 0.48292914032936096
Epoch 17 / 125... step 15...	 training loss: 0.5

In [38]:
# Manually save the current session state, using the same file-name pattern as
# the training loop.
# NOTE(review): the checkpoint is labelled "epoch0" regardless of how many epochs
# have actually been run (the alternative label is commented out) - confirm this
# is intended.
# e = num_epochs
e = 0

saver.save(
    sess, 
    os.path.join(trial_dir, "hm_lstm_L{}_e{}_h{}_o{}_epoch{}.ckpt".format(
        num_layers, emb_dim, hidden_dim, output_emb_dim, e))
)

'checkpoints/62f2c40d-b4aa-43d9-8eb6-83feccdb8b79/hm_lstm_L3_e128_h512_o512_epoch0.ckpt'

In [39]:
sess.close()

In [54]:
def pick_top_n(preds, nr_chars, top_n=4):
    """Sample an id from the ``top_n`` most probable entries of ``preds``.

    Args:
        preds: probabilities for one position; singleton dimensions are squeezed
            away, so shapes such as ``(1, nr_chars)`` are accepted. The array is
            not modified.
        nr_chars: number of candidate ids; must equal the number of probabilities.
        top_n: how many of the highest-probability ids remain eligible. All other
            entries are set to zero and the rest renormalized. If ``top_n`` is 0 or
            exceeds the number of entries, nothing is zeroed.

    Returns:
        The sampled id, drawn with ``np.random.choice``.
    """
    # np.squeeze returns a view of `preds`; copy it so that zeroing entries below
    # does not silently overwrite the caller's array.
    p = np.array(np.squeeze(preds))
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(nr_chars, 1, p=p)[0]
    return c

# Special tokens that terminate generation when sampled.
CONTROL_CHARS = [go, end_of_text, pad, end_of_padded_comment]

def sample(checkpoint, max_sample_len, hidden_dim, V, prime="The "):
    """Generate text one character at a time from a saved checkpoint.

    A new ``Model`` with batch size 1 and sequence length 1 is built (which resets
    the default TF graph) and ``checkpoint`` is restored into it. The go token
    followed by the cleaned ``prime`` text is fed through the network to warm up
    the recurrent state; after that, each sampled character is fed back in as
    the next input until a control character is sampled or the loop over
    ``range(len(go + prime), max_sample_len)`` is exhausted.

    Args:
        checkpoint: checkpoint path handed to ``tf.train.Saver.restore``.
        max_sample_len: upper bound of the generation loop (see above).
        hidden_dim: hidden size of the HM-LSTM layers; must match the checkpoint.
        V: vocabulary size; must match the checkpoint.
        prime: text used to warm up the state. Non-ASCII characters are removed
            by ``clean``; characters outside the vocabulary are fed as ``unk``.

    Returns:
        The go character, the primed text and the generated characters, joined
        into a single string.
    """
    # The sampling graph handles a single sequence, one character per step.
    sample_batch_size = 1

    model = Model(sample_batch_size, 1, V, emb_dim, hidden_dim, output_emb_dim,
                  num_layers, grad_clip)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        zero_state = sess.run(model.zero_state)
        current_state = convert_multilayer_statetuple_to_array(zero_state)

        prime = ''.join([go] + list(clean(prime)))
        samples = list(prime)

        # Reused input buffer holding the single token id fed at each step.
        x = np.zeros((sample_batch_size, 1), dtype=np.int32)

        def advance(token_id, state_array, top_n):
            """Feed one token; return (sampled next id, new packed state)."""
            x[0, 0] = token_id
            feed = {model.xs: x,
                    # The slope is held at a fixed value of 5.0 while sampling.
                    model.slope_annealing_placeholder: 5.0,
                    model.state_placeholder: state_array}
            preds_ch, new_state_tuple = sess.run(
                [model.probabilities, model.state],
                feed_dict=feed)
            next_id = pick_top_n(preds_ch, nr_chars=V, top_n=top_n)
            return next_id, convert_multilayer_statetuple_to_array(new_state_tuple)

        # Warm up on the prime text. Only the id sampled after the last prime
        # character is used; characters missing from the vocabulary fall back
        # to `unk` instead of raising KeyError.
        unk_id = char2int[unk]
        for c in prime:
            char_id, current_state = advance(
                char2int.get(c, unk_id), current_state, top_n=4)

        if int2char[char_id] in CONTROL_CHARS:
            return ''.join(samples)
        samples.append(int2char[char_id])

        # Free-running generation: feed each sampled character back in.
        for _ in range(len(prime), max_sample_len):
            char_id, current_state = advance(char_id, current_state, top_n=2)

            if int2char[char_id] in CONTROL_CHARS:
                break
            samples.append(int2char[char_id])

    return ''.join(samples)

In [55]:
checkpoint = tf.train.latest_checkpoint(trial_dir)

In [56]:
checkpoint

'checkpoints/62f2c40d-b4aa-43d9-8eb6-83feccdb8b79/hm_lstm_L3_e128_h512_o512_epoch0.ckpt'

In [57]:
samp = sample(checkpoint, 100, hidden_dim, V, prime="Ok dude... ")

INFO:tensorflow:Restoring parameters from checkpoints/62f2c40d-b4aa-43d9-8eb6-83feccdb8b79/hm_lstm_L3_e128_h512_o512_epoch0.ckpt


In [59]:
samp

'\x00ok dude... ihe the ae te aee te tee toe ae toe te teee atee toe te tee te to at aee ate te tee te ae'

In [60]:
samp = sample(checkpoint, 100, hidden_dim, V, prime="")

INFO:tensorflow:Restoring parameters from checkpoints/62f2c40d-b4aa-43d9-8eb6-83feccdb8b79/hm_lstm_L3_e128_h512_o512_epoch0.ckpt


In [61]:
samp

'\x00oee tee te ate to ate ate ate at te aee ae tee at tat at ateete at te te ae ate tee to at ate te toe'