In [1]:
from HM_LSTM_Cell import HM_LSTM_Cell
from Multi_HM_LSTM_Cell import Multi_HM_LSTM_Cell
import tensorflow as tf
import pandas as pd
import csv
import numpy as np
from collections import Counter
from IPython.display import clear_output
import configparser

In [2]:
####### HYPERPARAMS #######
# Batch / sequence geometry and layer sizes handed to Model(...).
# batch_size: rows per training batch (batch dimension of the xs/ys placeholders).
batch_size = 64
# J: number of timesteps per sequence; a comment is kept only if len(text) + 3 <= J.
J = 100
# emb_dim: width of the character embedding matrix.
emb_dim = 128
# hidden_dim: num_units of every HM_LSTM_Cell.
# NOTE(review): the recorded outputs further down show hidden size 350
# (checkpoint name "..._h350_..." and printed shapes), so this value appears to
# have been edited after those cells were executed -- confirm before re-running.
hidden_dim = 512
# num_layers: number of stacked HM_LSTM_Cell layers.
num_layers = 3
# output_emb_dim: width of the dense projection that feeds the logit layer.
output_emb_dim = 512

# Number of full passes over the (shuffled) dataset.
num_epochs = 125

# Step-decay schedule: every `learning_rate_epochs_per_annealing` epochs the
# learning rate is multiplied by `learning_rate_annealing_const`, floored at
# `minimum_learning_rate`.
init_learning_rate = 0.002
minimum_learning_rate = 0.0001
learning_rate_annealing_const = 0.8
learning_rate_epochs_per_annealing = 25
# Global-norm threshold passed to tf.clip_by_global_norm.
grad_clip = 1.0

# Slope value fed to the cells' slope_annealing_placeholder; it grows linearly
# per epoch and is capped at max_slope.
init_slope_value = 1.0
max_slope = 5.0
slope_annealing_increase_per_epoch = 0.04  # slope = min(5.0, 1.0 + 0.04 * epoch)

In [3]:
def hosh(x):
    """Fold ``str(x)`` into a single integer by XOR-ing its character code points.

    Returns 0 when the string form of ``x`` is empty. Because XOR is
    commutative, the result does not depend on character order.
    """
    digest = 0
    for code_point in map(ord, str(x)):
        digest ^= code_point
    return digest

In [4]:
####### DATASET PREP #######
# Dataset location and column names are read from the [default] section of a
# local `.config` file rather than being hard-coded in the notebook.
config = configparser.ConfigParser()
config.read('.config')

fp = config['default']['fp']
label_colname = config['default']['label_colname']
text_colname = config['default']['text_colname']

provider_column_names = [label_colname, text_colname]
# Rows are kept only when hosh(label) is one of these values.
# NOTE(review): presumably this hides the literal label strings -- confirm.
provider_label_values_hashed = [8, 70]

# Tab-separated file; the first row is skipped and the two columns are named
# explicitly. QUOTE_NONE disables quote handling; backslash is the escape character.
df = pd.read_csv(fp, sep='\t', names=provider_column_names, skiprows=1, quoting=csv.QUOTE_NONE, quotechar='|', escapechar='\\')
df = df[df[label_colname].apply(lambda x: hosh(x) in provider_label_values_hashed)]

In [5]:
def clean(x):
    """Return ``x`` lower-cased, with every non-ASCII character (code point >= 128) removed."""
    ascii_only = (ch for ch in x if ord(ch) < 128)
    return ''.join(ch.lower() for ch in ascii_only)

# Store the cleaned text in a new column, leaving the raw text column untouched.
clean_text_colname = 'clean_' + text_colname
df[clean_text_colname] = df[text_colname].apply(clean)

In [6]:
def filter_cond(x):
    """Return True when ``x`` is short enough to fit in a length-J sequence.

    Three positions are reserved on top of the text itself.
    NOTE(review): presumably for the go / end_of_text / end_of_padded_comment
    tokens that standardize() adds -- confirm.
    """
    return len(x) + 3 <= J

# Work on a deep copy so `df` keeps every row; drop comments too long for J.
df_filtered = df.copy(deep=True)
df_filtered = df_filtered[df_filtered[clean_text_colname].apply(filter_cond)]

In [7]:
comments = df_filtered[clean_text_colname].tolist()

# Tally how often each character occurs across all retained comments.
# Counter.update() iterates a string one character at a time.
char_counter = Counter()
for text in comments:
    char_counter.update(text)

In [8]:
# Character frequencies, most frequent first; used to choose the vocabulary cut-off.
char_counter.most_common()

[(' ', 47490),
 ('e', 23909),
 ('t', 18903),
 ('a', 16948),
 ('o', 16327),
 ('i', 15852),
 ('s', 15183),
 ('n', 14237),
 ('r', 11615),
 ('h', 11257),
 ('l', 10380),
 ('d', 7389),
 ('u', 7176),
 ('m', 6000),
 ('c', 5625),
 ('y', 5003),
 ('p', 4958),
 ('.', 4949),
 ('w', 4732),
 ('g', 4551),
 ('f', 3667),
 ('b', 3525),
 ('k', 2707),
 ('v', 2071),
 ('!', 1740),
 ("'", 1345),
 (',', 1085),
 ('?', 809),
 ('j', 649),
 ('"', 328),
 ('x', 296),
 ('0', 255),
 ('z', 225),
 ('-', 214),
 ('2', 190),
 (':', 171),
 ('1', 128),
 (')', 93),
 ('q', 88),
 ('3', 76),
 ('4', 66),
 ('#', 56),
 ('5', 56),
 ('(', 49),
 ('/', 45),
 ('7', 40),
 ('9', 37),
 ('6', 36),
 ('8', 36),
 ('=', 34),
 ('%', 31),
 ('*', 21),
 ('&', 19),
 (';', 19),
 ('>', 11),
 ('~', 10),
 ('@', 9),
 ('$', 7),
 ('<', 6),
 ('^', 6),
 ('+', 3),
 ('_', 2),
 (']', 1),
 ('`', 1),
 ('[', 1)]

In [9]:
# Special tokens use low control-character code points, so after sorting by
# code point they occupy the first vocabulary indices.
go = '\x00'
end_of_text = '\x01'
pad = '\x02'
end_of_padded_comment = '\x03'
unk = '\x04'

# Keep only characters seen more than 90 times; rarer ones map to `unk` later.
frequent_chars = [ch for ch, count in char_counter.items() if count > 90]
special_tokens = [go, end_of_text, pad, end_of_padded_comment, unk]

vocab = sorted(frequent_chars + special_tokens, key=ord)

In [10]:
# Index <-> character lookup tables derived from the sorted vocabulary.
int2char = dict(enumerate(vocab))
char2int = {char: idx for idx, char in int2char.items()}

In [11]:
# Show the index -> character table to sanity-check the vocabulary ordering.
int2char

{0: '\x00',
 1: '\x01',
 2: '\x02',
 3: '\x03',
 4: '\x04',
 5: ' ',
 6: '!',
 7: '"',
 8: "'",
 9: ')',
 10: ',',
 11: '-',
 12: '.',
 13: '0',
 14: '1',
 15: '2',
 16: ':',
 17: '?',
 18: 'a',
 19: 'b',
 20: 'c',
 21: 'd',
 22: 'e',
 23: 'f',
 24: 'g',
 25: 'h',
 26: 'i',
 27: 'j',
 28: 'k',
 29: 'l',
 30: 'm',
 31: 'n',
 32: 'o',
 33: 'p',
 34: 'r',
 35: 's',
 36: 't',
 37: 'u',
 38: 'v',
 39: 'w',
 40: 'x',
 41: 'y',
 42: 'z'}

In [12]:
# Show the character -> index table (inverse of int2char).
char2int

{'\x00': 0,
 '\x01': 1,
 '\x02': 2,
 '\x03': 3,
 '\x04': 4,
 ' ': 5,
 '!': 6,
 '"': 7,
 "'": 8,
 ')': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '0': 13,
 '1': 14,
 '2': 15,
 ':': 16,
 '?': 17,
 'a': 18,
 'b': 19,
 'c': 20,
 'd': 21,
 'e': 22,
 'f': 23,
 'g': 24,
 'h': 25,
 'i': 26,
 'j': 27,
 'k': 28,
 'l': 29,
 'm': 30,
 'n': 31,
 'o': 32,
 'p': 33,
 'r': 34,
 's': 35,
 't': 36,
 'u': 37,
 'v': 38,
 'w': 39,
 'x': 40,
 'y': 41,
 'z': 42}

In [13]:
def standardize(x):
    """Frame and pad a cleaned comment to a fixed length.

    Layout: go, the text, end_of_text, zero or more pad tokens, then
    end_of_padded_comment. The result is truncated to at most J + 1 characters.
    """
    n_pad = max(0, J - len(x) - 2)
    framed = ''.join([go, ''.join(x), end_of_text, pad * n_pad, end_of_padded_comment])
    return framed[:J + 1]

# Store the framed/padded form of each cleaned comment in its own column.
standardized_text_colname = 'standardized_' + text_colname
df_filtered[standardized_text_colname] = df_filtered[clean_text_colname].apply(standardize)

In [14]:
def token2int(x):
    """Map each character of ``x`` to its vocabulary index, using the `unk` index for unknown characters."""
    unk_id = char2int[unk]
    return [char2int.get(ch, unk_id) for ch in x]

# Integer-encode every standardized comment.
comment_int_colname = 'comment_ints'
df_filtered[comment_int_colname] = df_filtered[standardized_text_colname].apply(token2int)

# Round the record count down to a whole number of batches; the remainder
# rows are never visited by the training loop.
nr_filtered_provider_records = df_filtered.shape[0]
dataset_size = batch_size * (nr_filtered_provider_records // batch_size)

In [15]:
# Vocabulary size: number of distinct tokens, special tokens included.
V = len(char2int)

In [16]:
# Report the vocabulary size.
print(V)

43


In [17]:
####### Dataset format - inspect #######
# Build one batch by hand to check the input/target layout before training.
comment_ints_batch = df_filtered[comment_int_colname].iloc[0:batch_size].tolist()
comment_ints_batch = np.array(comment_ints_batch, dtype=np.int32)
# Inputs are positions 0..J-1; targets are the same rows shifted left by one
# (positions 1..J), i.e. next-character prediction.
xs_batch = comment_ints_batch[:,0:J]
ys_batch = comment_ints_batch[:,1:(J+1)]
print("Example:\n")
print("input chars:")
print(xs_batch[0,:])
print("\npredict chars:")
print(ys_batch[0,:])

Example:

input chars:
[ 0 25 26 29 29 18 34 41  5 20 29 26 31 36 32 31  8 35  5 22 30 18 26 29
 35  5 33 34 32 38 22  5 35 25 22  5 26 35  5 18  5 20 32 34 34 37 33 36
  5 24 29 32 19 18 29 26 35 36  5 34 26 35 32 36 36 32 11 22 18 36 22 34
  6  6  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2]

predict chars:
[25 26 29 29 18 34 41  5 20 29 26 31 36 32 31  8 35  5 22 30 18 26 29 35
  5 33 34 32 38 22  5 35 25 22  5 26 35  5 18  5 20 32 34 34 37 33 36  5
 24 29 32 19 18 29 26 35 36  5 34 26 35 32 36 36 32 11 22 18 36 22 34  6
  6  1  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  3]


In [18]:
# Peek at the integer-encoded comments that make up the first batch.
df_filtered[comment_int_colname].iloc[0:batch_size]

0     [0, 25, 26, 29, 29, 18, 34, 41, 5, 20, 29, 26,...
1     [0, 25, 26, 29, 29, 18, 34, 41, 5, 26, 35, 5, ...
2     [0, 24, 32, 5, 19, 22, 34, 31, 26, 22, 10, 5, ...
3     [0, 31, 32, 5, 30, 32, 34, 22, 5, 35, 18, 23, ...
4     [0, 25, 26, 29, 29, 18, 34, 41, 5, 20, 29, 26,...
5     [0, 19, 22, 34, 31, 26, 22, 5, 35, 18, 31, 21,...
6     [0, 41, 32, 37, 5, 29, 26, 19, 36, 18, 34, 21,...
7     [0, 26, 36, 8, 35, 5, 18, 29, 29, 5, 33, 18, 3...
8     [0, 30, 18, 28, 22, 5, 32, 37, 34, 5, 41, 32, ...
9     [0, 35, 22, 34, 26, 32, 37, 35, 29, 41, 10, 5,...
10    [0, 39, 25, 18, 36, 5, 25, 18, 33, 33, 22, 31,...
11    [0, 25, 32, 39, 5, 21, 32, 5, 18, 31, 41, 5, 3...
12    [0, 26, 8, 30, 5, 35, 22, 34, 26, 32, 37, 35, ...
13    [0, 36, 34, 37, 30, 33, 5, 39, 26, 29, 29, 5, ...
14    [0, 36, 34, 37, 30, 33, 5, 26, 35, 5, 18, 5, 3...
15    [0, 36, 34, 37, 30, 33, 5, 26, 35, 5, 35, 32, ...
16    [0, 19, 18, 31, 31, 32, 31, 5, 26, 35, 5, 36, ...
17    [0, 35, 18, 21, 29, 41, 10, 5, 4, 15, 5, 3

In [19]:
###### HM-LSTM #######
class Model:
    """Character-level language model on a stack of HM-LSTM cells (TF1 graph mode).

    Building a Model resets the default TensorFlow graph and then constructs,
    in order: the character embedding, the multi-layer HM-LSTM unrolled with
    ``tf.nn.dynamic_rnn``, a gated combination of every layer's hidden state,
    a leaky-ReLU output embedding, the logit layer, the cross-entropy loss and
    a gradient-clipped Adam training op.

    Args:
        batch_size: batch dimension of the ``xs`` / ``ys`` placeholders.
        J: number of timesteps in ``xs`` / ``ys``.
        V: vocabulary size (embedding rows and logit columns).
        emb_dim: embedding width.
        hidden_dim: ``num_units`` of each HM_LSTM_Cell.
        output_emb_dim: width of the output embedding fed to the logit layer.
        num_layers: number of stacked HM_LSTM_Cell layers. Any positive value
            is supported; the per-layer gates and attributes are generated.
        grad_clip: global-norm threshold for gradient clipping.
        sampling: when truthy, ``batch_size`` and ``J`` are both forced to 1.

    Attributes of interest:
        xs, ys, learning_rate, slope_annealing_placeholder: feed placeholders.
        initial_state, state: RNN state before / after the unrolled sequence.
        outputs: raw per-layer outputs returned by ``tf.nn.dynamic_rnn``.
        h_layers / h_layers_per_char: per-layer hidden states, and the same
            reshaped to ``[-1, hidden_dim]``.
        h_layer1, h_layer1_per_char, ...: one pair of attributes per layer,
            kept for callers that address layers by number.
        logits, probabilities, loss, train_op: outputs and training op.
    """

    def __init__(self, batch_size, J, V, emb_dim, hidden_dim, output_emb_dim, num_layers,
                 grad_clip, sampling):

        # Every Model starts from an empty default graph, so variable names
        # are identical between the training graph and a later sampling graph.
        tf.reset_default_graph()
        if sampling:
            batch_size, J = 1, 1

        self.V = V
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.output_emb_dim = output_emb_dim
        self.num_layers = num_layers
        self.grad_clip = grad_clip

        self.learning_rate = tf.placeholder(dtype=tf.float32, shape=[])
        self.slope_annealing_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
        self.xs = tf.placeholder(tf.int32, [batch_size, J])
        self.ys = tf.placeholder(tf.int32, [batch_size, J])

        self.emb_initializer = tf.random_uniform_initializer(minval=-0.1, maxval=0.1)
        self.emb_mat = tf.get_variable(
            name='emb_mat', dtype=tf.float32, shape=[self.V, self.emb_dim],
            initializer=self.emb_initializer
        )
        self.emb_xs = tf.nn.embedding_lookup(self.emb_mat, self.xs)

        self.multi_cell = Multi_HM_LSTM_Cell(
            [HM_LSTM_Cell(
                num_units=self.hidden_dim,
                slope_annealing_placeholder=self.slope_annealing_placeholder,
                forget_bias=1.0)
             for _ in range(0, self.num_layers)]
        )

        self.initial_state = self.multi_cell.zero_state(batch_size, tf.float32)
        self.outputs, self.state = tf.nn.dynamic_rnn(
            cell=self.multi_cell, inputs=self.emb_xs, initial_state=self.initial_state)

        # One hidden-state tensor per layer.
        # NOTE(review): assumes Multi_HM_LSTM_Cell yields exactly one output
        # per layer; this holds for the 3-layer configuration -- confirm for others.
        self.h_layers = list(self.outputs)

        print("each h_layer output should have shape [batch_size, timesteps, hidden dim]")
        for h_layer in self.h_layers:
            print(h_layer.get_shape().as_list())

        # Collapse batch and time into one axis so each row is one character position.
        self.h_layers_per_char = [
            tf.reshape(h_layer, [-1, self.hidden_dim]) for h_layer in self.h_layers
        ]

        # Numbered aliases (h_layer1, h_layer1_per_char, ...) for existing callers.
        for layer_no, (h_layer, h_flat) in enumerate(
                zip(self.h_layers, self.h_layers_per_char), start=1):
            setattr(self, 'h_layer{}'.format(layer_no), h_layer)
            setattr(self, 'h_layer{}_per_char'.format(layer_no), h_flat)

        h_out_per_char = tf.concat(self.h_layers_per_char, 1)

        # One scalar sigmoid gate per layer, each computed from the
        # concatenation of all layers' hidden states. The gates are created in
        # layer order so the auto-generated dense-layer variable names stay
        # the same as in the hand-unrolled three-layer version.
        gates = [
            tf.layers.dense(h_out_per_char, units=1, use_bias=False, activation=tf.nn.sigmoid)
            for _ in self.h_layers_per_char
        ]

        self.output_emb = tf.layers.dense(
            tf.concat(
                [gate * h_flat for gate, h_flat in zip(gates, self.h_layers_per_char)], 1),
            units=self.output_emb_dim,
            use_bias=False,
            activation=None
        )
        # Leaky ReLU with negative slope 0.10.
        self.output_emb = tf.maximum(0.10 * self.output_emb, self.output_emb)

        with tf.variable_scope('logit_layer'):
            logits_kernel = tf.get_variable(name='logits_kernel',
                shape=[self.output_emb_dim, self.V])

        self.logits = tf.matmul(self.output_emb, logits_kernel)
        self.probabilities = tf.nn.softmax(self.logits)

        # Logits are [batch * time, V]; restore the [batch, time, V] layout to match ys.
        self.loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.ys,
            logits=tf.reshape(self.logits, self.ys.get_shape().as_list() + [self.V])
        )

        # Adam with global-norm gradient clipping.
        tvars = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients, _ = zip(*optimizer.compute_gradients(self.loss, tvars))
        gradients, _ = tf.clip_by_global_norm(gradients, self.grad_clip)
        self.train_op = optimizer.apply_gradients(zip(gradients, tvars))

In [20]:
# Build the training graph (full batch, full sequence length).
model = Model(batch_size, J, V, emb_dim, hidden_dim, output_emb_dim, num_layers, 
    grad_clip, sampling=False)

# Model() resets the default graph, so the initializer op and the session
# must be created after the model has been built.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[64, 100, 512]
[64, 100, 512]
[64, 100, 512]


In [None]:
learning_rate = init_learning_rate
slope_value = init_slope_value
num_batches = dataset_size // batch_size

for epoch in range(0, num_epochs):

    # Step decay: shrink the learning rate every
    # `learning_rate_epochs_per_annealing` epochs, never below the minimum.
    if epoch > 0 and (epoch % learning_rate_epochs_per_annealing == 0):
        learning_rate = max(
            minimum_learning_rate,
            learning_rate * learning_rate_annealing_const)

    # Slope annealing: raise the slope a little each epoch up to max_slope.
    if epoch > 0:
        slope_value = min(
            max_slope,
            slope_value + slope_annealing_increase_per_epoch)

    # Reshuffle the rows at the start of every epoch.
    df_filtered = df_filtered.sample(frac=1).reset_index(drop=True)

    for i in range(0, num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size

        comment_ints_batch = np.array(
            df_filtered[comment_int_colname].iloc[start_idx:end_idx].tolist(),
            dtype=np.int32)

        # Inputs are positions 0..J-1; targets are the same rows shifted by one.
        xs_batch = comment_ints_batch[:, 0:J]
        ys_batch = comment_ints_batch[:, 1:(J + 1)]

        feed_dict = {model.xs: xs_batch,
                     model.ys: ys_batch,
                     model.slope_annealing_placeholder: slope_value,
                     model.learning_rate: learning_rate}

        # Fetch only what is used. Pulling the RNN outputs and the full
        # probability tensor back to Python on every step copies large arrays
        # for nothing. To debug activations, add e.g. model.h_layer1 or
        # model.probabilities to this fetch list temporarily.
        _, loss_batch = sess.run([model.train_op, model.loss], feed_dict=feed_dict)

        print("Epoch {} / {}... step {}...\t training loss: {}".format(
                epoch, num_epochs, i, loss_batch))

    # Keep the cell output bounded: wipe the log after every second epoch.
    if epoch > 0 and (epoch % 2 == 0):
        clear_output(wait=True)

Epoch 0 / 125... step 0...	 training loss: 3.7579402923583984
Epoch 0 / 125... step 1...	 training loss: 3.672825813293457
Epoch 0 / 125... step 2...	 training loss: 3.5305662155151367
Epoch 0 / 125... step 3...	 training loss: 5.38958740234375
Epoch 0 / 125... step 4...	 training loss: 8.181283950805664
Epoch 0 / 125... step 5...	 training loss: 4.687193393707275
Epoch 0 / 125... step 6...	 training loss: 2.1547369956970215
Epoch 0 / 125... step 7...	 training loss: 2.261915445327759
Epoch 0 / 125... step 8...	 training loss: 2.1636273860931396
Epoch 0 / 125... step 9...	 training loss: 2.331103801727295
Epoch 0 / 125... step 10...	 training loss: 1.9706655740737915
Epoch 0 / 125... step 11...	 training loss: 1.8968554735183716
Epoch 0 / 125... step 12...	 training loss: 1.970605731010437
Epoch 0 / 125... step 13...	 training loss: 2.0801782608032227
Epoch 0 / 125... step 14...	 training loss: 1.9405460357666016
Epoch 0 / 125... step 15...	 training loss: 1.9388964176177979
Epoch 0 / 

Epoch 1 / 125... step 48...	 training loss: 1.3399581909179688
Epoch 1 / 125... step 49...	 training loss: 1.3892285823822021
Epoch 1 / 125... step 50...	 training loss: 1.417523741722107
Epoch 1 / 125... step 51...	 training loss: 1.358782172203064
Epoch 1 / 125... step 52...	 training loss: 1.3883548974990845
Epoch 1 / 125... step 53...	 training loss: 1.3688204288482666
Epoch 1 / 125... step 54...	 training loss: 1.2653846740722656
Epoch 1 / 125... step 55...	 training loss: 1.2888848781585693
Epoch 1 / 125... step 56...	 training loss: 1.2563011646270752
Epoch 1 / 125... step 57...	 training loss: 1.32978355884552
Epoch 1 / 125... step 58...	 training loss: 1.3307737112045288
Epoch 1 / 125... step 59...	 training loss: 1.3456006050109863
Epoch 1 / 125... step 60...	 training loss: 1.2946584224700928
Epoch 1 / 125... step 61...	 training loss: 1.3899832963943481
Epoch 1 / 125... step 62...	 training loss: 1.2682607173919678
Epoch 1 / 125... step 63...	 training loss: 1.3103616237640

In [20]:
# Directory (relative to the notebook's working directory) where checkpoints
# are written and from which the latest one is later restored.
import os
checkpoint_dir = 'checkpoints'

In [22]:
import os

# Epoch tag embedded in the checkpoint file name. It is set by hand because
# training was stopped early; use `num_epochs` after a complete run.
# e = num_epochs
e = 6 

# Make sure the target directory exists. Depending on the TensorFlow version,
# Saver.save() can fail with "Parent directory ... doesn't exist" otherwise.
os.makedirs(checkpoint_dir, exist_ok=True)

saver = tf.train.Saver(max_to_keep=5)
saver.save(sess, os.path.join(checkpoint_dir, "hm_lstm_L{}_h{}_e{}.ckpt".format(
                    num_layers, hidden_dim, e)))

'checkpoints/hm_lstm_L3_h350_e6.ckpt'

In [23]:
# Release the training session; sample() opens its own session.
sess.close()

In [28]:
def pick_top_n(preds, nr_chars, top_n=4):
    """Sample one index from the ``top_n`` most probable entries of ``preds``.

    Args:
        preds: array-like of probabilities that squeezes to a 1-D vector of
            length ``nr_chars``. It is never modified.
        nr_chars: number of candidate indices (the vocabulary size).
        top_n: how many of the highest-probability entries stay eligible.
            Must be >= 1.

    Returns:
        The sampled index, drawn with probability proportional to the
        surviving entries of ``preds``.

    Raises:
        ValueError: if ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be >= 1, got {}".format(top_n))

    # np.squeeze returns a view, so zeroing it directly would overwrite the
    # caller's array. np.array(...) makes an independent copy; float64 keeps
    # the renormalised vector within np.random.choice's sum-to-one tolerance.
    p = np.array(np.squeeze(preds), dtype=np.float64)
    # argsort is ascending: everything except the last top_n indices is dropped.
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    return np.random.choice(nr_chars, 1, p=p)[0]

# Tokens that end generation when sampled; they are never added to the output text.
# NOTE(review): `unk` is not listed, so a sampled unk is emitted as the raw
# '\x04' character -- confirm that this is intended.
CONTROL_CHARS = [go, end_of_text, pad, end_of_padded_comment]

def sample(checkpoint, J, hidden_dim, V, prime="The "):
    """Generate text from a saved checkpoint, one character at a time.

    A full-size (``sampling=False``) model is rebuilt and restored. The input
    buffer is re-run from the zero state at every step, and the prediction
    for the newest position of row 0 is sampled with ``pick_top_n``.

    Args:
        checkpoint: checkpoint path accepted by ``tf.train.Saver.restore``.
        J: sequence length of the rebuilt model; also the maximum number of
            input positions (go token + prime + generated characters).
        hidden_dim: hidden size of the rebuilt model. It must match the checkpoint.
        V: vocabulary size.
        prime: text to condition on. For the model input it is encoded the
            same way as the training data (``clean`` then ``token2int``), so
            upper-case letters are lower-cased, non-ASCII characters are
            dropped and out-of-vocabulary characters become ``unk``. The
            returned string starts with ``prime`` exactly as given.

    Returns:
        ``prime`` followed by the generated characters. Generation stops at
        the first sampled control character or when position J is reached.

    Raises:
        ValueError: if the encoded prime plus the go token exceeds J positions.
    """
    samples = list(prime)
    model = Model(batch_size, J, V, emb_dim, hidden_dim, output_emb_dim, num_layers,
                  grad_clip, sampling=False)

    saver = tf.train.Saver()

    # Encode the prime like training text. A raw char2int lookup raised
    # KeyError for any character outside the lower-case vocabulary, including
    # the 'T' in the default prime.
    prime_ids = [char2int[go]] + token2int(clean(prime))
    if len(prime_ids) > J:
        raise ValueError(
            "prime needs {} input positions but the model has only J={}".format(
                len(prime_ids), J))

    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        zero_state = sess.run(model.initial_state)

        # Every row of the batch carries the same sequence; only row 0 is read back.
        x = np.zeros((batch_size, J), dtype=np.int32)
        x[:, :len(prime_ids)] = prime_ids

        def predict_at(position, top_n):
            """Run the model on the current buffer and sample the character after `position`."""
            feed = {model.xs: x,
                    model.slope_annealing_placeholder: 5.0,
                    model.initial_state: zero_state}
            probs = sess.run(model.probabilities, feed_dict=feed)
            probs = np.reshape(probs, [batch_size, J, V])[0, position, :]
            return pick_top_n(probs, nr_chars=V, top_n=top_n)

        # Only the prediction after the last prime character is needed, and
        # each run starts from the zero state, so one forward pass replaces
        # one pass per prime character.
        char_id = predict_at(len(prime_ids) - 1, top_n=4)
        if int2char[char_id] in CONTROL_CHARS:
            return ''.join(samples)
        samples.append(int2char[char_id])

        for i in range(len(prime_ids), J):
            # Feed the previously sampled character back in as the next input.
            x[:, i] = char_id
            char_id = predict_at(i, top_n=2)
            if int2char[char_id] in CONTROL_CHARS:
                break
            samples.append(int2char[char_id])

    return ''.join(samples)

In [29]:
# Path of the most recent checkpoint recorded in checkpoint_dir.
checkpoint = tf.train.latest_checkpoint(checkpoint_dir)

In [30]:
# Confirm which checkpoint will be restored.
checkpoint

'checkpoints/hm_lstm_L3_h350_e6.ckpt'

In [31]:
# Generate text from an empty prime, so the model sees only the go token.
# NOTE(review): the recorded output begins with 'T', which is not in the
# vocabulary and cannot come from an empty prime -- the stored output
# appears to be from an earlier version of this cell; re-run to refresh.
samp = sample(checkpoint, 100, hidden_dim, V, prime="")

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[64, 100, 350]
[64, 100, 350]
[64, 100, 350]
INFO:tensorflow:Restoring parameters from checkpoints/hm_lstm_L3_h350_e6.ckpt


In [32]:
# Show the generated text.
samp

'Toee     eee e    ttt      '

In [33]:
# Draw a second sample; sampling is random, so each call can differ.
samp = sample(checkpoint, 100, hidden_dim, V, prime="")

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[64, 100, 350]
[64, 100, 350]
[64, 100, 350]
INFO:tensorflow:Restoring parameters from checkpoints/hm_lstm_L3_h350_e6.ckpt


In [34]:
# Show the second generated text.
samp

'Toe     e    e e   ttttt   '