In [1]:
from HM_LSTM_Cell import HM_LSTM_Cell
from Multi_HM_LSTM_Cell import Multi_HM_LSTM_Cell
import tensorflow as tf
import pandas as pd
import csv
import numpy as np
from collections import Counter
from IPython.display import clear_output
import configparser

In [2]:
####### HYPERPARAMS #######
batch_size = 40
J = 100
emb_dim = 128
hidden_dim = 350
num_layers = 3

learning_rate = 0.0013
grad_clip = 5.0

init_slope_value = 1.0
max_slope = 5.0
slope_annealing_increase_per_epoch = 0.04

In [3]:
def hosh(x):
    hv = 0
    for c in list(str(x)):
        hv = hv ^ ord(c)
    return hv

In [4]:
####### DATASET PREP #######
config = configparser.ConfigParser()
config.read('.config')

fp = config['default']['fp']
label_colname = config['default']['label_colname']
text_colname = config['default']['text_colname']

provider_column_names = [label_colname, text_colname]
provider_label_values_hashed = [8, 70]

df = pd.read_csv(fp, sep='\t', names=provider_column_names, skiprows=1, quoting=csv.QUOTE_NONE, quotechar='|', escapechar='\\')
df = df[df[label_colname].apply(lambda x: hosh(x) in provider_label_values_hashed)]

In [5]:
comments = df[text_colname].tolist()
counter = Counter()

for comment in comments:
    counter.update(list(comment))

vocab = [k for k, v in counter.items() if v >= 20]

go = '\x00'
end_of_text = '\x01'
pad = '\x02'
end_of_padded_comment = '\x03'
unk = '\x04'

vocab.append(go)
vocab.append(end_of_text)
vocab.append(pad)
vocab.append(end_of_padded_comment)
vocab.append(unk)

int2char = {i: c for i, c in enumerate(vocab)}
char2int = {c: i for i, c in enumerate(vocab)}

In [6]:
clean_text_colname = 'clean_' + text_colname
standardized_text_colname = 'standardized_' + text_colname

def clean(x):
    return ''.join([c for c in x if ord(c) < 128])

def standardize(x):
    token_list = []
    token_list.append(go)
    token_list.extend(list(x))
    token_list.append(end_of_text)
    token_list.extend([pad for _ in range(0, max(0, J-len(x)-2))])
    token_list.append(end_of_padded_comment)
    return ''.join(token_list)[0:(J+1)]

def filter_cond(x):
    if len(x) + 3 <= J:
        return True
    else:
        return False

def token2int(x):
    return [(char2int[c] if c in char2int else char2int[unk]) for c in x]

df[clean_text_colname] = df[text_colname].apply(clean)

df_filtered = df.copy(deep=True)
df_filtered = df_filtered[df_filtered[clean_text_colname].apply(filter_cond)]
df_filtered[standardized_text_colname] = df_filtered[clean_text_colname].apply(standardize)
df_filtered['comment_ints'] = df_filtered[standardized_text_colname].apply(token2int)

nr_filtered_provider_records = df_filtered.shape[0]
dataset_size = batch_size * (nr_filtered_provider_records // batch_size)

In [7]:
V = len(char2int)

In [8]:
print(V)

86


In [9]:
####### Dataset format - inspect #######
comment_ints_batch = df_filtered['comment_ints'].iloc[0:batch_size].tolist()
comment_ints_batch = np.array(comment_ints_batch, dtype=np.int32)
xs_batch = comment_ints_batch[:,0:J]
ys_batch = comment_ints_batch[:,1:(J+1)]
print("Example:\n")
print("input chars:")
print(xs_batch[0,:])
print("\npredict chars:")
print(ys_batch[0,:])

Example:

input chars:
[81 37 34 76 76 57 44 56 11 50 76 34 60 55 40 60 21 63 11 49 13 57 34 76
 63 11 42 44 40 35 49 11 63 77 49 11 34 63 11 57 11  2 40 44 44 26 42 55
 11 39 76 40 14 57 76 34 63 55 11 44 34 63 40 55 55 40 69 49 57 55 49 44
 73 73 82 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83
 83 83 83 83]

predict chars:
[37 34 76 76 57 44 56 11 50 76 34 60 55 40 60 21 63 11 49 13 57 34 76 63
 11 42 44 40 35 49 11 63 77 49 11 34 63 11 57 11  2 40 44 44 26 42 55 11
 39 76 40 14 57 76 34 63 55 11 44 34 63 40 55 55 40 69 49 57 55 49 44 73
 73 82 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83 83
 83 83 83 84]


In [10]:
###### HM-LSTM #######
class Model:

    def __init__(self, batch_size, J, V, emb_dim, hidden_dim, num_layers, 
                 learning_rate, grad_clip, sampling):
        
        tf.reset_default_graph()
        if sampling == True:
            batch_size, J = 1, 1
        else:
            batch_size, J = batch_size, J
        
        self.V = V
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
            
        self.slope_annealing_placeholder = tf.placeholder(dtype=tf.float32, shape=[])
        self.xs = tf.placeholder(tf.int32, [batch_size, J])
        self.ys = tf.placeholder(tf.int32, [batch_size, J])
        
        self.emb_mat = tf.get_variable(
            name='emb_mat', dtype=tf.float32, shape=[self.V, self.emb_dim])
        
        self.emb_xs = tf.nn.embedding_lookup(self.emb_mat, self.xs)
        #self.emb_xs = tf.one_hot(self.xs, self.V)

        self.multi_cell = Multi_HM_LSTM_Cell(
            [HM_LSTM_Cell(
                num_units=self.hidden_dim, 
                slope_annealing_placeholder=self.slope_annealing_placeholder,
                forget_bias=1.0)
             for _ in range(0, self.num_layers)]
        )
        '''
        self.multi_cell = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.BasicLSTMCell(num_units=self.hidden_dim, forget_bias=1.0) 
             for _ in range(0, self.num_layers)]
        )
        '''
        
        self.initial_state = self.multi_cell.zero_state(batch_size, tf.float32)
        self.outputs, self.state = tf.nn.dynamic_rnn(
            cell=self.multi_cell, inputs=self.emb_xs, initial_state=self.initial_state)

        h_layer1, h_layer2, h_layer3 = self.outputs

        print("each h_layer output should have shape [batch_size, timesteps, hidden dim]")
        print(h_layer1.get_shape().as_list())
        print(h_layer2.get_shape().as_list())
        print(h_layer3.get_shape().as_list())

        h_layer1_per_char = tf.reshape(h_layer1, [-1, self.hidden_dim])
        h_layer2_per_char = tf.reshape(h_layer2, [-1, self.hidden_dim])
        h_layer3_per_char = tf.reshape(h_layer3, [-1, self.hidden_dim])

        h_out_per_char = tf.concat([h_layer1_per_char, h_layer2_per_char, h_layer3_per_char], 1)

        g1 = tf.layers.dense(h_out_per_char, units=1, use_bias=False, activation=tf.nn.sigmoid)
        g2 = tf.layers.dense(h_out_per_char, units=1, use_bias=False, activation=tf.nn.sigmoid)
        g3 = tf.layers.dense(h_out_per_char, units=1, use_bias=False, activation=tf.nn.sigmoid)

        self.output_emb = tf.layers.dense(
            tf.concat([
                g1 * h_layer1_per_char, 
                g2 * h_layer2_per_char, 
                g3 * h_layer3_per_char
            ], 1), 
            units=self.hidden_dim, 
            use_bias=False, 
            activation=tf.nn.relu
        )
        input_to_logits = self.output_emb
        very_fancy_dim = self.num_layers * self.hidden_dim
        '''
        very_fancy_dim = hidden_dim
        '''

        very_fancy_output = tf.reshape(self.outputs, [-1, very_fancy_dim])
        with tf.variable_scope('softmax'):
            softmax_w = tf.Variable(tf.truncated_normal((very_fancy_dim, self.V), stddev=0.1))
    
        self.logits = tf.matmul(very_fancy_output, softmax_w)
        self.probabilities = tf.nn.softmax(self.logits)

        self.loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.ys,
            logits=tf.reshape(self.logits, self.ys.get_shape().as_list() + [V])
        )

        #train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        tvars = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients, _ = zip(*optimizer.compute_gradients(self.loss, tvars))
        gradients, _ = tf.clip_by_global_norm(gradients, grad_clip)
        self.train_op = optimizer.apply_gradients(zip(gradients, tvars))

In [None]:
nr_epochs = 10

model = Model(batch_size, J, V, emb_dim, hidden_dim, num_layers, 
    learning_rate, grad_clip, sampling=False)

init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

for epoch in range(0, nr_epochs):
    slope_value = min(max_slope, init_slope_value + slope_annealing_increase_per_epoch * epoch)
    df_filtered = df_filtered.sample(frac=1).reset_index(drop=True)
    
    for i in range(0, dataset_size // batch_size):
        batch_idx = i * batch_size

        comment_ints_batch = df_filtered['comment_ints'].iloc[batch_idx:batch_idx+batch_size].tolist()
        comment_ints_batch = np.array(comment_ints_batch, dtype=np.int32)

        xs_batch = comment_ints_batch[:,0:J]
        ys_batch = comment_ints_batch[:,1:(J+1)]

        feed_dict = {model.xs: xs_batch, 
                     model.ys: ys_batch, 
                     model.slope_annealing_placeholder: slope_value}

        _, loss_batch = sess.run(
            [model.train_op, model.loss], feed_dict=feed_dict
        )
        
        if i % 10 == 0:
            print("Epoch {} / {}... step {}...\t training loss: {}".format(
                epoch, nr_epochs, i, loss_batch))
        
        if epoch > 0 and (epoch % 10 == 0):
            clear_output(wait=True)

each h_layer output should have shape [batch_size, timesteps, hidden dim]
[40, 100, 350]
[40, 100, 350]
[40, 100, 350]
Epoch 0 / 10... step 0...	 training loss: 4.4473557472229
Epoch 0 / 10... step 10...	 training loss: 4.994361400604248
Epoch 0 / 10... step 20...	 training loss: 2.7366459369659424
Epoch 0 / 10... step 30...	 training loss: 2.3921823501586914
Epoch 0 / 10... step 40...	 training loss: 2.514673948287964
Epoch 0 / 10... step 50...	 training loss: 2.5279386043548584
Epoch 0 / 10... step 60...	 training loss: 2.4569594860076904
Epoch 0 / 10... step 70...	 training loss: 2.4191956520080566
Epoch 0 / 10... step 80...	 training loss: 2.478078603744507
Epoch 0 / 10... step 90...	 training loss: 2.70499587059021
Epoch 0 / 10... step 100...	 training loss: 2.4354286193847656
Epoch 0 / 10... step 110...	 training loss: 2.3667025566101074
Epoch 0 / 10... step 120...	 training loss: 2.58023738861084
