In [1]:
import sys
import time
import random
import os.path
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
%run helpers_gru.py

Using TensorFlow backend.


## Parameters

In [3]:
# name of model. Used for saving conventions
name = 'recsys' # 'imusic'

# set sise of data (number of samples). If None (suggested), full datasets are applied.
limit = None

# how often would you like to check results?
show_every_n_batches = 3000

# decide on wether to show full validation statistics. Computational time is high when this is True
full_validation_stats = False

# decide whether to log testing
log_testing = True

# top k products to determine accuracy
top_k = 20

notes = 'Final GRU Model'

## Hyperparameters

In [4]:
# 512 - Number of sequences running through the network in one pass.
batch_size = 512

# 50 - Embedding dimensions
embed_dim = 300

# The dropout drop probability when training on input. If you're network is overfitting, try decreasing this.
x_drop_probability = 0.00

# The dropout keep probability when training on RNN neurons. If you're network is overfitting, try decreasing this.
rnn_keep_probability = 1.00

# 100 - The number of units in the hidden layers.
rnn_size = 200

# 1
num_layers = 1

# Learning rate for training
# typically 0.0001 up to 1: http://datascience.stackexchange.com/questions/410/choosing-a-learning-rate
# best 2017 05 01: learning_rate = 0.0025
learning_rate = 0.0025

# 10 epochs
num_epochs = 50

In [5]:
# create model folder for hyperparameters, statistics and the model itself
model_counter_path = 'models/model_counter.txt'
if os.path.isfile(model_counter_path):
    model_counter_file = open(model_counter_path, 'r')
    model_count = int(model_counter_file.read())
    model_counter_file.close()
    model_counter_file = open(model_counter_path, 'w')
    model_counter_file.write(str(model_count + 1))
    model_counter_file.close()
else:
    model_counter_file = open(model_counter_path, 'w+')
    model_count = 1000 # initial model count/number
    model_counter_file.write(str(model_count + 1))
    model_counter_file.close()

model_path_dir = 'models/' + str(model_count) + '-' + name + '-' + time.strftime("%y%m%d") + '/'
if not os.path.exists(model_path_dir):
    os.makedirs(model_path_dir)

stats_file_path = model_path_dir + name + '-' + time.strftime("%y%m%d%H%M") + '-statsfile' + '.txt'
stats_file = open(stats_file_path, 'w+')
stats_file.write('model number: {}\n'.format(model_count))
stats_file.write('name: {}\n\n'.format(name))
stats_file.write('limit: {}\n'.format(limit))
stats_file.write('batch_size: {}\n'.format(batch_size))
stats_file.write('embed_dim: {}\n'.format(embed_dim))
stats_file.write('x_drop_probability: {}\n'.format(x_drop_probability))
stats_file.write('rnn_keep_probability: {}\n'.format(rnn_keep_probability))
stats_file.write('rnn_size: {}\n'.format(rnn_size))
stats_file.write('num_layers: {}\n'.format(num_layers))
stats_file.write('learning_rate: {}\n'.format(learning_rate))
stats_file.write('num_epochs: {}\n'.format(num_epochs))
stats_file.write('show_every_n_batches: {}\n'.format(show_every_n_batches))
stats_file.write('top_k: {}\n'.format(top_k))
stats_file.write('full_validation_stats: {}\n'.format(full_validation_stats))
stats_file.write('notes: {}\n'.format(notes))
stats_file.close()

## Load Data

In [6]:
if limit == None:
    validation_limit = None
    testing_limit = None
else:
    validation_limit = int(0.2 * limit)
    testing_limit = int(0.2 * limit)

tr_data = load_our_data(path='data/rsc15_train_tr.txt', limit=limit)
va_data = load_our_data(path='data/rsc15_train_valid.txt', limit=validation_limit)
te_data = load_our_data(path='data/rsc15_test.txt', limit=testing_limit)

data/rsc15_train_tr.txt was successfully loaded in!
data/rsc15_train_valid.txt was successfully loaded in!
data/rsc15_test.txt was successfully loaded in!


## Data Preprocessing

In [7]:
# get number of unique products
print('uniques in training  ', np.unique(tr_data['ItemId']).shape[0])
print('uniques in validation', np.unique(va_data['ItemId']).shape[0])
print('uniques in testing   ', np.unique(te_data['ItemId']).shape[0])
uniques = np.unique(np.append(np.append(tr_data['ItemId'], va_data['ItemId']), te_data['ItemId']))
depth = uniques.shape[0]
print('\ndepth (unique items) ', depth)
if depth != np.unique(tr_data['ItemId']).shape[0]:
    print('\nWARNING! Number of uniques in training should equal the depth (uniques in full set)')

uniques in training   37483
uniques in validation 6359
uniques in testing    6751

depth (unique items)  37483


In [8]:
# Creating a lookup table
items_to_int, int_to_items = create_lookup_tables(list(uniques))

In [9]:
# 19 - Number of timesteps the rnn should take in
timesteps = 19

# Transforming and splitting the data
X_tr, y_tr = transform_and_split_our_data(tr_data, timesteps)
X_va, y_va = transform_and_split_our_data(va_data, timesteps)
X_te, y_te = transform_and_split_our_data(te_data, timesteps)

In [10]:
# Tranforming items to an integer ID
X_tr, y_tr = transforming_item_to_int(X_tr, y_tr, items_to_int)
X_va, y_va = transforming_item_to_int(X_va, y_va, items_to_int)
X_te, y_te = transforming_item_to_int(X_te, y_te, items_to_int)

In [11]:
# def variable_summaries(var):
#     """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
#     with tf.name_scope('summaries'):
#         mean = tf.reduce_mean(var)
#         tf.summary.scalar('mean', mean)
#         with tf.name_scope('stddev'):
#             stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
#         tf.summary.scalar('stddev', stddev)
#         tf.summary.scalar('max', tf.reduce_max(var))
#         tf.summary.scalar('min', tf.reduce_min(var))
#         tf.summary.histogram('histogram', var)

## Build Model

In [None]:
# reset any existing graph
tf.reset_default_graph()

# create new graph
graph = tf.Graph()
with graph.as_default():
    items_size = len(int_to_items) + 1

    with tf.name_scope('input'):
        inputs = tf.placeholder(tf.int32, [batch_size, timesteps], name='inputs')
        targets = tf.placeholder(tf.int32, [batch_size, timesteps], name='targets')

    with tf.name_scope("other_placeholders"):
        lr = tf.placeholder(tf.float32, name='learning_rate')
        x_drop_prob = tf.placeholder(tf.float32, name='x_drop_prob')
        rnn_keep_prob = tf.placeholder(tf.float32, name='rnn_keep_prob')
    
    with tf.name_scope("x_dropout"):
        inputs_dropped = tf.layers.dropout(inputs, rate=x_drop_prob)
    
    with tf.name_scope("embedding"):
        embedding = tf.get_variable('embedding_matrix', [items_size, embed_dim])
        rnn_inputs = tf.nn.embedding_lookup(embedding, inputs_dropped)

    with tf.name_scope("cell"):
        cell = tf.contrib.rnn.GRUCell(rnn_size)

    with tf.name_scope("rnn_dropout"):
        cell_dropped = tf.contrib.rnn.DropoutWrapper(cell=cell,
                                                     input_keep_prob=1,
                                                     state_keep_prob=rnn_keep_prob,
                                                     output_keep_prob=rnn_keep_prob,
#                                                      variational_recurrent=True,
                                                     input_size=rnn_size,
                                                     dtype=tf.float32)

    with tf.name_scope("rnn_dropout"):
        rnn_layer = tf.contrib.rnn.MultiRNNCell([cell_dropped] * num_layers)

    with tf.name_scope("initial_state"):
        initial_state = rnn_layer.zero_state(batch_size, tf.int32)
        initial_state = tf.identity(initial_state, name='initial_state')

    with tf.name_scope("rnn_output"):
        rnn_output, final_state = tf.nn.dynamic_rnn(rnn_layer, rnn_inputs, dtype=tf.float32)
        final_state =  tf.identity(final_state, name='final_state')

    with tf.name_scope("fully_connected"):
        logits = tf.contrib.layers.fully_connected(rnn_output,
                                                   items_size,
                                                   activation_fn=None,
                                                   biases_initializer=tf.constant_initializer(0.1))
    
    with tf.name_scope("softmax"):
        # y is our prediction
        probs = tf.nn.softmax(logits, name='probs')
        probs = tf.slice(probs, [0, 0, 1], [-1, -1, -1])
        zeros = tf.zeros([batch_size, timesteps, 1], tf.float32)
        probs = tf.concat([zeros, probs], 2)
    
    with tf.name_scope("masking"):
        # top k predictions: Shape = (batch_size, timesteps, k)
        top_preds_values, top_preds = tf.nn.top_k(probs, k=top_k)

        # making targets a 3D matrix and finding the mask values
        targets_ = tf.tile(tf.expand_dims(targets, 2), [1, 1, top_k])
        mask_3d = tf.sign(tf.to_float(targets_))
        mask_2d = tf.sign(tf.to_float(targets))

        equal_pad = tf.equal(tf.sign(tf.to_float(targets)), 0)
        pad_ints = tf.cast(equal_pad, tf.int32)
        pad_count = tf.reduce_sum(pad_ints)

        multiplier = tf.to_float(tf.divide((tf.multiply(batch_size, timesteps)), (tf.multiply(batch_size, timesteps) - pad_count)))

    with tf.name_scope("accuracy_calc"):
        # calculating accuracy with mask
        correct_pred = tf.equal(top_preds, targets_)
        cor_pred = tf.sign(tf.to_float(correct_pred))
        mask_acc = tf.multiply(mask_3d, cor_pred)
    
    with tf.name_scope('accuracy'):
        accuracy = tf.multiply(tf.reduce_mean(tf.cast(mask_acc, tf.float32)), top_k)
        accuracy_ = tf.multiply(accuracy, multiplier)
    
    with tf.name_scope('loss'):
        loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=targets)
        masked_losses = tf.multiply(mask_2d, loss)
        cost = tf.reduce_mean(masked_losses)

    with tf.name_scope('optimizer'):
        train_op = tf.train.AdamOptimizer(lr).minimize(cost)
    
    with tf.name_scope("saver"):
        saver = tf.train.Saver()
    
    with tf.name_scope("summaries"):
        tf.summary.scalar("loss", cost)
        tf.summary.scalar("accuracy", accuracy_)
        merged = tf.summary.merge_all()

## Train Model

In [None]:
# generate all batches for training, validation and testing data
tr_batches = get_batches(X_tr, y_tr, batch_size)
va_batches = get_batches(X_va, y_va, batch_size)
te_batches = get_batches(X_te, y_te, batch_size)
t0 = time.time() # initialize start of training
etr_h = '-' # initialize estimated time remaining
etr_m = '-'
log_count = 0 # counter for show_every_n_batches

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())

    training_writer = tf.summary.FileWriter(model_path_dir + '/training', graph=graph)
    validation_writer = tf.summary.FileWriter(model_path_dir + '/validation')
    testing_writer = tf.summary.FileWriter(model_path_dir + '/testing')
    
    # print headers of runtime statistics
    print('EPOCH       BATCH       TR LOSS  |  TR ACC  |  VA LOSS  |  VA ACC         TRAINING STATUS')
    stats_file = open(stats_file_path, 'a')
    stats_file.write('\nEPOCH       BATCH       TR LOSS  |  TR ACC  |  VA LOSS  |  VA ACC         TRAINING STATUS\n')
    stats_file.close()

    for epoch_i in range(num_epochs):
        # generate initial state
        state = sess.run(initial_state, {inputs: tr_batches[0][0]})

        # create empty arrays to hold accuracies and losses for training batches
        tr_epoch_acc, tr_epoch_loss, va_loss_array, va_acc_array = [], [], [], []
        
        # train model using the train_op optimizer
        for tr_batch_i, (X_tr_input, y_tr_input) in enumerate(tr_batches):
            feed = {
                inputs: X_tr_input,
                targets: y_tr_input,
                initial_state: state,
                lr: learning_rate,
                x_drop_prob: x_drop_probability,
                rnn_keep_prob: rnn_keep_probability
            }
            state, _ = sess.run([final_state, train_op], feed_dict=feed)

            # show every <show_every_n_batches> batches
            if (epoch_i * len(tr_batches) + tr_batch_i) % show_every_n_batches == 0:
                
                log_count += 1
                
                training_summary, tr_loss, tr_acc = sess.run([merged, cost, accuracy_], feed_dict={inputs: X_tr_input,
                                                                                                          targets: y_tr_input,
                                                                                                          x_drop_prob: 0,
                                                                                                          rnn_keep_prob: 1})
                tr_epoch_loss.append(tr_loss)
                tr_epoch_acc.append(tr_acc)
                
                # full validation check
                if full_validation_stats:
#                     va_loss_array, va_acc_array = [], []
                    for va_batch_i, (X_va_input, y_va_input) in enumerate(va_batches):
                        validation_summary, va_loss, va_acc = sess.run([merged, cost, accuracy_], feed_dict={inputs: X_va_input,
                                                                                                             targets: y_va_input,
                                                                                                             x_drop_prob: 0,
                                                                                                             rnn_keep_prob: 1})
                        va_loss_array.append(va_loss)
                        va_acc_array.append(va_acc)
                    va_loss = sum(va_loss_array) / len(va_loss_array)
                    va_acc = sum(va_acc_array) / len(va_acc_array)
                else:
                    va_batch_i = random.randint(0, len(va_batches) - 1)
                    validation_summary, va_loss, va_acc = sess.run([merged, cost, accuracy_], feed_dict={inputs: va_batches[va_batch_i][0],
                                                                                                         targets: va_batches[va_batch_i][1],
                                                                                                         x_drop_prob: 0,
                                                                                                         rnn_keep_prob: 1})
#                     va_loss_array.append(va_loss)
#                     va_acc_array.append(va_acc)
                
                # useful statistics
                progress = tr_batch_i + (len(tr_batches) * epoch_i) # current batch
                progress_total = len(tr_batches) * num_epochs # total batches
                progress_pct = progress / progress_total * 100
                seconds_spent = (time.time() - t0)
                if progress_pct > 0: # avoid devision by zero
                    etr = int((seconds_spent * (100 / progress_pct - 1)) / 60) # sets estimated time remaining
                    etr_h = int(etr / 60)
                    etr_m = int(etr % 60)
                
                # print all statistics
                print('{:>2}/{:>2}   {:>5}/{}       {:.3f}  |  {:>6.2%}  |    {:.3f}  |  {:>6.2%}       {:>4} m  {:>2}:{:>2} ({:>5.2%})'.format(
                    epoch_i + 1, # for human
                    num_epochs,
                    tr_batch_i,
                    len(tr_batches),
                    tr_loss,
                    tr_acc,
                    va_loss,
                    va_acc,
                    int(seconds_spent / 60),
                    etr_h,
                    etr_m,
                    progress_pct / 100
                ))
                
                # print all statistics to stats_file
                stats_file = open(stats_file_path, 'a')
                stats_file.write('{:>2}/{:>2}   {:>5}/{}       {:.3f}  |  {:>6.2%}  |    {:.3f}  |  {:>6.2%}       {:>4} m  {:>2}:{:>2} ({:>5.2%})\n'.format(
                    epoch_i + 1, # for human
                    num_epochs,
                    tr_batch_i,
                    len(tr_batches),
                    tr_loss,
                    tr_acc,
                    va_loss,
                    va_acc,
                    int(seconds_spent / 60),
                    etr_h,
                    etr_m,
                    progress_pct / 100
                ))
                stats_file.close()
                training_writer.add_summary(training_summary, log_count)
#                 validation_writer.add_summary(validation_summary, log_count)
        
        # add training stats after epoch
#         training_writer.add_summary(training_summary, log_count)
    
        # full validation after each epoch
        validation_summary = None
        va_loss_array, va_acc_array = [], []
        for va_batch_i, (X_va_input, y_va_input) in enumerate(va_batches):
            validation_summary, va_loss, va_acc = sess.run([merged, cost, accuracy_], feed_dict={inputs: X_va_input, targets: y_va_input, x_drop_prob: 0, rnn_keep_prob: 1})
            va_loss_array.append(va_loss)
            va_acc_array.append(va_acc)
        if(log_testing):
            validation_writer.add_summary(validation_summary, log_count)
        
        # full test after each epoch
        testing_summary = None
        te_loss_array, te_acc_array = [], []
        for te_batch_i, (X_te_input, y_te_input) in enumerate(te_batches):
            testing_summary, te_loss, te_acc = sess.run([merged, cost, accuracy_], feed_dict={inputs: X_te_input, targets: y_te_input, x_drop_prob: 0, rnn_keep_prob: 1})
            te_loss_array.append(te_loss)
            te_acc_array.append(te_acc)
        if(log_testing):
            testing_writer.add_summary(testing_summary, log_count)

        # print final epoch statistics
        stats_file = open(stats_file_path, 'a')
        print('\nEpoch {:>2}     Loss  |   Accuracy'.format(epoch_i + 1))
        stats_file.write('\nEpoch {:>2}     Loss  |   Accuracy\n'.format(epoch_i + 1))
        if len(tr_batches) < show_every_n_batches: # check if no intermediate loss/acc stats was shown
            print('No training statistics as batches ({}) < show_every_n_batches ({})'.format(len(tr_batches), show_every_n_batches))
            stats_file.write('No training statistics as batches ({}) < show_every_n_batches ({})\n'.format(len(tr_batches), show_every_n_batches))
        else:
            print('Training    {:.3f}  |     {:>6.2%} (show_every_n_batches)'.format(sum(tr_epoch_loss) / len(tr_epoch_loss), sum(tr_epoch_acc) / len(tr_epoch_acc)))
            stats_file.write('Training    {:.3f}  |     {:>6.2%} (show_every_n_batches)\n'.format(sum(tr_epoch_loss) / len(tr_epoch_loss), sum(tr_epoch_acc) / len(tr_epoch_acc)))
        print('Validation  {:.3f}  |     {:>6.2%} (all validation batches)'.format(sum(va_loss_array) / len(va_loss_array), sum(va_acc_array) / len(va_acc_array)))
        stats_file.write('Validation  {:.3f}  |     {:>6.2%} (all validation batches)\n'.format(sum(va_loss_array) / len(va_loss_array), sum(va_acc_array) / len(va_acc_array)))
        print('Testing     {:.3f}  |     {:>6.2%} (all test batches)'.format(sum(te_loss_array) / len(te_loss_array), sum(te_acc_array) / len(te_acc_array)))
        stats_file.write('Testing     {:.3f}  |     {:>6.2%} (all test batches)\n\n'.format(sum(te_loss_array) / len(te_loss_array), sum(te_acc_array) / len(te_acc_array)))
        stats_file.close()
        
        # Save Model
        location = model_path_dir + name + '-' + time.strftime("%y%m%d%H%M") + '.ckpt'
        saved_location = saver.save(sess, location, global_step=epoch_i + 1)
        print('Model saved ({})\n'.format(saved_location))

EPOCH       BATCH       TR LOSS  |  TR ACC  |  VA LOSS  |  VA ACC         TRAINING STATUS
 1/50       0/15677       1.686  |  19.94%  |    2.067  |   0.05%          0 m   -: - (0.00%)
 1/50    3000/15677       0.089  |  68.49%  |    0.423  |  15.49%         13 m  58:33 (0.38%)
 1/50    6000/15677       0.101  |  69.86%  |    0.375  |  11.74%         26 m  58:13 (0.77%)
 1/50    9000/15677       0.081  |  67.96%  |    0.361  |  17.16%         40 m  57:56 (1.15%)
 1/50   12000/15677       0.087  |  72.80%  |    0.326  |  18.98%         53 m  57:42 (1.53%)
 1/50   15000/15677       0.115  |  69.14%  |    0.268  |  45.46%         67 m  57:29 (1.91%)

Epoch  1     Loss  |   Accuracy
Training    0.360  |     61.36% (show_every_n_batches)
Validation  0.146  |     71.42% (all validation batches)
Testing     0.146  |     71.42% (all test batches)
Model saved (models/1111-recsys-170522/recsys-1705221537.ckpt-1)

 2/50    2323/15677       0.081  |  75.58%  |    0.246  |  45.71%         81 m  57:5