# Setup

## Imports

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import json
import pickle
import itertools
import csv
import datetime
import time
import sys

# our modules
sys.path.insert(0, './models')
from loaddata import loader
from preprocess_utils import process
from NARRE import NARRE
from NARRE_DNN import NARRE_dnn
from NARRE_Attention import Multi_NARRE

from train import train_step, dev_step, write_summary
%load_ext tensorboard

## Constants

In [2]:
# Dataset selection: the working directory for a dataset is <data_path>/<dataset>.
data_path, dataset = './data', 'kindle'
TP_file = 'Kindle_Store_5.json'  # file name handed to `loader` together with TPS_DIR
TPS_DIR = os.path.join(data_path, dataset)

# Load Data

In [3]:
# Run the project's `loader` on TP_file for the dataset directory TPS_DIR.
# Per the recorded output, the resulting files are saved under TPS_DIR.
data_loader = loader(TPS_DIR, TP_file)
data_loader.load()

Data Loading Started.
Data Loader Finished...
Files Saved at ./data/kindle


# Preprocess

In [3]:
# Locations of the preprocessing inputs, all inside TPS_DIR.
# The three rating splits share the '<split>.csv' naming scheme.
train_data, valid_data, test_data = (
    os.path.join(TPS_DIR, split + '.csv') for split in ('train', 'valid', 'test'))

# Per-side (user / item) review files and their review-id companions.
user_review, item_review = (
    os.path.join(TPS_DIR, side + '_review') for side in ('user', 'item'))
user_review_id, item_review_id = (
    os.path.join(TPS_DIR, side + '_rid') for side in ('user', 'item'))

# No stop-word list is passed to `process`.
stopwords = None

In [4]:
# Run the preprocessing step. `process` returns 18 values that are unpacked
# positionally, so the order of the targets below must not change.
(u_text, i_text,
 y_train, y_valid,
 vocabulary_user, vocabulary_inv_user,
 vocabulary_item, vocabulary_inv_item,
 uid_train, iid_train,
 uid_valid, iid_valid,
 user_num, item_num,
 reid_user_train, reid_item_train,
 reid_user_valid, reid_item_valid) = process(
    train_data, valid_data,
    user_review, item_review,
    user_review_id, item_review_id,
    stopwords)

valid
len
u_len: 24
i_len: 39
u2_len: 234
i2_len: 234
user_num: 139816
item_num: 98823
load data done
pad user done
pad item done
235921
259763


In [5]:
# Shuffle the training rows once (fixed seed) and zip each split into
# per-example records (user id, item id, user review ids, item review ids, rating).
#
# Everything is derived from the untouched outputs of `process` and stored under
# new names. The previous version overwrote y_train / reid_user_train /
# reid_item_train in place, so re-running the cell shuffled those a second time
# while userid_train / itemid_train were rebuilt from the original ids, which
# silently misaligned ratings and review ids with their users and items.
# NOTE: y_train / y_valid therefore keep the shape returned by `process`; the
# column-shaped ratings live in rating_train / rating_valid.
np.random.seed(2020)
shuffle_indices = np.random.permutation(np.arange(len(y_train)))

# Training split: apply the same permutation to every aligned array.
# Ids and ratings get a trailing axis of length 1 before being zipped.
userid_train = uid_train[shuffle_indices][:, np.newaxis]
itemid_train = iid_train[shuffle_indices][:, np.newaxis]
rating_train = y_train[shuffle_indices][:, np.newaxis]
reuid_train = reid_user_train[shuffle_indices]
reiid_train = reid_item_train[shuffle_indices]

# Validation split: kept in its original order.
userid_valid = uid_valid[:, np.newaxis]
itemid_valid = iid_valid[:, np.newaxis]
rating_valid = y_valid[:, np.newaxis]

batches_train = list(
    zip(userid_train, itemid_train, reuid_train, reiid_train, rating_train))
batches_test = list(
    zip(userid_valid, itemid_valid, reid_user_valid, reid_item_valid, rating_valid))

In [6]:
# Save the zipped records for train / test as pickles inside TPS_DIR.
# Each file is written inside a `with` block so the handle is closed even if
# `pickle.dump` raises part-way through.
print('write begin')
with open(os.path.join(TPS_DIR, dataset + '.train'), 'wb') as output:
    pickle.dump(batches_train, output)

with open(os.path.join(TPS_DIR, dataset + '.test'), 'wb') as output:
    pickle.dump(batches_test, output)

write begin


In [7]:
# Collect everything the model cell needs into one dictionary and pickle it
# as <dataset>.para inside TPS_DIR.
para = {
    'user_num': user_num,
    'item_num': item_num,
    # Review count / review length are read off the arrays stored under keys 0
    # and 1 of u_text / i_text (same entries as before).
    # NOTE(review): presumably every entry shares one padded shape - confirm.
    'review_num_u': u_text[0].shape[0],
    'review_num_i': i_text[0].shape[0],
    'review_len_u': u_text[1].shape[1],
    'review_len_i': i_text[1].shape[1],
    'user_vocab': vocabulary_user,
    'item_vocab': vocabulary_item,
    'train_length': len(y_train),
    'test_length': len(y_valid),
    'u_text': u_text,
    'i_text': i_text,
}

# `with` guarantees the handle is closed even if pickling fails.
with open(os.path.join(TPS_DIR, dataset + '.para'), 'wb') as output:
    pickle.dump(para, output)

# Model

## Settings and Hyperparameters

In [3]:
# define paths
word2vec = os.path.join(data_path, 'google.bin')
train_data = os.path.join(TPS_DIR, dataset + ".train")
valid_data = os.path.join(TPS_DIR, dataset + ".test")
para_data = os.path.join(TPS_DIR, dataset + ".para")

# Create the per-run log directory. The name only has minute resolution, so
# re-running this cell within the same minute must tolerate an existing directory
# (a bare os.makedirs would raise OSError).
logs_path = "./logs/" +  datetime.datetime.now().strftime('run-%Y-%m-%d-%H-%M')
if not os.path.isdir(logs_path):
    os.makedirs(logs_path)

# Model Hyperparameters
embedding_dim = 300
filter_sizes = "3"  # comma-separated list; split and converted to ints in the run cell
num_filters = 100
dropout_keep_prob = 0.3
l2_reg_lambda = 0.001
embedding_id = 32
latent_size = 32
attention_size = 32

# Training parameters
batch_size = 100
num_epochs = 1

# Misc Parameters
allow_soft_placement = True
log_device_placement = False
allow_growth = True


# Loading Data
print("Loading data...")
# NOTE: pickle.load can execute arbitrary code; only load .para files that
# were produced by this notebook's preprocessing cells.
with open(para_data, 'rb') as pkl_file:
    para = pickle.load(pkl_file)
user_num = para['user_num']
item_num = para['item_num']
review_num_u = para['review_num_u']
review_num_i = para['review_num_i']
review_len_u = para['review_len_u']
review_len_i = para['review_len_i']
vocabulary_user = para['user_vocab']
vocabulary_item = para['item_vocab']
train_length = para['train_length']
test_length = para['test_length']
u_text = para['u_text']
i_text = para['i_text']

# One seed value drives both NumPy (here) and TensorFlow (in the run cell).
random_seed = 2017
np.random.seed(random_seed)

# Single-argument print calls produce the same text as the former Python 2
# print statements ("<label> <value>").
for label, value in [('Num. of Users: ', user_num),
                     ('Num. of Items: ', item_num),
                     ('Num. of User Reviews: ', review_num_u),
                     ('Length of User Reviews: ', review_len_u),
                     ('Num. of Item Reviews: ', review_num_i),
                     ('Length of Item Reviews: ', review_len_i)]:
    print(label + ' ' + str(value))

Loading data...
Num. of Users:  139816
Num. of Items:  98823
Num. of User Reviews:  24
Length of User Reviews:  234
Num. of Item Reviews:  39
Length of Item Reviews:  234


## Run

In [4]:
narre_version = 'attention'
narre_params = {}  # extra keyword arguments forwarded to the model constructor

# Map every supported version name to its model class. An unknown name now
# fails immediately instead of leaving `narre` undefined (or, worse, bound to
# whatever class an earlier run of this cell selected).
narre_variants = {
    'original': NARRE,
    'dnn': NARRE_dnn,
    'attention': Multi_NARRE,
}
if narre_version not in narre_variants:
    raise ValueError("Unknown narre_version {!r}; expected one of {}".format(
        narre_version, sorted(narre_variants)))
narre = narre_variants[narre_version]

In [5]:
with tf.Graph().as_default():

    # Set the graph-level seed BEFORE any op is created: TensorFlow 1.x derives
    # an op's seed from the graph seed at the moment the op is built, so seeding
    # after the model is constructed cannot affect the model's own ops.
    tf.set_random_seed(random_seed)

    session_conf = tf.ConfigProto(
        allow_soft_placement=allow_soft_placement,
        log_device_placement=log_device_placement)
    # Honour the `allow_growth` setting instead of a hard-coded True.
    session_conf.gpu_options.allow_growth = allow_growth
    sess = tf.Session(config=session_conf)
    with sess.as_default():

        # Build the selected NARRE variant from the loaded dataset parameters
        # and the hyperparameters defined in the settings cell.
        deep = narre(
            review_num_u=review_num_u,
            review_num_i=review_num_i,
            review_len_u=review_len_u,
            review_len_i=review_len_i,
            user_num=user_num,
            item_num=item_num,
            num_classes=1,
            user_vocab_size=len(vocabulary_user),
            item_vocab_size=len(vocabulary_item),
            embedding_size=embedding_dim,
            embedding_id=embedding_id,
            filter_sizes=list(map(int, filter_sizes.split(","))),
            num_filters=num_filters,
            l2_reg_lambda=l2_reg_lambda,
            attention_size=attention_size,
            n_latent=latent_size,
            **narre_params)

        # Summary writer for this run's log directory (also receives the graph).
        writer = tf.summary.FileWriter(logs_path, sess.graph)
        print(user_num)
        print(item_num)
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Adam on the model loss; `minimize` also increments global_step.
        optimizer = tf.train.AdamOptimizer(0.002, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(deep.loss, global_step=global_step)
        train_op = optimizer

        # Non-deprecated replacement for tf.initialize_all_variables().
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()

        def _load_word2vec_matrix(path, vocabulary, dim):
            """Build an embedding matrix for `vocabulary` from a binary word2vec file.

            The matrix starts as uniform noise in [-1, 1) with one row per
            vocabulary entry; rows of words found in the file are overwritten
            with the stored float32 vector.

            Args:
                path: path of the word2vec file (text header "<vocab> <dim>",
                    then per word: the word, a space, and the raw vector bytes).
                vocabulary: mapping from word to row index.
                dim: number of columns of the returned matrix.

            Returns:
                (matrix, hits): the (len(vocabulary), dim) array and the number
                of file entries whose word was found in `vocabulary`.

            Raises:
                EOFError: if the file ends in the middle of a word (previously
                    this case looped forever).
            """
            matrix = np.random.uniform(-1.0, 1.0, (len(vocabulary), dim))
            hits = 0
            with open(path, "rb") as f:
                header = f.readline()
                vocab_size, layer1_size = map(int, header.split())
                binary_len = np.dtype('float32').itemsize * layer1_size
                for _ in xrange(vocab_size):
                    # Read the word one byte at a time up to the separating space;
                    # newlines left over from the previous record are skipped.
                    chars = []
                    while True:
                        ch = f.read(1)
                        if not ch:
                            raise EOFError(
                                "Unexpected end of word2vec file {}".format(path))
                        if ch == ' ':
                            break
                        if ch != '\n':
                            chars.append(ch)
                    word = ''.join(chars)

                    # Always consume the vector bytes to stay aligned with the file.
                    vector_bytes = f.read(binary_len)
                    if word in vocabulary:
                        hits += 1
                        matrix[vocabulary[word]] = np.frombuffer(vector_bytes, dtype='float32')
            return matrix, hits

        if word2vec:
            # User-side word embeddings.
            print("Load word2vec u file {}\n".format(word2vec))
            initW, u = _load_word2vec_matrix(word2vec, vocabulary_user, embedding_dim)
            sess.run(deep.W1.assign(initW))

            # Item-side word embeddings.
            print("Load word2vec i file {}\n".format(word2vec))
            initW, item = _load_word2vec_matrix(word2vec, vocabulary_item, embedding_dim)
            sess.run(deep.W2.assign(initW))
            print(item)

        # --- metric bookkeeping and early-stopping state ---
        best_mae = 5
        best_rmse = 5
        best_preds = None  # stays None if no epoch beats the initial best_rmse
        train_mae = 0
        train_rmse = 0
        sum_tloss = 0
        early_stop = 1  # non-improving epoch-end evaluations tolerated before stopping
        stopping_step = 0
        should_stop = False

        # Load the pickled records into NEW names: `train_data` / `valid_data`
        # keep holding the file paths, so this cell can be re-run.
        # NOTE: pickle.load can execute arbitrary code; only load files written
        # by this notebook's preprocessing cells.
        with open(train_data, 'rb') as pkl_file:
            train_set = np.array(pickle.load(pkl_file))
        with open(valid_data, 'rb') as pkl_file:
            valid_set = np.array(pickle.load(pkl_file))

        data_size_train = len(train_set)
        data_size_test = len(valid_set)
        # Training uses full batches only (a trailing partial batch is dropped).
        ll = data_size_train // batch_size
        # Validation uses ceil division: every example is evaluated and no empty
        # trailing batch is produced when the size is a multiple of batch_size
        # (the old `int(n / batch_size) + 1` crashed on that empty batch).
        ll_test = (data_size_test + batch_size - 1) // batch_size

        def _gather_reviews(user_ids, item_ids):
            """Return the u_text / i_text entries for a batch of ids as two arrays.

            Each id is a length-1 sequence; its first element is the lookup key.
            """
            users = np.array([u_text[user_id[0]] for user_id in user_ids])
            items = np.array([i_text[item_id[0]] for item_id in item_ids])
            return users, items

        def _evaluate():
            """Run `dev_step` over the whole validation set.

            Per-batch results are weighted by batch size and normalised by
            `test_length`; the second value returned by `dev_step` is squared
            before averaging and the square root is taken at the end.

            Returns:
                (loss, rmse, mae, preds) where `preds` is the last value
                returned by `dev_step` (None if there were no batches).
            """
            loss_s = 0
            accuracy_s = 0
            mae_s = 0
            preds = None
            for eval_batch in range(ll_test):
                lo = eval_batch * batch_size
                hi = min(lo + batch_size, data_size_test)
                v_uid, v_iid, v_reuid, v_reiid, v_y = zip(*valid_set[lo:hi])
                u_valid, i_valid = _gather_reviews(v_uid, v_iid)

                loss, accuracy, mae, preds = dev_step(sess, deep, u_valid, i_valid, v_uid, v_iid, v_reuid, v_reiid,
                                                      v_y, global_step)
                n_rows = len(u_valid)
                loss_s = loss_s + n_rows * loss
                accuracy_s = accuracy_s + n_rows * np.square(accuracy)
                mae_s = mae_s + n_rows * mae
            return (loss_s / test_length,
                    np.sqrt(accuracy_s / test_length),
                    mae_s / test_length,
                    preds)

        print('Start training')
        start = time.time()
        for epoch in range(num_epochs):
            # Once early stopping has fired there is nothing left to do; the old
            # loop kept re-evaluating the unchanged model for every remaining epoch.
            if should_stop:
                break

            # Shuffle the data at each epoch
            shuffle_indices = np.random.permutation(np.arange(data_size_train))
            shuffled_data = train_set[shuffle_indices]
            for batch_num in range(ll):

                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size_train)
                uid, iid, reuid, reiid, y_batch = zip(*shuffled_data[start_index:end_index])
                u_batch, i_batch = _gather_reviews(uid, iid)

                t_loss, t_rmse, t_mae, u_a, i_a = train_step(sess, deep, u_batch, i_batch, uid, iid, reuid, reiid, y_batch,batch_num, dropout_keep_prob, train_op, global_step)
                current_step = tf.train.global_step(sess, global_step)
                sum_tloss += t_loss
                train_rmse += t_rmse
                train_mae += t_mae
                if epoch == 0 and batch_num == 0:
                    print('Finished first batch')

                # Periodic validation every 500 training batches.
                if batch_num % 500 == 0 and batch_num > 1:
                    print("\nEvaluation:")
                    print(batch_num)

                    loss_valid, rmse, mae, _ = _evaluate()
                    print("loss_valid {:g}, rmse_valid {:g}, mae_valid {:g}".format(loss_valid, rmse, mae))

                    print("")
                    write_summary(loss_valid, "validation/loss", writer, current_step)
                    write_summary(rmse, "validation/rmse", writer, current_step)
                    write_summary(mae, "validation/mae", writer, current_step)

            # --- end-of-epoch training summary (averages over the ll batches) ---
            print(str(epoch) + ':\n')
            print("\nEvaluation:")
            # Joined with str() to match the former print statement's output.
            print(' '.join(["train:rmse,mae:", str(train_rmse / ll), str(train_mae / ll)]))
            # u_a / i_a come from the last train_step call; show their first row.
            u_a = np.reshape(u_a[0], (1, -1))
            i_a = np.reshape(i_a[0], (1, -1))

            write_summary(sum_tloss / ll, "train/loss", writer, current_step)
            write_summary(train_rmse / ll, "train/rmse", writer, current_step)
            write_summary(train_mae / ll, "train/mae", writer, current_step)

            print(u_a)
            print(i_a)
            train_rmse = 0
            train_mae = 0
            sum_tloss = 0

            # --- end-of-epoch validation, drives early stopping ---
            loss_valid, rmse, mae, preds = _evaluate()
            print("loss_valid {:g}, rmse_valid {:g}, mae_valid {:g}".format(loss_valid, rmse, mae))

            if best_rmse > rmse:
                stopping_step = 0
                best_rmse = rmse
                # NOTE(review): `preds` is what the final dev_step call returned,
                # i.e. presumably only the last validation batch - confirm whether
                # predictions for the whole validation set were intended.
                best_preds = preds
            else:
                stopping_step += 1
                if stopping_step >= early_stop:
                    should_stop = True
                    print("Early stopping is trigger at epoch: {} loss:{}".format(epoch, loss_valid))

            if best_mae > mae:
                best_mae = mae
        print("")
        end = time.time()
        training_time = end-start

        # --- persist the run's results and hyperparameters as key/value rows ---
        filename = narre_version + '_' + dataset + '_' + datetime.datetime.now().strftime('run-%Y-%m-%d-%H-%M') + '.csv'
        results = {'model':narre_version, 'data':dataset, 'training time':training_time, 'rmse':best_rmse, 'mae':best_mae, 'preds':best_preds,
                   'embedding dim': embedding_dim, 'filter sizes': filter_sizes, 'num of filters': num_filters,
                   'dropout prob': dropout_keep_prob,'l2_reg': l2_reg_lambda, 'embedding_id': embedding_id,
                   'latent size': latent_size, 'attention_size': attention_size, 'batch size': batch_size, 'epochs': num_epochs}

        # Make sure the output directory exists so a finished run is not lost
        # to a missing folder.
        output_dir = './output/'
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        with open(output_dir + filename, 'w') as csv_file:
            # Separate name so the TensorBoard summary `writer` is not shadowed.
            csv_writer = csv.writer(csv_file)
            for key, value in results.items():
                csv_writer.writerow([key, value])




w1:  (235921, 300)
embedded_user:  Tensor("user_embedding/embedding_lookup/Identity:0", shape=(?, 24, 234, 300), dtype=float32)
embedded_users:  Tensor("user_embedding/ExpandDims:0", shape=(?, 24, 234, 300, 1), dtype=float32)
w2:  (259763, 300)
embedded_item:  Tensor("item_embedding/embedding_lookup/Identity:0", shape=(?, 39, 234, 300), dtype=float32)
embedded_items:  Tensor("item_embedding/ExpandDims:0", shape=(?, 39, 234, 300, 1), dtype=float32)


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
(?, 24, 100)
(100, 32)
100
Tensor("attention/Shape:0", shape=(2,), dtype=int32)
Tensor("attention/Shape_1:0", shape=(3,), dtype=int32)
32
Tensor("attention/transpose_1:0", shape=(?, 24, 1), dtype=float32)
(?, 32)
(?, 32)
(?, 32)

(?, 32)
(?, 32)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
139816
98823
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Load word2vec u file ./



Load word2vec i file ./data/google.bin





82259
Start training
Finished first batch

Evaluation:
500
loss_valid 289.664, rmse_valid 2.40697, mae_valid 2.28501



Evaluation:
1000
loss_valid 242.177, rmse_valid 2.20078, mae_valid 2.08434


Evaluation:
1500
loss_valid 178.284, rmse_valid 1.88822, mae_valid 1.77359


Evaluation:
2000
loss_valid 115.998, rmse_valid 1.52295, mae_valid 1.40684


Evaluation:
2500
loss_valid 91.2951, rmse_valid 1.35101, mae_valid 1.24215


Evaluation:
3000
loss_valid 69.4568, rmse_valid 1.17831, mae_valid 1.07114


Evaluation:
3500
loss_valid 55.7925, rmse_valid 1.05598, mae_valid 0.936568


Evaluation:
4000
loss_valid 47.7686, rmse_valid 0.977045, mae_valid 0.842042


Evaluation:
4500
loss_valid 44.8663, rmse_valid 0.94691, mae_valid 0.804681


Evaluation:
5000
loss_valid 44.2165, rmse_valid 0.940043, mae_valid 0.796324


Evaluation:
5500
loss_valid 41.7497, rmse_valid 0.913462, mae_valid 0.763047


Evaluation:
6000
loss_valid 40.5223, rmse_valid 0.899942, mae_valid 0.749331


Evaluation:
6500
loss_v

In [6]:
# Launch TensorBoard on ./logs/ as a foreground shell command from the notebook.
# NOTE(review): the --host address is hard-coded for one specific machine;
# adjust it (or drop the flag) when running elsewhere.
!python -m tensorboard.main --logdir=./logs/ --host=132.72.46.8

^C
Traceback (most recent call last):
  File "/home/nirku/.conda/envs/rs_project/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/nirku/.conda/envs/rs_project/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/nirku/.conda/envs/rs_project/lib/python2.7/site-packages/tensorboard/main.py", line 40, in <module>
    from tensorboard import default
  File "/home/nirku/.conda/envs/rs_project/lib/python2.7/site-packages/tensorboard/default.py", line 48, in <module>
    from tensorboard.plugins.interactive_inference import (
  File "/home/nirku/.conda/envs/rs_project/lib/python2.7/site-packages/tensorboard/plugins/interactive_inference/interactive_inference_plugin_loader.py", line 15, in <module>
    """Wrapper around plugin to conditionally enable it."""
KeyboardInterrupt
