In [1]:
import numpy as np
from time import time
from sklearn.metrics import accuracy_score,roc_auc_score,mean_squared_error
import tensorflow as tf
import pickle
import sys

def load_file(file):
    users = []
    pois = []
    scores = []
    historic_pois = []
    reviews = []
    for line in file:
        record = line.split("\t")
        user_id = int(record[0])
        poi_id = int(record[1])
        score = float(record[2])
        last_k = [int(s) for s in record[4].split(',')]
        review = record[5]
        users.append(user_id)
        pois.append(poi_id)
        scores.append(score)
        historic_pois.append(last_k)
        reviews.append(review)
    return users,pois,scores,historic_pois,reviews

# Dataset stats
FIRST_K = 10
print("num features = {}".format(FIRST_K))
PREFIX = "tcenr_data/141/"#"414_amazon_"#"414_t25_rest_b_"#"1655_t10_rest_"
INPUT_PREFIX = PREFIX + str(FIRST_K) + "_"
DIR = "yelp/" + INPUT_PREFIX#"amazon_dataset/" + INPUT_PREFIX #"yelp_dataset/" + PREFIX
WEIGHTS_DIR = "result/" + PREFIX
#NUM_USERS = 55160#3471#38039#7987#6869  
#NUM_ITEMS = 67144#3996#50513#6153  
#TRAIN_SIZE = 6239812#694836#4157256#1089484 
#TEST_SIZE = 220640#13884#152156#27476 
NUM_USERS = 2320#14346 
NUM_ITEMS = 2583#34389
#TRAIN_SIZE = 1115292#885756
TEST_SIZE = 11600#57384 

sys.argv=['pretraining2.py',32,32]
    
# Load training data
with open(DIR + "train.txt") as train_file:
    train_users, train_pois, train_scores, train_k_pois, train_reviews  = load_file(train_file)
    
TRAIN_SIZE = len(train_users)
print("Train size = {}".format(TRAIN_SIZE))
# Set training dataset
train_set = {'users': train_users, 'pois': train_pois, 'scores': train_scores, 'k_pois': train_k_pois, 'reviews': train_reviews}

# Load test data
with open(DIR + "valid.txt") as test_file:
    test_users, test_pois, test_scores, test_k_pois, test_reviews = load_file(test_file)
# Set test dataset
test_set = {'users': test_users, 'pois': test_pois, 'scores': test_scores, 'k_pois': test_k_pois, 'reviews': test_reviews}

# Check training data
assert len(train_users)==TRAIN_SIZE, "mismatch in training set size {}!={}".format(len(train_users),TRAIN_SIZE)
assert len(train_k_pois[0])==FIRST_K==len(test_k_pois[0]), "mismatch in number of positive items {}!={}!={}".format(len(train_k_pois[0]),FIRST_K,len(test_k_pois[0]))
assert max(train_users)==(NUM_USERS-1)==max(test_users), "mimatch in number of users {}!={}!={}".format(max(train_users),(NUM_USERS-1),max(test_users))
assert max(train_pois)==(NUM_ITEMS-1)==max(test_pois), "mismatch in number of item {}!={}!={}".format(max(train_pois),(NUM_ITEMS-1),max(test_pois))
assert len(test_users)==TEST_SIZE, "mismatch in test set size {}!={}".format(len(test_users),TEST_SIZE)

num features = 10
Train size = 542825


In [2]:
# Location embedding sizes
if len(sys.argv)>1:
    ITEM_EMBEDDING_SIZE = sys.argv[1]
else:
    ITEM_EMBEDDING_SIZE = 32#128

# User embedding sizes
if len(sys.argv)>2:
    USER_EMBEDDING_SIZE = sys.argv[2]
else:
    USER_EMBEDDING_SIZE = 32

print("item and user embedding sizes: {}, {}".format(ITEM_EMBEDDING_SIZE,USER_EMBEDDING_SIZE))
HIDDEN_LAYERS = [64,32]

# Training parameters
LR = 0.05
EPOCHS = 150
EARLY_STOP_INTERVAL = 20
MIN_EPOCH_TO_SAVE = 10
ROC_DIFF_TO_SAVE = 1.002
BATCH_SIZE = 8192

item and user embedding sizes: 32, 32


In [3]:
tf.reset_default_graph()

In [4]:
import math

in_item = tf.placeholder(tf.int32,[None],'in_item')
in_user = tf.placeholder(tf.int32,[None],'in_user')
in_ratings = tf.placeholder(tf.float32,[None],'in_ratings')

init_value = 0.1

emb_user_layer = tf.Variable(tf.truncated_normal([NUM_USERS, USER_EMBEDDING_SIZE], 
                                                 stddev=init_value/math.sqrt(float(USER_EMBEDDING_SIZE)), mean=0),
                       name = 'user_embedding', dtype=tf.float32)
emb_item_layer = tf.Variable(tf.truncated_normal([NUM_ITEMS, ITEM_EMBEDDING_SIZE], 
                                           stddev=init_value/math.sqrt(float(ITEM_EMBEDDING_SIZE)), mean=0), 
                       name = 'item_embedding', dtype=tf.float32)
emb_user = tf.nn.embedding_lookup(emb_user_layer, in_user, name = 'target_user_emb') 
emb_item = tf.nn.embedding_lookup(emb_item_layer, in_item, name = 'candidate_item_emb')   
        
hidden_layers = [tf.concat([emb_user,emb_item],1)]

model_params = [emb_user_layer,emb_item_layer]  

for i in range(1,len(HIDDEN_LAYERS)):
    w_hidden_layer = tf.Variable(tf.truncated_normal([HIDDEN_LAYERS[i-1],HIDDEN_LAYERS[i]], stddev = init_value, mean = 0), 
                                 name = 'w_hidden_'+ str(i), dtype=tf.float32) 
    b_hidden_layer = tf.Variable(tf.truncated_normal([HIDDEN_LAYERS[i]], stddev = init_value*0.1, mean = 0), 
                                 name = 'b_hidden_'+ str(i), dtype=tf.float32)
    cur_layer = tf.nn.xw_plus_b(hidden_layers[i-1], w_hidden_layer, b_hidden_layer)
    cur_layer = tf.nn.relu(cur_layer)
    hidden_layers.append(cur_layer)
    model_params.append(w_hidden_layer)
    model_params.append(b_hidden_layer)     

In [5]:
w_output = tf.Variable(tf.truncated_normal([HIDDEN_LAYERS[-1], 1], stddev=init_value, mean=0), name='w_output', dtype=tf.float32)
b_output =  tf.Variable(tf.truncated_normal([1], stddev=init_value*0.01, mean=0), name='b_output', dtype=tf.float32)
model_params.append(w_output)
model_params.append(b_output)
raw_predictions = tf.nn.xw_plus_b(cur_layer, w_output, b_output, name='output')

predictions = tf.reshape(tf.sigmoid(raw_predictions), [-1]) 
raw_error = tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(raw_predictions, [-1]), 
                                                    labels=tf.reshape(in_ratings, [-1]))
error = tf.reduce_mean(raw_error,name='cross_entropy_loss')
loss = error
train_step = tf.train.AdamOptimizer(LR).minimize(loss, var_list=model_params)

In [6]:
def get_feed_dict(dataset,start,end):
    feed_dict = {in_item : dataset['pois'][start:end], in_user : dataset['users'][start:end],
                 in_ratings : dataset['scores'][start:end]}
    return feed_dict

def evaluate_model(dataset,set_size,batch_size):
    all_ratings = []
    all_predictions = []
    for i in range(0,set_size,batch_size):
        curr_ratings,curr_predictions = sess.run([in_ratings, predictions],get_feed_dict(dataset,i,i+batch_size))
        all_ratings = all_ratings + curr_ratings
        all_predictions = all_predictions + curr_predictions
    return all_ratings,all_predictions

In [7]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
best_roc_auc=0.001
best_roc_epoch=MIN_EPOCH_TO_SAVE
saver = tf.train.Saver({"item_embedding":emb_item_layer, "user_embedding":emb_user_layer})

#TRAIN_SIZE = 256100
#TEST_SIZE = 6764

test_ratings, test_output,users,items = sess.run([in_ratings, predictions,in_user,in_item],get_feed_dict(test_set,0,TEST_SIZE))
test_accuracy = accuracy_score(y_true = test_ratings, y_pred=test_output.round())
test_mse = mean_squared_error(y_true=test_ratings, y_pred=test_output)
test_roc_auc = roc_auc_score(y_true=test_ratings, y_score=test_output)
print("Test accuracy before training :{:.4f} mse:{:.4f} roc:{:.4f}".format(test_accuracy,test_mse,test_roc_auc))

for epoch in range(EPOCHS):
    t1 = time()
    # Shuffle the training batch order
    training_indexes = list(range(0, TRAIN_SIZE, BATCH_SIZE))
    total_batches = len(training_indexes)
    #np.random.shuffle(training_indexes)
    train_ratings=np.zeros(TRAIN_SIZE)
    train_outputs=np.zeros(TRAIN_SIZE)
    train_losses=np.zeros(total_batches)
    # Train the model for each batch size
    curr_iter=0
    for start in training_indexes:
        end = min(start + BATCH_SIZE,TRAIN_SIZE)
        feed_dict = get_feed_dict(train_set,start,end)
        # Perform a training step for current batch
        _,curr_loss,curr_ratings, curr_output = sess.run([train_step,loss,in_ratings,predictions],feed_dict)
        train_ratings[start:end] = curr_ratings
        train_outputs[start:end] = curr_output
        train_losses[curr_iter] = curr_loss
        curr_iter+=1
        
    print("epoch {} took {} ms. avg loss {:.4f}".format(epoch,time()-t1,np.average(train_losses)))
    train_accuracy = accuracy_score(y_true = train_ratings, y_pred=train_outputs.round())
    train_mse = mean_squared_error(y_true=train_ratings, y_pred=train_outputs)
    train_roc_auc = roc_auc_score(y_true=train_ratings, y_score=train_outputs)
    print("Train accuracy:{:.4f} mse:{:.4f} roc:{:.4f}".format(train_accuracy,train_mse,train_roc_auc))

    test_ratings, test_output,users,items = sess.run([in_ratings, predictions,in_user,in_item],get_feed_dict(test_set,0,TEST_SIZE))
    test_accuracy = accuracy_score(y_true = test_ratings, y_pred=test_output.round())
    test_mse = mean_squared_error(y_true=test_ratings, y_pred=test_output)
    test_roc_auc = roc_auc_score(y_true=test_ratings, y_score=test_output)
    print("Test accuracy:{:.4f} mse:{:.4f} roc:{:.4f}".format(test_accuracy,test_mse,test_roc_auc))
    
    if (test_roc_auc/best_roc_auc)>ROC_DIFF_TO_SAVE and epoch>MIN_EPOCH_TO_SAVE:
        save_path = saver.save(sess, WEIGHTS_DIR + "pre_trained" + str(ITEM_EMBEDDING_SIZE) + "_" + str(USER_EMBEDDING_SIZE) +".ckpt")
        print("ROC improved from {:.4f} to {:.4f}. Model savedd to {}".format(best_roc_auc,test_roc_auc,save_path))
        best_roc_auc = test_roc_auc
        best_roc_epoch = epoch
    
    if (epoch - best_roc_epoch)>EARLY_STOP_INTERVAL:
        print("Early stop due to no imporvement since epoch {}".format(best_roc_epoch))
        break
    
sess.close()

Test accuracy before training :0.4412 mse:0.2501 roc:0.4973
epoch 0 took 0.41890573501586914 ms. avg loss 0.4905
Train accuracy:0.7954 mse:0.1572 roc:0.6214
Test accuracy:0.8000 mse:0.1651 roc:0.5475
epoch 1 took 0.37436723709106445 ms. avg loss 0.4760
Train accuracy:0.7975 mse:0.1520 roc:0.6690
Test accuracy:0.7984 mse:0.1549 roc:0.6477
epoch 2 took 0.32744693756103516 ms. avg loss 0.4596
Train accuracy:0.7990 mse:0.1474 roc:0.7081
Test accuracy:0.7961 mse:0.1469 roc:0.7337
epoch 3 took 0.3484196662902832 ms. avg loss 0.4522
Train accuracy:0.7996 mse:0.1442 roc:0.7415
Test accuracy:0.8033 mse:0.1387 roc:0.7816
epoch 4 took 0.326401948928833 ms. avg loss 0.4104
Train accuracy:0.8067 mse:0.1330 roc:0.7959
Test accuracy:0.8059 mse:0.1344 roc:0.8002
epoch 5 took 0.2804605960845947 ms. avg loss 0.3982
Train accuracy:0.8104 mse:0.1295 roc:0.8117
Test accuracy:0.8244 mse:0.1212 roc:0.8478
epoch 6 took 0.2973921298980713 ms. avg loss 0.3675
Train accuracy:0.8158 mse:0.1213 roc:0.8429
Test acc

In [None]:
import numpy as np
from time import time
from sklearn.metrics import accuracy_score,roc_auc_score,mean_squared_error
import tensorflow as tf
import pickle

def load_file(file):
    users = []
    pois = []
    scores = []
    historic_pois = []
    reviews = []
    for line in file:
        record = line.split("\t")
        user_id = int(record[0])
        poi_id = int(record[1])
        score = float(record[2])
        last_k = [int(s) for s in record[4].split(',')]
        review = record[5]
        users.append(user_id)
        pois.append(poi_id)
        scores.append(score)
        historic_pois.append(last_k)
        reviews.append(review)
    return users,pois,scores,historic_pois,reviews

# Dataset stats
PREFIX = "414_t25_rest_b_"#"1655_t10_rest_"
DIR = "amazon_dataset/" + PREFIX #"yelp_dataset/" + PREFIX
WEIGHTS_DIR = "result/" + PREFIX
#NUM_USERS = 42475#3471#38039#7987#6869  
#NUM_ITEMS = 42099#3996#50513#6153  
#TRAIN_SIZE = 4771688#694836#4157256#1089484 
#TEST_SIZE = 169900#13884#152156#27476 
NUM_USERS = 14346 
NUM_ITEMS = 34389
TRAIN_SIZE = 885756
TEST_SIZE = 57384 
FIRST_K = 8
NUM_FEATURES = 25#661#133
FEATURES_PER_POI = 25#25
FEATURE_TYPE = 'topics'

# Load poi-categories dictionary
with open(DIR + 'poi_' + FEATURE_TYPE + '.pkl', 'rb') as f:
    poi_categories = pickle.load(f)

# Flatten all categoires into one dimensional set
categories_list = set()
for categories in poi_categories.values():
    for category in categories:
        categories_list.add(category)
    
# Load training data
with open(DIR + "train.txt") as train_file:
    train_users, train_pois, train_scores, train_k_pois, train_reviews  = load_file(train_file)
# Load categories of training pois
train_features = []
for poi in train_pois:
    train_features.append(poi_categories[poi])
# Set training dataset
train_set = {'users': train_users, 'pois': train_pois, 'scores': train_scores, 'k_pois': train_k_pois, 'reviews': train_reviews, 
             'features': train_features}

# Load test data
with open(DIR + "valid.txt") as test_file:
    test_users, test_pois, test_scores, test_k_pois, test_reviews = load_file(test_file)
# Load categories of test pois
test_features = []
for poi in test_pois:
    test_features.append(poi_categories[poi])
# Set test dataset
test_set = {'users': test_users, 'pois': test_pois, 'scores': test_scores, 'k_pois': test_k_pois, 'reviews': test_reviews, 
            'features': test_features}

# Check training data
assert len(train_users)==TRAIN_SIZE, "mismatch in training set size {}!={}".format(len(train_users),TRAIN_SIZE)
assert len(train_k_pois[0])==FIRST_K==len(test_k_pois[0]), "mismatch in number of positive items {}!={}!={}".format(len(train_k_pois[0]),FIRST_K,len(test_k_pois[0]))
assert max(train_users)==(NUM_USERS-1)==max(test_users), "mimatch in number of users {}!={}!={}".format(max(train_users),(NUM_USERS-1),max(test_users))
assert max(train_pois)==(NUM_ITEMS-1)==max(test_pois), "mismatch in number of item {}!={}!={}".format(max(train_pois),(NUM_ITEMS-1),max(test_pois))
#assert len(categories_list)+1==NUM_FEATURES, "mismatch in number of features {}!={}".format(len(categories_list)+1,NUM_FEATURES)
#assert len(train_features[0])==FEATURES_PER_POI, "mismatch in number of features per location {}!={}".format(len(train_features[0]),FEATURES_PER_POI)
assert len(test_users)==TEST_SIZE, "mismatch in test set size {}!={}".format(len(test_users),TEST_SIZE)