In [1]:
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [2]:
# reading the JSON data using json.load()
file = 'foody_data_foody_review_data.json'
with open(file) as train_file:
    raw_data = json.load(train_file)

In [3]:
ratings_list = []
count_num_of_store = 0
for store in raw_data:
    count_num_of_store += 1
    for review in store:
        ratings_list.append([review['name'], count_num_of_store, review['rating']])        

In [4]:
df = pd.DataFrame(ratings_list, columns = ['user', 'item', 'rate'])

In [5]:
df.apply(lambda x: sum(x.isnull()),axis=0)

user    0
item    0
rate    0
dtype: int64

In [6]:
le = LabelEncoder()
df['user'] = le.fit_transform(df['user'])
df['rate'] = df['rate'].astype('float64')
df['rate'].fillna(df['rate'].mean(), inplace=True)
df.dtypes

user      int64
item      int64
rate    float64
dtype: object

In [7]:
#load data
msk = np.random.rand(len(df)) < 0.7
print(msk)
df_train = df[msk]
user_indecies = [x for x in df_train.user.values]
item_indecies = [x for x in df_train.item.values]
rates = df_train.rate.values

[ True False False ...  True False  True]


In [9]:
#variables
feature_len = 10
U = tf.Variable(initial_value=tf.truncated_normal([2332,feature_len]), name='users')
P = tf.Variable(initial_value=tf.truncated_normal([feature_len,103]), name='items')

result = tf.matmul(U, P)

result_flatten = tf.reshape(result, [-1])
R = tf.gather(result_flatten, user_indecies * tf.shape(result)[1] + 
              item_indecies, name='extracting_user_rate')


#cost fucntion
diff_op = tf.subtract(R, rates, name='trainig_diff')
diff_op_squared = tf.abs(diff_op, name="squared_difference")
base_cost = tf.reduce_sum(diff_op_squared, name="sum_squared_error")


# regularization
lda = tf.constant(.001, name='lambda')
norm_sums = tf.add(tf.reduce_sum(tf.abs(U, name='user_abs'), name='user_norm'), 
   tf.reduce_sum(tf.abs(P, name='item_abs'), name='item_norm'))
regularizer = tf.multiply(norm_sums, lda, 'regularizer')

cost = tf.add(base_cost, regularizer)


#optimizer
lr = tf.constant(.001, name='learning_rate')
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(lr, global_step, 10000, 0.96, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_step = optimizer.minimize(cost, global_step=global_step)


#execution
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)
for i in range(50000):
    sess.run(training_step)  

In [23]:
R_test = tf.gather(result_flatten, user_indecies * tf.shape(result)[1] + item_indecies)
diff_op_test = tf.subtract(R_test, rates)
diff_op_abs_test = tf.abs(diff_op_test)

error = tf.div(tf.reduce_sum(diff_op_abs_test), df_test.shape[0])

print (sess.run(error))

0.15597712


In [18]:
#testing
u, p, r = df[['user', 'item', 'rate']].values[1]
print(u, p, r)
rhat = tf.gather(result_flatten ,int(u) * tf.shape(result)[1] + int(p))
print ("rating for user " + str(u) + " for item " + str(p) + 
      " is " + str(r) + " and our prediction is: " + str(sess.run(rhat)))

1537.0 1.0 1.0
rating for user 1537.0 for item 1.0 is 1.0 and our prediction is: -3.5079765


In [24]:
#accuracy
df_test = df[~msk]
user_indecies_test = [x for x in df_test.user.values]
item_indecies_test = [x for x in df_test.item.values]
rates_test = df_test.rate.values
R_test = tf.gather(result_flatten, user_indecies_test * tf.shape(result)[1] + item_indecies_test, name='extracting_user_rate_test')
diff_op_test = tf.subtract(R_test, rates_test, name='test_diff')
diff_op_abs_test = tf.abs(diff_op_test, name="abs_difference_test")

error = tf.div(tf.reduce_sum(diff_op_abs_test), df_test.shape[0], name="average_error")

print (sess.run(error))

6.56669
