In [38]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
dataset = pd.read_csv('data/ratings_Digital_Music.csv', names=header)

In [3]:
dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,A2EFCYXHNK06IS,5555991584,5.0,978480000
1,A1WR23ER5HMAA9,5555991584,5.0,953424000
2,A2IR4Q0GPAFJKW,5555991584,4.0,1393545600
3,A2V0KUVAB9HSYO,5555991584,4.0,966124800
4,A1J0GL9HCA7ELW,5555991584,5.0,1007683200


In [4]:
df = dataset.head(10000)

In [5]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ( str(n_users) , ' users')
print ( str(n_items) , ' items')

8355  users
500  items


In [7]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)



In [9]:
ratings = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
ratings.head()

item_id,5555991584,6308051551,7901622466,B0000000ZW,B00000016T,B00000016W,B00000017R,B0000001BA,B0000001BO,B0000001O0,...,B0000011CU,B0000011GU,B0000011MD,B0000011N5,B0000011P7,B0000011WE,B0000011X5,B0000011XM,B00000127G,B0000012T3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A04345582HHNXEA9SAET4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A07465992K7FVSL53VA9B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A086594320Z777EHOFWKU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10175AMUHOQC4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
ratings.values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
sparsity = float(len(ratings.values.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print('Sparsity : ',100 - sparsity)

Sparsity :  99.7606223818073


In [28]:
train, test = cv.train_test_split(ratings.values, test_size=0.5)

In [29]:
print(train.shape)
print(test.shape)

(4177, 500)
(4178, 500)


In [31]:
train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
def slow_similarity(ratings, kind='user'):
    if kind == 'user':
        axmax = 0
        axmin = 1
    elif kind == 'item':
        axmax = 1
        axmin = 0
    sim = np.zeros((ratings.shape[axmax], ratings.shape[axmax]))
    for u in xrange(ratings.shape[axmax]):
        for uprime in range(ratings.shape[axmax]):
            rui_sqrd = 0.
            ruprimei_sqrd = 0.
            for i in range(ratings.shape[axmin]):
                sim[u, uprime] = ratings[u, i] * ratings[uprime, i]
                rui_sqrd += ratings[u, i] ** 2
                ruprimei_sqrd += ratings[uprime, i] ** 2
            sim[u, uprime] /= rui_sqrd * ruprimei_sqrd
    return sim

In [34]:
def fast_similarity(ratings, kind='user'):
    if kind == 'user':
        sim = ratings.dot(ratings.T)
    elif kind == 'item':
        sim = ratings.T.dot(ratings)
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return sim / norms / norms.T

In [35]:
fast_similarity(train, kind='user')

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [36]:
user_similarity = fast_similarity(train, kind='user')
item_similarity = fast_similarity(train, kind='item')
print (item_similarity[:4, :4])

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


  import sys


In [37]:
item_prediction = predict_fast_simple(train, item_similarity, kind='item')
user_prediction = predict_fast_simple(train, user_similarity, kind='user')

NameError: name 'predict_fast_simple' is not defined

In [39]:
# Create a Tensorflow graph
graph = tf.Graph()

In [41]:
nb_users = n_users
nb_products = n_items
nb_factors = 500
max_rating = 5
#nb_rated_products = 500
top_k_products = 10


In [42]:
with graph.as_default():
    # User-item matrix
    user_item_matrix = tf.placeholder(tf.float32, shape=(nb_users, nb_products))
    
    # SVD
    St, Ut, Vt = tf.svd(user_item_matrix)
    
    # Compute reduced matrices
    Sk = tf.diag(St)[0:nb_factors, 0:nb_factors]
    Uk = Ut[:, 0:nb_factors]
    Vk = Vt[0:nb_factors, :]
    
    # Compute Su and Si
    Su = tf.matmul(Uk, tf.sqrt(Sk))
    Si = tf.matmul(tf.sqrt(Sk), Vk)
    
    # Compute user ratings
    ratings_t = tf.matmul(Su, Si)
    
    # Pick top k suggestions
    best_ratings_t, best_items_t = tf.nn.top_k(ratings_t, top_k_products)

In [43]:
# Create Tensorflow session
session = tf.InteractiveSession(graph=graph)

In [44]:
# Compute the top k suggestions for all users
feed_dict = {
    user_item_matrix: ratings.values
}

In [45]:
best_items = session.run([best_items_t], feed_dict=feed_dict)

In [46]:
best_items

[array([[426, 424, 416, ..., 200, 265, 394],
        [401, 400, 382, ..., 421,  66, 255],
        [488, 337,  42, ..., 204, 180, 377],
        ...,
        [455, 146,  76, ...,  26,  67,  64],
        [150, 256, 237, ..., 243, 177,  91],
        [394, 389, 312, ..., 457, 325,  66]], dtype=int32)]

In [47]:
# Suggestions for user 1000, 1010
for i in range(1000, 1010):
    print('User {}: {}'.format(i, best_items[0][i]))

User 1000: [384 390 344 391 246 437 393   4 370 307]
User 1001: [114  63 115  54 136 141 491  35 112 143]
User 1002: [210 206 208 172 177 142 137 213 143 149]
User 1003: [351 482 237 361 241 493 352 276 218 217]
User 1004: [463 461 470 469 467 473 378 439  45 438]
User 1005: [424 416 417 423 422   8 253 135 425  10]
User 1006: [395 253 201 206 263 222   4 252   6   9]
User 1007: [214 280 282 473 281 117 467  66 476 406]
User 1008: [458 478 102 183 391 431 424 194 191 328]
User 1009: [163 204  66 440 488 310  44 378 268 391]
