In [1]:
#import libararies

import numpy as np
import pandas as pd

In [2]:
items_df = pd.read_csv('data/itemdata.csv',usecols=['itemId','item'],dtype={'itemId': 'int32', 'item': 'str'})
rating_df = pd.read_csv('data/ratings.csv',usecols=['userId', 'item', 'rating'],
    dtype={'userId': 'str', 'item': 'str', 'rating':'float32'})

In [3]:
items_df.head()

Unnamed: 0,itemId,item
0,1,apple
1,2,banana
2,3,bread
3,4,coffee
4,5,cream


In [4]:
rating_df

Unnamed: 0,userId,item,rating
0,0I9ZfPXjTud0C2ovAmQKYa41dRa2,milk,10.0
1,0I9ZfPXjTud0C2ovAmQKYa41dRa2,apple,2.0
2,0I9ZfPXjTud0C2ovAmQKYa41dRa2,bread,6.0
3,0I9ZfPXjTud0C2ovAmQKYa41dRa2,banana,4.0
4,0I9ZfPXjTud0C2ovAmQKYa41dRa2,coffee,2.0
5,CUZ7QpBGv7UUNLiWzxV7,cream,5.0
6,CUZ7QpBGv7UUNLiWzxV7,coffee,5.0
7,CUZ7QpBGv7UUNLiWzxV7,milk,7.5
8,CUZ7QpBGv7UUNLiWzxV7,bread,7.5
9,g9MoGb4BWUSqcJnfpeJr,milk,5.0


In [5]:
df = pd.merge(rating_df,items_df,on='item')
df.head()

Unnamed: 0,userId,item,rating,itemId
0,0I9ZfPXjTud0C2ovAmQKYa41dRa2,milk,10.0,6
1,CUZ7QpBGv7UUNLiWzxV7,milk,7.5,6
2,g9MoGb4BWUSqcJnfpeJr,milk,5.0,6
3,0I9ZfPXjTud0C2ovAmQKYa41dRa2,apple,2.0,1
4,0I9ZfPXjTud0C2ovAmQKYa41dRa2,bread,6.0,3


In [6]:
combine_item_rating = df.dropna(axis = 0, subset = ['item'])
item_ratingCount = (combine_item_rating.
     groupby(by = ['item'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['item', 'totalRatingCount']]
    )
item_ratingCount.head()



Unnamed: 0,item,totalRatingCount
0,apple,1
1,banana,1
2,bread,3
3,coffee,3
4,cream,2


In [7]:
rating_with_totalRatingCount = combine_item_rating.merge(item_ratingCount, left_on = 'item', right_on = 'item', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userId,item,rating,itemId,totalRatingCount
0,0I9ZfPXjTud0C2ovAmQKYa41dRa2,milk,10.0,6,3
1,CUZ7QpBGv7UUNLiWzxV7,milk,7.5,6,3
2,g9MoGb4BWUSqcJnfpeJr,milk,5.0,6,3
3,0I9ZfPXjTud0C2ovAmQKYa41dRa2,apple,2.0,1,1
4,0I9ZfPXjTud0C2ovAmQKYa41dRa2,bread,6.0,3,3


In [8]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(item_ratingCount['totalRatingCount'].describe())

count   7.000
mean    2.000
std     1.000
min     1.000
25%     1.000
50%     2.000
75%     3.000
max     3.000
Name: totalRatingCount, dtype: float64


In [9]:
popularity_threshold = 2
rating_popular_item= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_item.head()

Unnamed: 0,userId,item,rating,itemId,totalRatingCount
0,0I9ZfPXjTud0C2ovAmQKYa41dRa2,milk,10.0,6,3
1,CUZ7QpBGv7UUNLiWzxV7,milk,7.5,6,3
2,g9MoGb4BWUSqcJnfpeJr,milk,5.0,6,3
4,0I9ZfPXjTud0C2ovAmQKYa41dRa2,bread,6.0,3,3
5,CUZ7QpBGv7UUNLiWzxV7,bread,7.5,3,3


In [10]:
rating_popular_item.shape

(11, 5)

In [18]:
## First lets create a Pivot matrix

item_features_df=rating_popular_item.pivot_table(index='item',columns='userId',values='rating').fillna(0)
item_features_df.head()

userId,0I9ZfPXjTud0C2ovAmQKYa41dRa2,CUZ7QpBGv7UUNLiWzxV7,g9MoGb4BWUSqcJnfpeJr
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bread,6.0,7.5,2.5
coffee,2.0,5.0,2.5
cream,0.0,5.0,2.5
milk,10.0,7.5,5.0


In [12]:
from scipy.sparse import csr_matrix

item_features_df_matrix = csr_matrix(item_features_df.values)

from sklearn.neighbors import NearestNeighbors


model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(item_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [13]:
item_features_df.shape

(4, 3)

In [28]:
query_index = np.random.choice(item_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(item_features_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 2)

2


In [33]:
item_features_df.head()

userId,0I9ZfPXjTud0C2ovAmQKYa41dRa2,CUZ7QpBGv7UUNLiWzxV7,g9MoGb4BWUSqcJnfpeJr
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bread,6.0,7.5,2.5
coffee,2.0,5.0,2.5
cream,0.0,5.0,2.5
milk,10.0,7.5,5.0


In [30]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(item_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, item_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for cream:

1: coffee, with distance of 0.05844557285308838:
