In [1]:
#import the reqired libraries
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import math
import json
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')



In [12]:
# Import the dataset
df = pd.read_csv('collaborative.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)
print(df.shape)

(1000000, 3)


In [13]:
df.columns

Index(['user-id', 'product-id', 'reviews'], dtype='object')

In [14]:
#Find the minimum and maximum ratings
print('Minimum rating is: %d' %(df.reviews.min()))
print('Maximum rating is: %d' %(df.reviews.max()))

Minimum rating is: 1
Maximum rating is: 5


In [15]:
#Check for missing values
print('Number of missing values across columns: \n',df.isnull().sum())

Number of missing values across columns: 
 user-id       0
product-id    0
reviews       0
dtype: int64


In [16]:
# Number of unique user id  in the data
print('Number of unique users in Raw data = ', df['user-id'].nunique())
# Number of unique product id  in the data
print('Number of unique product in Raw data = ', df['product-id'].nunique())

Number of unique users in Raw data =  5001
Number of unique product in Raw data =  44424


In [17]:
#Check the top 10 users based on ratings
most_rated=df.groupby('user-id').size().sort_values(ascending=False)[:10]
print('Top 10 users based on ratings: \n',most_rated)

Top 10 users based on ratings: 
 user-id
2597    258
1814    255
580     253
3422    251
234     251
742     247
3486    246
686     246
3868    246
4639    243
dtype: int64


In [18]:
counts=df['user-id'].value_counts()
df_final=df[df['user-id'].isin(counts[counts>=15].index)]
print('Number of users who have rated 25 or more items =', len(df_final))
print('Number of unique users in the final data = ', df_final['user-id'].nunique())
print('Number of unique products in the final data = ', df_final['product-id'].nunique())

Number of users who have rated 25 or more items = 1000000
Number of unique users in the final data =  5001
Number of unique products in the final data =  44424


In [19]:
#constructing the pivot table
final_ratings_matrix = df_final.pivot(index = 'user-id', columns ='product-id', values = 'reviews').fillna(0)
final_ratings_matrix

product-id,1163,1164,1165,1525,1526,1528,1529,1530,1531,1532,...,59990,59991,59992,59993,59994,59995,59996,59998,59999,60000
user-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

Shape of final_ratings_matrix:  (5001, 44424)


In [21]:
#Split the data randomnly into train and test datasets into 70:30 ratio
train_data, test_data = train_test_split(df_final, test_size = 0.3, random_state=0)
train_data.head()

Unnamed: 0,user-id,product-id,reviews
823666,4118,33155,5
70066,352,46839,5
591943,2963,46293,1
578645,2896,8612,5
675461,3379,42804,1


In [11]:
#define user index from 0 to 10
final_ratings_matrix['user_index'] = np.arange(0, final_ratings_matrix.shape[0], 1)
final_ratings_matrix.head(20)

product-id,1163,1164,1165,1525,1526,1528,1529,1530,1531,1532,...,59991,59992,59993,59994,59995,59996,59998,59999,60000,user_index
user-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [30]:
#Singular Value Decomposition
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix
sparse_ratings_matrix = csc_matrix(final_ratings_matrix)
U, sigma, Vt = svds(sparse_ratings_matrix, k=10)

In [31]:
print('Left singular matrix: \n',U)

Left singular matrix: 
 [[-0.01384159  0.06745486 -0.00424561 ...  0.03013169 -0.01090254
  -0.01664181]
 [-0.00596262  0.00079281 -0.00635326 ... -0.00610051 -0.00091896
  -0.01311313]
 [ 0.01695084 -0.00817492 -0.04965002 ...  0.01150083 -0.01362015
  -0.01473207]
 ...
 [ 0.01650944 -0.01725929  0.0039299  ...  0.01225143 -0.00314817
  -0.0144938 ]
 [ 0.00624203 -0.0018889   0.02066474 ... -0.01972667  0.04402828
  -0.01656409]
 [ 0.0123787   0.00271687 -0.00236943 ...  0.00154919  0.00495563
  -0.01300628]]


In [32]:
print('Sigma: \n',sigma)

Sigma: 
 [ 74.90962739  74.92747857  74.97559005  75.06194712  75.09535655
  75.15749236  75.22385701  75.29729004  75.37009439 252.98133032]


* As sigma is not a diagonal matrix we have to convert it into diagonal matrix.

In [33]:
sigma = np.diag(sigma)
print('Diagonal matrix: \n',sigma)

Diagonal matrix: 
 [[ 74.90962739   0.           0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.          74.92747857   0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.          74.97559005   0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.          75.06194712   0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.           0.          75.09535655
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
   75.15749236   0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.          75.22385701   0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.      

In [34]:
print('Right singular matrix: \n',Vt)

Right singular matrix: 
 [[-0.00029506  0.00298607 -0.00784074 ...  0.0022173  -0.00154869
   0.01378408]
 [-0.0012417  -0.0033092   0.00257619 ... -0.00306135 -0.01061056
   0.01408908]
 [-0.00890914 -0.0097526  -0.00443921 ...  0.00047694 -0.00052661
  -0.00568996]
 ...
 [ 0.0008639   0.00151711 -0.00185423 ... -0.00289292  0.00636531
   0.00909746]
 [ 0.00124703  0.0042374   0.00824249 ... -0.00398968 -0.00612632
  -0.00165827]
 [-0.00291877 -0.00452402 -0.00588894 ... -0.0046548  -0.00549004
  -0.00580688]]


In [36]:
#Predicted ratings
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
# Convert predicted ratings to dataframe
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = final_ratings_matrix.columns)
preds_df.head()

product-id,1163,1164,1165,1525,1526,1528,1529,1530,1531,1532,...,59990,59991,59992,59993,59994,59995,59996,59998,59999,60000
0,0.002369,0.00018,0.050286,0.037185,-0.018834,-0.004535,0.028688,-0.010551,0.013437,0.053942,...,0.03807,-0.019708,0.005537,0.021396,-0.025644,0.060163,0.035595,0.016137,-0.027196,0.091866
1,0.013201,0.021952,0.027619,0.011277,0.007179,0.017955,0.017721,0.02465,0.006553,0.001934,...,0.00774,0.025137,0.025878,0.016251,0.007163,0.010656,0.016137,0.014119,0.000937,0.002788
2,0.047699,0.048362,0.05349,-0.056482,-0.002057,0.027183,-0.019709,0.044093,0.003786,0.017221,...,0.008884,-0.00429,0.022521,0.041112,-0.059246,0.029293,0.009955,0.014665,0.016368,0.053001
3,0.025099,0.021638,0.029895,-0.000555,-1.6e-05,0.022046,0.005338,0.033608,0.018466,0.007952,...,0.008855,0.02617,0.032033,0.022232,-0.020697,0.02591,0.012314,0.01229,0.033167,0.037106
4,0.011878,0.012362,0.006372,0.01968,0.006998,0.012123,0.010488,0.016375,0.018083,8.5e-05,...,0.01166,0.032143,0.029786,0.007511,0.010235,0.01037,0.018514,0.006294,0.028792,0.025321


In [37]:
def recommend_items(userID, pivot_df, preds_df, num_recommendations):
    # index starts at 0  
    user_idx = userID-1 
    # Get and sort the user's ratings
    sorted_user_ratings = final_ratings_matrix.iloc[user_idx].sort_values(ascending=False)
    #sorted_user_ratings
    sorted_user_predictions = preds_df.iloc[user_idx].sort_values(ascending=False)
    #sorted_user_predictions
    temp = pd.concat([sorted_user_ratings, sorted_user_predictions], axis=1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']
    temp = temp.loc[temp.user_ratings == 0]   
    temp = temp.sort_values('user_predictions', ascending=False)
    print('\nBelow are the recommended items for user(user_id = {}):\n'.format(userID))
    print(temp.head(num_recommendations))

In [38]:
userID = 7 
num_recommendations = 15
recommend_items(userID, final_ratings_matrix , preds_df, num_recommendations)


Below are the recommended items for user(user_id = 7):

                   user_ratings  user_predictions
Recommended Items                                
20876                       0.0          0.139153
9678                        0.0          0.136878
8390                        0.0          0.132240
4074                        0.0          0.124453
18816                       0.0          0.123671
49967                       0.0          0.120888
11578                       0.0          0.118301
32292                       0.0          0.116182
54077                       0.0          0.114793
41684                       0.0          0.113374
55458                       0.0          0.112249
33315                       0.0          0.112039
31658                       0.0          0.107913
14141                       0.0          0.107893
49494                       0.0          0.107386
