In [1]:
import pandas as pd
import numpy as np

import pickle

from surprise import Reader
from surprise import Dataset

from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.accuracy import rmse
from surprise.similarities import cosine

from scipy.sparse.linalg import svds

from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.width', 5000)
pd.set_option('display.max_rows', 500)     #ease of viewing
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_colwidth', 500)

## 2.0 Recommended System

In [2]:
#Please refer to 'Preparing for RecSys.ipynb' to find out how svd_df was derived
svd_df = pd.read_csv('../data/svd_df.csv')
svd_df.drop(labels=['Unnamed: 0'],axis=1,inplace=True)

In [3]:
svd_df.shape

(5612, 3)

In [4]:
svd_df.sort_values(by='UserId').head()

Unnamed: 0,UserId,ProductId,hybrid_score
3131,A1007PT85CIPMD,B0009ETA76,0.010526
5051,A1007PT85CIPMD,B0009F3POY,0.62322
3369,A100UZGZNZ9ZYN,B002ANA9QA,0.35241
2840,A100WO06OQR8BQ,B002LANN56,0.43845
2614,A100WO06OQR8BQ,B005CUU25G,0.04076


In [5]:
svd_df.hybrid_score.describe()

count    5612.000000
mean        0.157657
std         0.201277
min        -0.948285
25%         0.038012
50%         0.148079
75%         0.276674
max         0.985700
Name: hybrid_score, dtype: float64

In [6]:
reader = Reader(rating_scale=(-1, 1)) #values may range from -1 to 1. 'higher value = more positive' and vice versa.
data = Dataset.load_from_df(svd_df, reader)

## 2.1 Surprise! Package

#### 2.1.1 Benchmark

We use Surprise! to understand which model gives a better RMSE score and which hyper parameter(s) work best for our data.

In [7]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), 
                  KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=True)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2151  0.2148  0.2135  0.2145  0.0007  
Fit time          0.34    0.25    0.25    0.28    0.04    
Test time         0.03    0.01    0.02    0.02    0.01    
Evaluating RMSE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2079  0.2072  0.2096  0.2083  0.0010  
Fit time          0.55    0.66    0.68    0.63    0.06    
Test time         0.03    0.03    0.03    0.03    0.00    
Evaluating RMSE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2591  0.2576  0.2611  0.2593  0.0014  
Fit time          0.12    0.13    0.14    0.13    0.01    
Test time         0.01    0.02    0.02    0.02    0.00    
Evaluating RMSE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.3366  0.33

In [8]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.201594,0.019773,0.015735
KNNBaseline,0.204379,0.12381,0.022764
KNNBasic,0.205602,0.084064,0.021613
SVDpp,0.208263,0.632289,0.028695
SVD,0.214488,0.279891,0.019902
KNNWithMeans,0.238938,0.153604,0.038101
KNNWithZScore,0.239927,0.213899,0.030312
SlopeOne,0.259255,0.129996,0.016423
NormalPredictor,0.283871,0.011948,0.023166
NMF,0.337637,0.695774,0.017174


#### 2.1.2 Optimising SVD Algorithm Hyperparameters 

In [9]:
trainset, testset = train_test_split(data, test_size=.20, random_state=42)

In [10]:
%%time

param_grid = {'n_factors': [120, 140, 150], 
              'n_epochs': [10, 20, 30, 50], 
              'lr_all': [0.002, 0.003, 0.004],
              'reg_all': [0.02, 0.05, 0.08, 0.1]
             }

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1, joblib_verbose=1)

gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score     
print(gs.best_params['rmse'])

algo = gs.best_estimator['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   55.5s


0.21414865876963382
{'n_factors': 120, 'n_epochs': 20, 'lr_all': 0.003, 'reg_all': 0.08}
CPU times: user 1min 23s, sys: 614 ms, total: 1min 23s
Wall time: 2min 14s


[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  2.2min finished


In [11]:
#table of best parameters and best rmse score
results_df = pd.DataFrame.from_dict(gs.cv_results) 
results_df.sort_values(by='rank_test_rmse', ascending=True).head(1)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_lr_all,param_reg_all
18,0.214282,0.214524,0.21364,0.214149,0.000373,1,0.469042,0.018747,0.031573,0.010334,"{'n_factors': 120, 'n_epochs': 20, 'lr_all': 0.003, 'reg_all': 0.08}",120,20,0.003,0.08


In [12]:
#saving gs.best_estimator['rmse'] into a pickle file

svd_algo = open("svd_algo.pickle","wb") 
pickle.dump(algo, svd_algo)
svd_algo.close()

In [13]:
#loading gs.best_estimator['rmse'] pickle file

#best_est = open("svd_algo.pickle",'rb') 
#svd = pickle.load(best_est)
#best_est.close()

In [14]:
# Algorithm on trainset and testset
algo.fit(trainset)
test_pred = algo.test(testset)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)

SVD : Test Set
RMSE: 0.2097


0.20974813009855456

#### 2.1.3 Surprise! Predictions on data

In [15]:
pred_df = pd.DataFrame(test_pred)                    #r_ui = original hybrid_score
pred_df.drop(labels='details',axis=1,inplace=True)   #est = estimated score with SVD algo

In [16]:
pred_df['error'] = (pred_df['est'] - pred_df['r_ui'])
pred_df1 = pred_df[pred_df['error'].between(-0.01, 0.01, inclusive=True)] 

In [17]:
best_predictions = pred_df1.sort_values(by='error')[:10]
worst_predictions = pred_df.sort_values(by='error')[-10:]

In [18]:
best_predictions

Unnamed: 0,uid,iid,r_ui,est,error
377,AH9AUW175XCVP,B004ZIER34,0.249977,0.240348,-0.009629
361,A2K3KOPUI64TT4,B001LGGH2C,0.124352,0.115094,-0.009259
1001,AC6LV5D2RKTIR,B000GFYRIU,0.15926,0.150659,-0.008601
533,A1E80QJ5PPLC7N,B00061MVTG,0.161841,0.153664,-0.008177
809,A3OXRFCJI67IMN,B004342XH2,0.180819,0.173312,-0.007507
703,A2UJL998OQ2COG,B002KADFNC,0.13032,0.123734,-0.006586
424,A1Z54EM24Y40LL,B0012KH08M,0.167851,0.162114,-0.005737
729,A3UKWQS8SRW6IO,B000EVSYI0,0.168393,0.164748,-0.003645
194,A28P0QPSXBJTN,B000V9PH4O,0.161957,0.158319,-0.003637
1101,A2KBFB6A2D7PNO,B000V6FU08,0.293625,0.290638,-0.002987


In [19]:
worst_predictions

Unnamed: 0,uid,iid,r_ui,est,error
661,A37MH7ICH80QOX,B0049YK1FC,-0.34314,0.248684,0.591824
496,A2KBFB6A2D7PNO,B000KAJ51U,-0.44425,0.158597,0.602847
38,A2DPYMNI2HCIOI,B003VIN0QE,-0.450399,0.174063,0.624462
629,A3NZVCL9N8CLHB,B0013NUGDE,-0.490374,0.135604,0.625978
263,A2PNOU7NXB1JE4,B001EO616S,-0.507173,0.142543,0.649717
972,A1G25CG7UWQ3XO,B000EDM7BI,-0.383355,0.273228,0.656583
651,APK4O7SCK6ARK,B000FA158Q,-0.403563,0.259417,0.662979
10,A4VMQ6ZTSXSSL,B0089SPENI,-0.507889,0.217061,0.724951
1047,AYNH2BHO8SO52,B003QNJYXM,-0.70884,0.139615,0.848455
430,A2S78HC3GA9W8M,B001TNW23U,-0.70047,0.158537,0.859007


In [20]:
#saving pred_df into pickle file

svdprediction = open("svd_predictions.pickle", "wb")
pickle.dump(pred_df, svdprediction)
svdprediction.close()

In [21]:
#loading pred_df

#svdprediction = open("svd_predictions.pickle",'rb')
#pred_df = pickle.load(svdprediction)
#svdprediction.close()

## 2.2 Matrix Factorisation - SVD

For Capstone part 1, we will use SVD as our main algorithm. In part 2, we will adopt BaselineOnly algorithm as it gives a lower RMSE score for our data.

In [22]:
svd_df_pivot = svd_df.pivot_table(index='UserId',columns='ProductId',values='hybrid_score').fillna(0)
svd_df_pivot_matrix = svd_df_pivot.as_matrix()

  


In [23]:
svd_df_pivot_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
print(svd_df_pivot.shape)
svd_df.nunique()

(2172, 2422)


UserId          2172
ProductId       2422
hybrid_score    5585
dtype: int64

In [25]:
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(svd_df_pivot_matrix, k = 120, maxiter = 20) #'{'n_factors': 120, 'n_epochs': 20, 'lr_all': 0.004, 'reg_all': 0.1}

In [26]:
print(U.shape)
print(Vt.shape)
sigma = np.diag(sigma)
print(sigma.shape)

(2172, 120)
(120, 2422)
(120, 120)


In [27]:
predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
cf_df = pd.DataFrame(predicted_ratings, index= svd_df_pivot.index, columns = svd_df_pivot.columns)

In [28]:
cf_df.head()

ProductId,B00004RAMX,B000084E6V,B000084ETV,B000084F1I,B000084F1Z,B00008DF91,B00008DFNV,B00008WUA9,B0000A0BS8,B0000AH3QW,B0000C69FB,B0000CDEPD,B0000CG4I0,B0000CNU1X,B0000CNU2Q,B0000D8DI0,B0000D9N59,B0000DCWWI,B0000DG86X,B0000DGF9V,B0000DJDJZ,B0000DJT3C,B0000E65WB,B0000EIEDS,B0000GH6U6,B0000GH6UQ,B0000GIVA0,B0000GIVDC,B0000TBK64,B0000TL6CC,B0000TLEEW,B0000VLTZY,B0000VLU0I,B0001217BS,B00012182G,B000121BY6,B00012OHZ6,B00013C2MA,B00013C2TS,B00013EWNM,B00014IVPQ,B0001590LO,B00015HOTE,B00015HOUS,B00016AU3K,B00016JGY4,B00016LA9I,B00016UX0K,B000173IHE,B00017LEXO,B00017LEY8,B00018CWLG,B00018CX06,B00018CX60,B000197ZQM,B0001BGU0C,B0001BGU3Y,B0001BVD04,B0001BVO9Y,B0001CXRLQ,...,B007FK3HHG,B007FRDXMI,B007H13SYA,B007HOWZJQ,B007I7Z3Z0,B007JBLLK6,B007JFMH96,B007K449CE,B007N04BY6,B007OSBEV0,B007OSBGOK,B007OXJJE4,B007OXJK3Y,B007OXJLM4,B007PA33MA,B007PA34DS,B007PE7ANY,B007POA2L6,B007POT6RM,B007R1PGVS,B007RJELUM,B007RTR89S,B007RTR8AC,B007RTR8TS,B007RTR9E2,B007TGDXMK,B007TGDXMU,B007TGDXNO,B007TGO1U8,B007TJGY46,B007TJGY4Q,B007TJGZ54,B007TJGZ5E,B007XXLWHW,B0080YLBTM,B00817GYZO,B0081XIA1E,B0085G4A7U,B0085G4ACA,B0085RVY0A,B0085V3YFO,B00866AM2G,B0087GH4US,B0089Q2AAA,B0089SPDUW,B0089SPENI,B008BLFCK8,B008C2JCUW,B008EG59KS,B008FHUDW0,B008O2EHNC,B008OV8RE8,B008QLRJH2,B008RWUKXK,B008Z4VAPM,B008ZRKZSM,B0090X8IPM,B0090X8JUG,B0092X7OGY,B0096EZHM2
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
A1007PT85CIPMD,1.900386e-08,1.094901e-06,0.005649932,0.000124912,5.731913e-06,-3.807087e-07,4.700181e-18,-0.0001445724,-0.000403327,-6.158736e-18,-0.001353531,1.261607e-05,-0.0002294644,-0.0001201503,1.411536e-05,-1.766223e-06,-0.0011128,-6.571366999999999e-19,3.3125609999999996e-19,6.932449e-05,0.0006993328,0.001798528,-1.464521e-05,-0.0003833315,-2.009398e-06,-0.001750652,-3.227368e-18,-0.002569856,-2.235948e-05,-6.896679e-17,-1.722288e-05,-5.221104e-08,-4.354803e-05,0.0001190601,-4.378709e-05,-0.0008163246,7.030277e-05,-6.176146e-06,7.239649e-06,2.396684e-05,-0.0002065322,-0.0005499148,-0.000801729,0.006816554,-0.001624297,-7.483201e-06,4.958336e-05,-6.293768e-06,0.0001030006,1.0441410000000001e-18,2.531055e-05,4.295069e-18,-5.687375e-07,9.239204000000001e-17,8.291795e-05,-8.750753e-05,-1.503251e-05,0.002892753,5.283136e-07,4.8348309999999996e-20,...,0.0001166905,6.44209e-05,2.113732e-06,6.05594e-18,-0.0043406,8.813724e-07,0.00074301,-0.0007248008,2.92914e-05,0.0003520016,1.889667e-05,0.00203294,0.005462327,-0.0005332642,-0.0003142666,-1.741159e-09,0.001283199,0.0001231279,-0.0004226071,-8.804645e-05,1.518069e-18,0.004745986,0.002270525,0.0008821839,-0.006204698,-0.005638213,0.0002758668,-0.000443512,-0.0001335547,-0.0001681903,0.00158576,-8.215199e-18,0.0003798408,0.0007016165,8.178852e-19,3.13337e-19,-9.263362e-08,-1.200638e-05,-1.400315e-18,0.0004010611,3.8062020000000004e-18,-2.461885e-08,0.01056206,-1.529069e-05,-0.004813079,0.000562378,0.0002714701,-1.524409e-06,-1.241066e-05,-0.002105232,-9.187853e-07,-1.000665e-05,-0.0001343785,-0.0003691994,1.751553e-06,-5.087143e-06,0.005275178,0.0004803073,-5.681667e-05,7.141231e-05
A100UZGZNZ9ZYN,-3.357044e-25,-6.113129e-25,1.081556e-20,-3.932414e-21,4.0427260000000003e-25,1.4175219999999999e-24,6.0198209999999995e-37,5.488328e-22,-1.421008e-21,1.486893e-36,-1.509011e-21,-1.134507e-23,2.0028370000000001e-22,-5.852211e-21,-9.876124e-22,-1.070346e-21,-2.360985e-21,4.269883e-37,7.821806e-38,-6.221856000000001e-23,1.1482179999999999e-20,1.1537469999999999e-20,4.895478e-22,-4.273568e-22,-7.335309000000001e-22,3.752862e-22,-5.493943e-37,2.303898e-20,-7.585463e-23,-5.045629e-36,4.762124000000001e-22,-1.0515419999999999e-26,-4.8550380000000005e-23,-1.924205e-22,8.035519000000001e-23,2.005331e-20,5.681548000000001e-23,1.582111e-21,1.767582e-21,-1.176483e-22,-2.007092e-22,-6.130833e-22,8.012878e-22,-2.749949e-21,7.833644e-21,-3.40448e-25,2.010269e-22,-5.965196e-24,-2.05887e-23,2.260617e-37,-2.911862e-21,-3.046235e-37,-2.4034090000000003e-23,-2.089873e-35,-3.236299e-22,5.695076000000001e-23,9.608874e-24,1.3050979999999999e-20,1.9335870000000002e-23,1.859369e-38,...,-5.70551e-23,5.457092e-24,-2.148668e-26,3.383157e-37,7.791638e-21,5.3471589999999996e-24,-2.231607e-20,-4.266081e-22,-1.11867e-21,5.516056e-22,-9.389277000000001e-23,2.359725e-21,4.478438e-22,5.006088000000001e-22,3.52971e-23,-1.671397e-25,1.308007e-21,-6.858385000000001e-22,-1.052872e-21,7.027439000000001e-23,1.3604399999999999e-36,-1.659893e-20,-2.1899919999999998e-20,-2.658939e-23,-6.971717e-21,3.644752e-21,-3.967007e-22,-1.34026e-21,2.806071e-21,-4.25987e-22,-2.066623e-21,-2.727553e-37,-2.9764479999999996e-20,-2.136697e-21,-3.304759e-37,-1.266074e-37,-9.828629e-28,3.363634e-22,6.257692e-38,1.1864429999999999e-20,4.604603e-37,-4.112831e-27,3.919191e-20,-1.359899e-23,6.0989829999999996e-21,1.968643e-20,-2.371781e-22,-5.612805e-23,5.399077000000001e-23,8.20831e-21,1.528429e-23,3.0474180000000004e-23,-7.515906000000001e-23,-3.4472029999999996e-20,-2.3567690000000002e-23,-1.1295220000000001e-23,2.9743389999999996e-20,-1.558337e-21,-2.767392e-21,-8.605625e-22
A100WO06OQR8BQ,-3.879052e-09,-2.069836e-07,-1.744426e-05,0.00039227,6.101085e-08,-1.250998e-06,1.752242e-18,-0.001003582,0.0001887492,9.818345e-19,-0.0001504203,-2.508176e-05,0.000112265,2.996395e-05,0.0002041479,-2.034555e-07,0.001267347,2.961401e-20,-1.375133e-19,-3.52601e-06,-0.001868561,0.0004599118,0.0001646492,-4.259982e-05,7.161586e-07,0.0007105212,-2.5197279999999997e-19,-8.705258e-05,1.901194e-05,1.910593e-17,0.0001647641,-1.37532e-09,-4.839571e-06,7.171403e-06,-2.379636e-05,7.251983e-05,-1.002888e-05,1.773874e-05,-1.60537e-05,1.331072e-05,0.0001157038,-6.111301e-05,-0.00205271,0.0002555298,0.0007650028,-1.300826e-08,3.993655e-06,7.055115e-07,0.0001321084,5.575401999999999e-19,0.01390597,-1.447641e-19,3.008892e-06,4.730042e-18,5.765201e-05,-9.616784e-07,-1.63399e-07,0.0002336746,-2.327247e-06,-1.597765e-20,...,4.871273e-05,-1.384903e-06,1.217142e-06,-8.866630999999999e-19,-0.0002336923,-9.483015e-07,0.01481911,0.001094844,-1.589088e-06,-4.854647e-06,5.507519e-05,-0.0009664632,0.0005732883,0.0001112115,7.683572e-06,6.220232e-12,-0.0003887287,3.882918e-06,-0.0001853939,6.657913e-06,8.952611e-19,-0.023992,0.0003833154,-0.002977269,-0.007541868,-0.001069771,2.713757e-05,0.0006859616,-0.002457889,0.0001305169,0.0004451963,-1.511003e-18,0.015565,-0.0001174175,-2.038956e-19,-7.811370999999999e-20,-3.244186e-08,-4.24234e-05,-7.280689e-20,0.0004563756,-2.9999579999999996e-19,-5.021922e-08,0.0001338797,1.794383e-06,0.0004973378,-0.002714104,-0.0003390724,1.612838e-07,-3.434686e-07,0.0006599756,-2.003439e-06,1.674641e-06,0.0002513733,-0.0008656775,9.151132e-06,-4.128744e-06,-0.007725908,-0.001096181,1.416935e-05,4.002942e-06
A103FPM7ABVMAW,-3.765871e-09,-2.842282e-07,0.0001668039,2.501437e-05,1.430053e-06,1.460817e-08,3.0348839999999997e-19,-7.632616e-05,0.0001417284,-6.426873e-18,-4.759758e-05,8.981306e-07,9.247538e-05,-2.151162e-05,1.245684e-05,1.233342e-07,-2.868289e-05,-5.780630999999999e-19,-1.641036e-23,8.138457e-06,-0.0009332565,-5.906532e-05,0.0006639844,-1.347994e-05,-1.07244e-07,8.111423e-05,6.458554999999999e-20,-0.0025783,5.424793e-06,-3.006097e-17,0.0006589956,-5.671022e-10,-1.531388e-06,2.582237e-06,1.025057e-05,0.0003088294,1.1798e-05,5.562539e-06,4.513254e-05,-1.569703e-06,3.799822e-05,-1.933802e-05,2.956603e-05,-0.0008054711,-0.000274039,-7.923951e-07,-5.537251e-06,-8.074502e-07,7.302295e-06,-2.829511e-19,-2.198382e-05,2.99995e-19,4.79103e-07,2.2742150000000002e-17,-4.993406e-05,1.738649e-06,2.940817e-07,0.001643006,-3.890211e-07,6.507292e-20,...,-1.267427e-05,1.75943e-05,-1.323011e-07,-6.056911e-19,0.0002278389,-1.546757e-06,0.0006120445,0.0003006875,-2.154979e-05,-6.214427e-05,1.035787e-05,-2.293203e-05,0.000153566,-0.0001370625,2.868275e-07,2.234092e-10,-0.0002019948,2.014055e-06,1.604079e-05,1.354002e-05,-4.582192e-18,0.001322462,4.62023e-05,-0.0008086664,0.0001886254,-0.0002587024,4.960306e-06,0.0001350227,-2.901113e-05,5.450197e-05,0.0009934282,5.596493e-18,-0.0001915311,4.404709e-06,-4.1453719999999994e-19,-1.5881179999999998e-19,4.279307e-09,8.4123e-05,1.2389389999999998e-19,0.000154049,-5.588442999999999e-19,-6.671814e-09,-0.002157781,-1.782126e-06,-0.001067245,-0.0001435628,2.816251e-05,-2.770719e-08,6.884509e-06,-0.0005171552,-2.897368e-06,-6.282121e-08,-7.995616e-05,3.10231e-05,1.540751e-06,7.864505e-07,-0.001578278,0.0001583576,-1.017241e-05,2.022832e-06
A106ZCP7RSXMRU,5.746364e-09,-2.835399e-08,-0.0002234826,1.667051e-05,4.242912e-08,2.780852e-08,1.5108829999999998e-19,-1.683586e-05,4.654259e-06,-5.289226e-20,-4.348669e-05,-5.163451e-07,4.274013e-06,-7.364817e-06,7.679168e-06,-6.160291e-08,1.865855e-05,4.7786149999999996e-20,5.930204e-22,-4.334844e-07,0.0001027055,-0.0001425695,1.898595e-05,-1.231582e-05,-9.05721e-09,4.743328e-05,1.679101e-20,5.275103e-05,-2.694056e-07,-5.469049999999999e-19,1.885454e-05,-8.253775e-10,-1.399125e-06,-2.272685e-06,-8.152115e-07,8.371815e-05,4.133859e-07,3.472655e-06,1.894889e-05,1.288396e-06,-0.0001013383,-1.766784e-05,-4.965791e-05,-0.0001834592,-0.000152538,-7.889572e-08,1.216848e-06,-7.167248e-07,1.71115e-06,-3.598458e-21,4.002333e-05,-8.069334e-22,2.109287e-07,5.747466e-19,4.293349e-06,-1.892611e-06,-3.228131e-07,-2.68054e-05,-1.535786e-07,4.314942e-22,...,-7.04012e-06,-6.858396e-07,-2.046718e-08,-2.970208e-21,7.221248e-05,9.980872e-08,0.0005045985,4.744336e-05,-1.526384e-05,3.503224e-06,-7.862019e-06,-1.554902e-05,6.826457e-05,9.922196e-06,6.197624e-07,4.264983e-12,-1.374312e-05,-5.475506e-07,0.0002139413,1.348458e-06,2.114302e-20,0.0004209674,-0.0001632513,-0.001329125,0.001762024,-7.410769e-05,-4.904789e-06,-0.0001054022,0.0001707499,7.093744e-07,-2.180179e-06,-3.302708e-20,-0.0008580654,-1.907296e-05,6.930622e-22,2.655165e-22,-2.394221e-11,2.161098e-05,-1.412367e-22,1.03008e-05,2.9125439999999997e-20,2.402313e-09,0.0001449354,-1.706156e-06,6.287407e-05,0.000203953,-1.31034e-05,2.828949e-07,2.144055e-06,-0.0001514139,-7.421204e-08,2.72024e-07,7.58479e-06,6.470809e-05,-1.692158e-06,3.463092e-07,-1.170343e-05,4.45152e-05,-3.482674e-06,3.955889e-06


In [29]:
cf_df.to_csv('../data/cf_df.csv')

## 2.3 Recommended Systems

#### 2.3.1 Content-Based Recommender

In [30]:
recsys_df = pd.read_csv('../data/recsys_df.csv')
recsys_df.drop(labels='Unnamed: 0',axis=1,inplace=True)

In [31]:
recsys_df.columns = ['UserId', 'ProductId', 'hybrid_score', 'product_name', 'CAT1', 'CAT2', 'CAT3', 'CAT4', 'CAT5', 'CAT6']

##### product similarity

In [32]:
product_similarity = cosine_similarity(cf_df.T)
product_similarity_df = pd.DataFrame(product_similarity, index = svd_df_pivot.columns, columns = svd_df_pivot.columns)

In [33]:
product_similarity_df.head(2)

ProductId,B00004RAMX,B000084E6V,B000084ETV,B000084F1I,B000084F1Z,B00008DF91,B00008DFNV,B00008WUA9,B0000A0BS8,B0000AH3QW,B0000C69FB,B0000CDEPD,B0000CG4I0,B0000CNU1X,B0000CNU2Q,B0000D8DI0,B0000D9N59,B0000DCWWI,B0000DG86X,B0000DGF9V,B0000DJDJZ,B0000DJT3C,B0000E65WB,B0000EIEDS,B0000GH6U6,B0000GH6UQ,B0000GIVA0,B0000GIVDC,B0000TBK64,B0000TL6CC,B0000TLEEW,B0000VLTZY,B0000VLU0I,B0001217BS,B00012182G,B000121BY6,B00012OHZ6,B00013C2MA,B00013C2TS,B00013EWNM,B00014IVPQ,B0001590LO,B00015HOTE,B00015HOUS,B00016AU3K,B00016JGY4,B00016LA9I,B00016UX0K,B000173IHE,B00017LEXO,B00017LEY8,B00018CWLG,B00018CX06,B00018CX60,B000197ZQM,B0001BGU0C,B0001BGU3Y,B0001BVD04,B0001BVO9Y,B0001CXRLQ,...,B007FK3HHG,B007FRDXMI,B007H13SYA,B007HOWZJQ,B007I7Z3Z0,B007JBLLK6,B007JFMH96,B007K449CE,B007N04BY6,B007OSBEV0,B007OSBGOK,B007OXJJE4,B007OXJK3Y,B007OXJLM4,B007PA33MA,B007PA34DS,B007PE7ANY,B007POA2L6,B007POT6RM,B007R1PGVS,B007RJELUM,B007RTR89S,B007RTR8AC,B007RTR8TS,B007RTR9E2,B007TGDXMK,B007TGDXMU,B007TGDXNO,B007TGO1U8,B007TJGY46,B007TJGY4Q,B007TJGZ54,B007TJGZ5E,B007XXLWHW,B0080YLBTM,B00817GYZO,B0081XIA1E,B0085G4A7U,B0085G4ACA,B0085RVY0A,B0085V3YFO,B00866AM2G,B0087GH4US,B0089Q2AAA,B0089SPDUW,B0089SPENI,B008BLFCK8,B008C2JCUW,B008EG59KS,B008FHUDW0,B008O2EHNC,B008OV8RE8,B008QLRJH2,B008RWUKXK,B008Z4VAPM,B008ZRKZSM,B0090X8IPM,B0090X8JUG,B0092X7OGY,B0096EZHM2
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
B00004RAMX,1.0,-0.031951,-2e-05,-0.000368,0.005641,0.000494,-0.219149,0.007424,-0.004446,0.013934,-0.001174,-0.009274,-0.006115,-0.000561,-0.000279,4.465584e-06,0.002859,-0.151254,0.054056,-0.003932,-0.005157,0.001137,0.0367,-0.001174,-5.1e-05,0.000387,0.064956,0.000639,0.00161,0.027193,0.037388,0.005227,-0.001174,-0.001277,-0.020008,0.000926,0.000369,0.59077,0.033533,-0.001046,-0.059003,-0.001174,0.004293,-0.016799,-0.000656,-0.001525,-0.009071,0.017697,0.000133,0.007663,0.000454,0.118007,0.004135,-0.048784,-0.000653,0.004262,0.004364,-0.003239,-0.004047,-0.021901,...,0.000309,-0.01011,-0.001714,-0.13847,0.000373,-0.003129,-0.01836,-0.003938,-0.040967,-0.017706,0.01267,0.00212,-0.002118,0.007648,-0.008462,2.4e-05,-0.017995,-0.002017,0.015313,0.007057,-0.038491,-0.016797,-0.006674,-0.013131,-0.004539,0.010171,-0.00231,0.010336,0.005415,0.127276,-0.006682,0.031386,-0.001891,-0.000447,-0.069654,-0.069654,0.02842,0.039831,-0.040179,0.002814,-0.061282,0.020466,-0.009271,0.017397,0.004393,0.002259,-0.001124,0.004241,0.000759,-9.7e-05,-0.064916,0.000447,0.012149,0.0009,-0.032263,-0.000744,-0.008421,-0.015459,-0.000561,-0.000464
B000084E6V,-0.031951,1.0,0.254623,0.023676,-0.294086,-0.161908,-0.256178,0.029892,0.131885,-0.020257,-0.018807,0.019466,0.191439,0.006755,0.017496,3.490694e-07,-0.22322,0.127922,-0.044983,0.086065,0.002074,-0.014151,-0.159112,-0.018807,0.000182,-0.018887,0.182008,-0.270882,0.21368,0.162097,-0.163526,0.057003,-0.018807,0.092891,0.019807,-0.250794,0.093431,0.001916,0.020161,-0.017179,0.062623,-0.018807,0.102156,0.020644,0.001845,-0.06908,0.011412,0.034791,-0.015039,-0.024117,-0.024534,0.111505,-0.015858,0.145858,0.12035,0.057351,0.058127,-0.134756,0.015773,-0.046581,...,-0.004567,0.122259,0.135985,0.252142,-0.000252,-0.109036,0.056684,-0.202764,-0.024288,-0.141977,-0.0668,-0.031268,-0.049098,0.07341,-0.089636,0.34402,-0.061237,-0.004906,0.004356,-0.679859,0.138701,0.103087,-0.004022,0.033572,-0.024568,0.063376,0.18032,-0.042594,-0.116751,0.035601,-0.022104,0.095124,-0.005305,0.316758,0.237358,0.237358,-0.029167,-0.009002,0.031228,-0.004846,-0.088573,0.015994,-0.072378,0.021197,0.056046,0.010948,0.067794,0.001304,-0.073529,-0.022917,-0.072526,-0.316758,0.150681,-0.006445,-0.014095,0.008088,-0.146154,0.129428,0.006755,-0.028697


In [34]:
product_similarity_df.to_csv('../data/product_similarity_df.csv')

In [35]:
#product_similarity_df = pd.read_csv('../data/product_similarity_df.csv')

In [36]:
content_based_df = recsys_df.set_index('ProductId')
#content_based_df1.index will be used for indices for our content based filtering
content_based_df1 = pd.pivot_table(content_based_df,index='ProductId',columns=['UserId'],values='hybrid_score').fillna(0)
content_based_df1.head()

UserId,A1007PT85CIPMD,A100UZGZNZ9ZYN,A100WO06OQR8BQ,A103FPM7ABVMAW,A106ZCP7RSXMRU,A1076UA29SK59D,A1080SE9X3ECK0,A109L3WXD1SJFU,A10H24TDLK2VDP,A10IKHRUSMKP46,A10LIGIT9EGCM9,A10PEXB6XAQ5XF,A10R9LB4QJNG5X,A10TYGME2FQHO7,A10U8DJAPJJI8I,A10XLFE3T83WQM,A10ZXUNZNUJY0Z,A1194J1H29WSV,A11A9AVEM5EVU4,A11E8ZT3WEMH2Y,A11ED8O95W2103,A11EIDY6DD40CS,A11KZ906QD08C5,A11OTLEDSW8ZXD,A11T807LX2EF00,A11XAIFA10G7TS,A1205T8NP2BQ5E,A121PLHXGZXXUJ,A124PSAV4UV3BX,A124URARVE9S89,A12DQZKRKTNF5E,A12ENBT314RFXR,A12IRGQLFE4EBA,A12MQA7IMXZ7JT,A12NM11F1CCN2O,A12O5IJUK0EHIU,A12R3YGEHW7D8G,A12Y0N1S2C3YAB,A12YPC3CGHLDO5,A130VGG4P4PW5J,A131S7JQCEPFOM,A1347KUESVCYZ,A135XHGMBR0OWF,A137F1PRW4SB2Z,A13853O9CBLTEY,A13HRSMJ5TOWEZ,A13J10QRUKSLSL,A13K3ZLWAWN1EI,A13NTM92VE1U2Y,A13S959ZBAOU53,A13T0V3LHOTHDL,A144LF2QWLG1ZL,A14738H3YYX7ZC,A147FUNITGB21I,A149XXYGR6WKS9,A14BAM6KBGBWJ2,A14DV28G9OCFL0,A14EF1PPKMSEPU,A14ENWEKTHCBXR,A14HZ5EMD2WCG,...,AW7BIYHXUIZ62,AWAB7PKBO3BBT,AWBGHDHH7E51F,AWBMGLP57SAGK,AWGXF4XREHKBR,AWHZ4K1IXPFRZ,AWKZAUC0D8DYL,AWLK6NSSV0YNA,AWMZ9VHF1Q9PI,AWNSQQJ44NPBT,AWNV3TK4FNF45,AWPODHOB4GFWL,AWZR0O65DL2Q,AX0XNE6IX7N3M,AX1SE25U7P6I8,AX5JZLRL9KN9B,AX7QMRXX81L9K,AX9QZGAJOZ96O,AXC8TDCIET6LC,AXHTH0EL75SOJ,AXJGCAD36N915,AXJYL607ABWIB,AXO4PQU0XG3TG,AXQIHSF9KK7CO,AXQNEMI9N0Z2D,AXRJWP1UXPEBB,AXU3VKZE848IY,AXV5CT7AG4SYO,AXVNVV5VH5XZY,AXXWXM6K66YMZ,AY0WPNYO66YAA,AY12DBB0U420B,AY1EF0GOH80EK,AY1L1H0MUMAMC,AY1YNN6PAYNW9,AY3XPKRAMKKY7,AY54QSGO3KWEM,AY889QQ9SMKMB,AYB4ELCS5AM8P,AYDS27E60FH0A,AYGEP8I4BQ3CK,AYGIIQGSHKZNI,AYGJ96W5KQMUJ,AYHHNMEJ271NL,AYNAH993VDECT,AYNH2BHO8SO52,AYOMAHLWRQHUG,AYQWJUNE09ZWE,AYWHCM0TJ4737,AYWPUWMMWS40Y,AYWUHB7N8XGZQ,AZ5X928CQPRJN,AZBZ6AMM3Z492,AZM22KBPUN0BH,AZMTHQIU02OGB,AZNSBRQ0DS8LK,AZV26LP92E6WU,AZWRZZAMX90VT,AZXON596A1VXC,AZZFJQFHITBZ5
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
B00004RAMX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000084E6V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000084ETV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000084F1I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B000084F1Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# create a Series for products so they are associated to an ordered numerical
indices = pd.Series(content_based_df1.index)

# function gives top 10 products in a dataframe with its details
# based on cosine similarity scores
def content_based_recommender(pid, product_similarity = product_similarity):
    
    # initializing an empty list 
    recommended_products = []
    
    # getting the index of product that matches the product id
    idx = indices[indices == pid].index[0]

    # create a Series with similarity scores in highest similarity scores above
    sim_score = pd.Series(product_similarity[idx]).sort_values(ascending = False)

    # getting the indexes the 10 most similar products
    top_10_indexes = list(sim_score.iloc[1:11].index)
    
    # populating the list with 10 most similar products
    for i in top_10_indexes:
        recommended_products.append(list(content_based_df.index)[i])
    
    #create dataframe to merge productid details
    a = pd.DataFrame(recommended_products)
    a.columns = ['ProductId']
    
    b = a.merge(content_based_df, on='ProductId',how='right')
    b_without_duplicates = b[b.ProductId.duplicated()]
    b_without_duplicates.drop_duplicates(subset=['ProductId'], keep='first',inplace=True)
    
    return b_without_duplicates.sort_values(by=['hybrid_score','CAT2', 'CAT3', 'CAT4', 'CAT5', 'CAT6'], ascending=False).head(10)

In [38]:
content_based_recommender('B000FCI6JU')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ProductId,UserId,hybrid_score,product_name,CAT1,CAT2,CAT3,CAT4,CAT5,CAT6
2746,B000FCI6JU,A25V7KAUQZ1W1W,0.84708,"Star Kay White Pure Chocolate Extracts #18, 4 Ounce",Grocery & Gourmet Food','Pantry Staples','Cooking & Baking','Extracts & Flavoring,-,-
369,B005LMLXN0,A227OTYEQY6VX5,0.772833,"Runa Amazon Guayusa Traditional Tea, 1.5 Ounce (Pack of 4)",Grocery & Gourmet Food','Beverages','Coffee,Tea & Cocoa','Tea','Black
4848,B00139E43Q,A1JFXOFDRZ9CFQ,0.723887,"Newman's Own Organics Mints, Wintergreen, 4-Count, 3-Ounce Packages (Pack of 6)",Grocery & Gourmet Food','Candy & Chocolate','Mints,-,-,-
3817,B001E5E2GS,A1Y6YLMUA88TV7,0.69377,"Schar Naturally Gluten-Free Spaghetti, 12-Ounce Packages (Pack of 5)",Grocery & Gourmet Food','Pantry Staples','Pasta & Noodles','Pasta','Spaghetti,-
176,B000ED9LDU,AEWYUPCNDV7HY,0.686509,"Bob's Red Mill Organic Whole Golden Flaxseed, 24 Oz (4 Pack)",Grocery & Gourmet Food','Pantry Staples','Herbs,Spices & Seasonings','Single Herbs & Spices','Flax Seed
4932,B0012NUVN0,A397PBM83MO7UD,0.68048,StarMark Everlasting Treat Ball,Pet Supplies','Dogs','Toys','Balls,-,-
496,B005C3IVN8,A2W9B725TZBXOX,0.67984,"Anderson's Pure Maple Syrup, Grade A Very Dark/Grade B, 32 Ounce (Frustration Free Packaging)",Grocery & Gourmet Food','Pantry Staples','Cooking & Baking','Syrups,Sugars & Sweeteners','Maple Syrup
5600,B000V9PH3A,AL5VAC89VKZ97,0.6713,"Lipton White Tea Pyramids, Peach Mango 18 ct (Pack of 6)",Grocery & Gourmet Food','Beverages','Coffee,Tea & Cocoa','Tea','Tea Samplers
303,B001EQ5EJQ,A1IW9LSLZFW9FK,0.661567,"La Tourangelle, Roasted Walnut Oil, 16.9 Ounce Cans (Pack of 3)",Grocery & Gourmet Food','Pantry Staples','Cooking & Baking','Cooking Oils,Vinegars & Sprays','Oils'
2182,B0018KR8V0,A2SZWFJX0783BF,0.6434,"Larabar Gluten Free Bar, Key Lime Pie, 1.8 oz Bars (16 Count), Whole Food Gluten Free Bars, Dairy Free Snacks",Health & Household','Sports Nutrition','Nutrition Bars,-,-,-


In [39]:
content_based_df[content_based_df.index == 'B005C3IVN8'].sort_values(by='hybrid_score',ascending=False).head(1)

Unnamed: 0_level_0,UserId,hybrid_score,product_name,CAT1,CAT2,CAT3,CAT4,CAT5,CAT6
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
B005C3IVN8,A2W9B725TZBXOX,0.67984,"Anderson's Pure Maple Syrup, Grade A Very Dark/Grade B, 32 Ounce (Frustration Free Packaging)",Grocery & Gourmet Food','Pantry Staples','Cooking & Baking','Syrups,Sugars & Sweeteners','Maple Syrup


Our content based recommender is recommending based on cosine similarity scores between products. However, it is not doing the best job at recommending at the moment as our maple syrup product is receiving recommendations that are different from its category. I will try to do a collaborative filter to see if we can get better results.

#### 2.3.2 Collaborative Filtering

In [40]:
#cf_df = pd.read_csv('../data/cf_df.csv')
#cf_df.drop_index(inplace=True)

In [41]:
recsys_df.head(1)

Unnamed: 0,UserId,ProductId,hybrid_score,product_name,CAT1,CAT2,CAT3,CAT4,CAT5,CAT6
0,AOVROBZ8BNTP7,B001EO5QW8,0.494453,"McCANN'S Instant Irish Oatmeal, Variety Pack of Regular, Apples & Cinnamon, and Maple & Brown Sugar, 10-Count Boxes (Pack of 6)",Grocery & Gourmet Food','Breakfast Foods','Cereals','Oatmeal,-,-


In [42]:
#User-based collaborative filtering
cf_df1 = cf_df.T

In [43]:
recsys_df1 = recsys_df[['ProductId', 'product_name', 'CAT1', 'CAT2', 'CAT3', 'CAT4', 'CAT5', 'CAT6']]

In [44]:
class CFRecommender:
    
    MODEL_NAME = 'User-Based Collaborative Filtering'
        
    def __init__(self, cf_df1, recsys_df=None):
        self.cf_df1 = cf_df1
        self.recsys_df1 = recsys_df1
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend(self, user_id, topn=10, verbose=False):
        # Get and sort the user's predictions from previous step
        sorted_user_predictions = self.cf_df1[user_id].sort_values(ascending=False)\
                                   .reset_index().rename(columns={user_id: 'score'})

        recommendations_df = sorted_user_predictions.sort_values('score', ascending = False) \
                               .head(topn)
                               
        return recommendations_df
    
cf_recommender_model_user = CFRecommender(cf_df1, recsys_df1)

In [45]:
#enter a user id to get predicted scores of products
cf_recommender_model_user.recommend('A2WR1BNMZ03BKH')

Unnamed: 0,ProductId,score
0,B000084ETV,0.062839
1,B000I05WLY,0.02337
2,B000IMSSHM,0.014789
3,B004T80BYE,0.014605
4,B000YV7WBU,0.007578
5,B000IXUKLS,0.007531
6,B0009X2A60,0.004202
7,B000W5SLEU,0.004032
8,B0006U3NIU,0.003574
9,B001FA1KJO,0.002557


In [46]:
def recommender(pid):
        a = cf_recommender_model_user.recommend(pid)
        b = pd.merge(a, recsys_df1, on='ProductId',how='right')
        b.drop_duplicates(inplace=True,keep='first')
        b.sort_values(by=['score','CAT2','CAT3','CAT4','CAT5','CAT6'],ascending=False)
        return b.head(10)

In [47]:
recommender('A100UZGZNZ9ZYN')

Unnamed: 0,ProductId,score,product_name,CAT1,CAT2,CAT3,CAT4,CAT5,CAT6
0,B000VMBE8E,5.861885e-20,"Late July Organic Dark Chocolate Sandwich Cookies, 8.2-Ounce Boxes (Pack of 6)",Grocery & Gourmet Food','Pantry Staples','Herbs,Spices & Seasonings','Single Herbs & Spices,-
13,B005GRCWDU,4.936107e-20,Lavazza Crema e Gusto Espresso (Pack of 5),Grocery & Gourmet Food','Beverages','Coffee,Tea & Cocoa','Coffee','Ground Coffee
22,B001CGTN1I,4.4508739999999997e-20,"Navitas Organics Chia Seeds, 16 oz. Bag — Organic, Non-GMO, Gluten-Free",Grocery & Gourmet Food','Pantry Staples','Cooking & Baking','Nuts & Seeds','Chia Seeds,-
30,B004ZIER34,3.988797e-20,"Puroast Low Acid Coffee French Roast, 2-Ounce Bag (Pack of 10)",Grocery & Gourmet Food','Beverages','Coffee,Tea & Cocoa','Coffee','Roasted Coffee Beans
61,B0018KR8V0,3.9320669999999997e-20,"Larabar Gluten Free Bar, Key Lime Pie, 1.8 oz Bars (16 Count), Whole Food Gluten Free Bars, Dairy Free Snacks",Health & Household','Sports Nutrition','Nutrition Bars,-,-,-
79,B0087GH4US,3.919191e-20,"Starbucks Veranda Blend Blonde, K-Cup for Keurig Brewers, 160 Count",Grocery & Gourmet Food','Beverages','Coffee,Tea & Cocoa','Coffee','Single-Serve Capsules & Pods
84,B00113WU0S,3.416991e-20,"Edward & Sons Miso Cup Japanese Restaurant Style, 2.9 Ounce Pouches (Pack of 6)",Grocery & Gourmet Food','Pantry Staples','Soups,Stocks & Broths','Miso Soups,-
94,B0090X8IPM,2.9743389999999996e-20,"Starbucks Natural Fusions Vanilla Ground Coffee, 11 Ounce (Pack of 6)",Grocery & Gourmet Food','Beverages','Coffee,Tea & Cocoa','Coffee','Ground Coffee
132,B0019ZHZYO,2.7653159999999997e-20,"A Taste of Thai Curry Paste, Red, 2-Pound 3-Ounce Tub",Grocery & Gourmet Food','Pantry Staples','Sauces,Gravies & Marinades','Sauces','Asian'
135,B0045TEG2K,2.6139039999999997e-20,"Pillsbury Moist Supreme Sugar Free Devil's Food Cake Mix, 16 Ounces (Pack of 6)",Grocery & Gourmet Food','Pantry Staples','Cooking & Baking','Baking Mixes','Cakes,-


In [48]:
class CFRecommender:
    
    MODEL_NAME = 'Item-Based Collaborative Filtering'
        
    def __init__(self, cf_df, recsys_df=None):
        self.cf_df = cf_df
        self.recsys_df = recsys_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend(self, item_id, topn=10, verbose=False):
        # Get and sort the user's predictions from previous step
        sorted_user_predictions = self.cf_df[item_id].sort_values(ascending=False)\
                                   .reset_index().rename(columns={item_id: 'hybrid_score'})

        recommendations_df = sorted_user_predictions\
                               .sort_values('hybrid_score', ascending = False) \
                               .head(topn)

        return recommendations_df
    
cf_recommender_model_item = CFRecommender(cf_df, recsys_df)

In [49]:
#product id will give predicted scores for different users
cf_recommender_model_item.recommend('B000084ETV', topn=10, verbose=True)

Unnamed: 0,UserId,hybrid_score
0,A383XURHVF8ON6,0.493229
1,A1KPALLWZ73M27,0.421358
2,A23C2OZ0H4UU,0.337366
3,A235XFQ4XRFBTW,0.188485
4,A1IX0NC997O0NS,0.142938
5,A3NMA7RSO2HMBG,0.131578
6,A1AES697PC2IW5,0.067646
7,A2WR1BNMZ03BKH,0.062839
8,A1S2GUIPPNHBJW,0.059403
9,A7EK88GF8N40F,0.057332


### Thank you for getting thus far, that's all for now ~