# Recommender System for E-commerce Using Collaborative Filtering

Import the Libraries

In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=2811591 sha256=3b5420c6175209e9202113b73e32533935b867c2f62a4f0074fbddc73828bb60
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error

Read the Input Files

In [None]:
# Folder holding the three JSON exports. The default is the location that was
# previously hard-coded in every call; change this single value to run the
# notebook against files stored elsewhere.
DATA_DIR = '/content'

# One DataFrame per input file.
ratings = pd.read_json(f'{DATA_DIR}/ratings.json')
customers = pd.read_json(f'{DATA_DIR}/customers.json')
products = pd.read_json(f'{DATA_DIR}/products.json')

In [None]:
ratings

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate
0,103416,619,1,2018/01/01 01:36:30
1,103654,411,1,2018/01/01 01:36:35
2,103954,298,3,2018/01/01 01:36:38
3,103672,361,5,2018/01/01 01:37:15
4,103960,536,5,2018/01/01 02:36:25
...,...,...,...,...
130749,103907,501,1,2022/03/16 22:25:10
130750,103907,200,1,2022/03/16 22:49:28
130751,103907,184,1,2022/03/16 22:53:35
130752,103907,211,1,2022/03/16 23:14:47


Clean the Data

In [None]:
# A customer can appear more than once for the same product. Flag every
# repeat of a (ProductID, CustomerID) pair after its first occurrence ...
duplicate_rows = ratings.duplicated(subset=['ProductID', 'CustomerID'], keep='first')

# ... and keep only the unflagged rows. Selecting with the boolean mask works
# row by row (it does not depend on index labels being unique) and builds a
# new frame instead of mutating `ratings` in place.
# NOTE(review): keep='first' retains the first-listed rating of each pair;
# confirm that this, rather than the most recent rating, is what is wanted.
ratings = ratings[~duplicate_rows]

In [None]:
# Renumber the rows 0..n-1; the previous index labels are discarded rather
# than kept as a column.
ratings = ratings.reset_index(drop=True)

In [None]:
# Ten customers with the most rows in `ratings` (number of ratings per customer).
ratings['CustomerID'].value_counts().head(10)

103996    635
103765    629
103367    620
103505    618
103907    606
103707    600
103834    598
103820    596
103758    594
103641    592
Name: CustomerID, dtype: int64

In [None]:
# Ten products with the most rows in `ratings` (number of ratings per product).
ratings['ProductID'].value_counts().head(10)

238    178
195    171
332    170
176    170
558    169
454    169
326    168
146    168
604    168
455    168
Name: ProductID, dtype: int64

# User-Based Collaborative Filtering

Pivot the table for Correlation Matrix

In [None]:
# Customer x product rating matrix: one row per CustomerID, one column per
# ProductID, each cell holding that customer's Rate for that product.
# Customer/product pairs without a rating show up as NaN.
matrix = ratings.pivot(index='CustomerID',columns='ProductID',values='Rate')

In [None]:
matrix

ProductID,1,2,3,4,5,6,7,8,9,10,...,682,683,684,685,686,687,688,689,690,691
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,1.0,,,,,,1.0,1.0,,...,,1.0,1.0,,,,,,,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,,,1.0,1.0,...,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0
13874,5.0,,5.0,,4.0,,,3.0,5.0,5.0,...,3.0,5.0,3.0,,5.0,,,3.0,3.0,5.0
13879,,2.0,2.0,,2.0,,,,,2.0,...,5.0,,2.0,,,,2.0,,2.0,
13885,,,,,,,,4.0,,4.0,...,4.0,,4.0,4.0,4.0,,,4.0,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103988,,,,5.0,,,,,,5.0,...,5.0,,5.0,5.0,,,,,,5.0
103991,,,,2.0,5.0,2.0,,2.0,,5.0,...,2.0,,,5.0,2.0,,,,,
103992,,,,4.0,,4.0,,,,,...,,,,4.0,4.0,,,,,4.0
103996,5.0,4.0,2.0,4.0,,4.0,4.0,4.0,5.0,4.0,...,5.0,5.0,4.0,4.0,2.0,5.0,4.0,5.0,,4.0


In [None]:
# Transpose so that customers become columns, then correlate every pair of
# customers, giving a customer x customer similarity matrix.
# NOTE(review): this relies on DataFrame.corr's defaults (presumably Pearson,
# computed only over products both customers rated) - confirm this is the
# intended similarity. Customers whose ratings never vary (customer 3 shows
# only 1.0 in the matrix preview) presumably get NaN similarities.
similarity_collab = matrix.T.corr()

Get the top 10 most similar customers based on the score

In [None]:
# For every customer (one row of the similarity matrix) keep the IDs of the
# ten customers with the highest similarity score.
# NOTE(review): a customer's own ID can appear in its list (self-similarity),
# and rows without usable scores appear to fall back to the first ten customer
# IDs (see the repeated [3, 6, 13874, ...] lists in the output). Consider
# filtering both cases before recommending from these lists.
neighbors = similarity_collab.apply(
    lambda scores: scores.nlargest(10).index.tolist(),
    axis=1,
)

In [None]:
neighbors

CustomerID
3         [3, 6, 13874, 13879, 13885, 13892, 14180, 1448...
6         [3, 6, 13874, 13879, 13885, 13892, 14180, 1448...
13874     [13874, 103653, 103404, 103490, 103344, 103991...
13879     [13879, 103768, 103362, 103378, 103366, 103558...
13885     [3, 6, 13874, 13879, 13885, 13892, 14180, 1448...
                                ...                        
103988    [3, 6, 13874, 13879, 13885, 13892, 14180, 1448...
103991    [103991, 103911, 103812, 103430, 103702, 10389...
103992    [3, 6, 13874, 13879, 13885, 13892, 14180, 1448...
103996    [103996, 103336, 103654, 103362, 103901, 10355...
103997    [3, 6, 13874, 13879, 13885, 13892, 14180, 1448...
Length: 344, dtype: object

In [None]:
# For each customer, take the rows of `matrix` that belong to that customer's
# neighbours and average every product column, giving one list of per-product
# mean ratings per customer. The result is only displayed, not stored.
neighbors.apply(lambda x: list(matrix.loc[x].mean()))

CustomerID
3         [2.3333333333333335, 1.8, 3.1666666666666665, ...
6         [2.3333333333333335, 1.8, 3.1666666666666665, ...
13874     [3.857142857142857, 2.5, 2.857142857142857, 2....
13879     [2.3333333333333335, 2.75, 2.8, 3.5, 3.25, 3.0...
13885     [2.3333333333333335, 1.8, 3.1666666666666665, ...
                                ...                        
103988    [2.3333333333333335, 1.8, 3.1666666666666665, ...
103991    [3.2, 2.8, 3.25, 2.142857142857143, 2.42857142...
103992    [2.3333333333333335, 1.8, 3.1666666666666665, ...
103996    [4.333333333333333, 2.75, 2.2857142857142856, ...
103997    [2.3333333333333335, 1.8, 3.1666666666666665, ...
Length: 344, dtype: object

Find products to recommend for Example User

In [None]:
# Neighbour list of the first customer in `neighbors` (position 0, which is
# CustomerID 3 in the output above). Despite the name, this holds a list of
# neighbouring customer IDs rather than a single user ID.
example_user =  neighbors.iloc[0]

In [None]:
# Count, per product, how many of the example customer's neighbours rated it,
# then show the catalogue rows of the five most-rated products.
# The rows come back in `products` order, not ranked by count, and products
# the customer has already rated are not filtered out.
rating_counts = matrix.loc[example_user].count()
most_rated_ids = rating_counts.nlargest(5).index
products[products.Id.isin(most_rated_ids)]

Unnamed: 0,Id,Name,UnitPrice
7,8,HP Envy 6-1180ca 15.6-Inch Sleekbook,1460.0
208,209,Diamond Promise Ring 1/5 ct tw Round-cut Sterl...,289.0
211,212,Diamond Promise Ring 1/6 ct tw Round-cut 10K W...,399.99
242,243,Diamond Promise Ring 1/8 ct tw Round-cut Sterl...,179.0
511,512,Diamond Solitaire Necklace 1/4 ct tw Round-cut...,629.0


In [None]:
# Average each product's rating over the example customer's neighbours, then
# show the catalogue rows of the five products with the highest mean.
# The rows come back in `products` order, not ranked by mean rating.
mean_ratings = matrix.loc[example_user].mean()
best_rated_ids = mean_ratings.nlargest(5).index
products[products.Id.isin(best_rated_ids)]

Unnamed: 0,Id,Name,UnitPrice
112,113,Jordan Retro,17.0
140,141,Legume Homestay Phòng Bungalow 2 người,17.17
492,493,Lab-Created Opal and White Topaz Necklace Ster...,249.99
573,574,Hoop Earrings 14K Yellow Gold 18mm,79.99
613,614,Le Vian Denim Ombre Hoop Earrings 14K Vanilla ...,1609.99


# Item Based Collaborative Filtering

In [None]:
# Customer x product rating matrix (rows: CustomerID, columns: ProductID,
# values: Rate). This is the same pivot call as in the user-based section,
# presumably repeated so that this section can be read on its own.
matrix = ratings.pivot(index='CustomerID',columns='ProductID',values='Rate')

Create Correlation matrix for products

In [None]:
# Correlate the product columns pairwise, giving a product x product
# similarity matrix (each product scores 1.0 against itself, as the
# diagonal of the output shows).
similarity_scores = matrix.corr()

In [None]:
similarity_scores

ProductID,1,2,3,4,5,6,7,8,9,10,...,682,683,684,685,686,687,688,689,690,691
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.564222,0.541719,0.425275,0.507756,0.424038,0.452329,0.565371,0.476506,0.521355,...,0.482222,0.415058,0.556791,0.575688,0.358510,0.600403,0.524587,0.586272,0.440411,0.688812
2,0.564222,1.000000,0.446452,0.556767,0.614689,0.539053,0.406610,0.607130,0.567907,0.507935,...,0.647128,0.450893,0.718338,0.451940,0.296580,0.656903,0.639118,0.677759,0.668485,0.515550
3,0.541719,0.446452,1.000000,0.557122,0.577189,0.452205,0.547777,0.257769,0.408545,0.597639,...,0.591847,0.608897,0.302938,0.378253,0.498385,0.658018,0.518136,0.503980,0.566435,0.629487
4,0.425275,0.556767,0.557122,1.000000,0.393600,0.693979,0.370552,0.326280,0.448144,0.498648,...,0.480510,0.428998,0.497710,0.543184,0.592570,0.525583,0.617831,0.454982,0.700646,0.493066
5,0.507756,0.614689,0.577189,0.393600,1.000000,0.250780,0.378820,0.537398,0.566638,0.576446,...,0.496680,0.519924,0.406342,0.567380,0.261756,0.518565,0.593316,0.684981,0.660821,0.497424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,0.600403,0.656903,0.658018,0.525583,0.518565,0.574758,0.466030,0.522388,0.389932,0.532442,...,0.574067,0.417281,0.477439,0.575778,0.462367,1.000000,0.493698,0.621127,0.621938,0.526677
688,0.524587,0.639118,0.518136,0.617831,0.593316,0.520015,0.572340,0.516143,0.597341,0.435307,...,0.524125,0.590032,0.449422,0.394752,0.665750,0.493698,1.000000,0.422980,0.680296,0.522528
689,0.586272,0.677759,0.503980,0.454982,0.684981,0.443973,0.511339,0.509492,0.551028,0.600976,...,0.499572,0.623064,0.474266,0.554246,0.435841,0.621127,0.422980,1.000000,0.653183,0.619523
690,0.440411,0.668485,0.566435,0.700646,0.660821,0.578840,0.530812,0.433657,0.605772,0.524963,...,0.657622,0.516239,0.564163,0.478544,0.535281,0.621938,0.680296,0.653183,1.000000,0.464177


Find similar products based on correlation score

In [None]:
# For every product (one row of the similarity matrix) keep the IDs of the ten
# products with the highest similarity score. From here on `neighbors` holds
# product neighbours, replacing the customer neighbours computed earlier.
# NOTE(review): the top entry is normally the product itself (similarity 1.0),
# so each list contains at most nine *other* products.
neighbors = similarity_scores.apply(
    lambda scores: scores.nlargest(10).index.tolist(),
    axis=1,
)

TOP 10 Products related

In [None]:
# Show, for every product, the ten highest similarity scores themselves
# (the values that go with the IDs stored in `neighbors`).
similarity_scores.apply(lambda scores: scores.nlargest(10).tolist(), axis=1)

ProductID
1      [1.0, 0.7567430354480567, 0.7480073475868732, ...
2      [1.0, 0.7458714921619293, 0.7299140180240821, ...
3      [1.0, 0.8041017746310865, 0.7815303968064634, ...
4      [1.0, 0.7167085197995718, 0.7074708038678241, ...
5      [1.0, 0.711471535154754, 0.7081368600086421, 0...
                             ...                        
687    [1.0, 0.772703546139574, 0.7314693812552681, 0...
688    [1.0, 0.7506054601918695, 0.7221427622801772, ...
689    [1.0, 0.8403234337208328, 0.8007137016630653, ...
690    [1.0, 0.8048998137119767, 0.7601531200743976, ...
691    [1.0, 0.7368435961192884, 0.7339677745522388, ...
Length: 691, dtype: object

In [None]:
# Neighbour list of the first product in `neighbors` (position 0, which is
# ProductID 1 in the output above). This is a list of product IDs and it
# includes product 1 itself.
example_product = neighbors.iloc[0]

Display the TOP 10 products related to Example Product

In [None]:
# Catalogue rows whose Id is in the example product's neighbour list.
related_mask = products['Id'].isin(example_product)
products.loc[related_mask]

Unnamed: 0,Id,Name,UnitPrice
0,1,Build your own computer,1200.0
10,11,Windows 8 Pro,65.0
58,59,Hành Trình Về Phương Đông,2.0
91,92,Combo Hoa ban,24.0
190,191,Vải Thiều Thanh Hà,10.99
221,222,Black/White Diamond Promise Ring 1/5 ct tw Ste...,239.0
271,272,Diamond Ring 1/4 ct tw Princess/Round Sterling...,289.0
367,368,Diamond Heart Necklace 1/10 ct tw Round-cut St...,80.0
449,450,Paw Print Necklace 1/10 ct tw Diamonds Sterlin...,209.0
557,558,Open Tube Twist Hoop Earrings 10K Yellow Gold,249.99


# Matrix Factorization Model

In [None]:
# Distinct rating values present in the data; they determine the rating
# scale passed to the Surprise Reader below.
ratings['Rate'].unique()

array([1, 3, 5, 2, 4])

In [None]:
# Hold out 20% of the rating rows for testing; the fixed random_state keeps
# the split the same on every run.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

Perform cross validation on SVD Model

In [None]:
# Ratings run from 1 to 5 (see the unique Rate values above).
reader = Reader(rating_scale=(1,5))

# Wrap the training rows in Surprise's dataset format: (user, item, rating).
train_data_surprise = Dataset.load_from_df(train_data[['CustomerID', 'ProductID', 'Rate']], reader)

# Build SVD Model. The seed makes the learned factors repeatable across runs
# (same value as the train/test split seed).
svd = SVD(random_state=42)

# 5-fold cross-validation on the training rows; verbose=True prints the table.
cv_results = cross_validate(svd, train_data_surprise, measures=['RMSE','MAE'], cv=5, verbose=True)

# cross_validate only ever fits on per-fold subsets of the training data, so
# fit explicitly on the complete training set before `svd` is used for the
# predictions in the following cells.
svd.fit(train_data_surprise.build_full_trainset())

# Keep the cross-validation scores as the cell's displayed result.
cv_results

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0267  1.0354  1.0282  1.0362  1.0267  1.0307  0.0042  
MAE (testset)     0.7083  0.7119  0.7104  0.7150  0.7061  0.7103  0.0030  
Fit time          2.13    1.09    1.22    1.64    1.13    1.44    0.40    
Test time         0.19    0.10    0.29    0.17    0.11    0.17    0.07    


{'test_rmse': array([1.02674718, 1.0353596 , 1.0282278 , 1.03624107, 1.0267475 ]),
 'test_mae': array([0.7082749 , 0.7118545 , 0.71035783, 0.71498509, 0.70614936]),
 'fit_time': (2.130587339401245,
  1.0874481201171875,
  1.2224793434143066,
  1.6359257698059082,
  1.1285810470581055),
 'test_time': (0.1943979263305664,
  0.09782242774963379,
  0.2895674705505371,
  0.17197012901306152,
  0.10670065879821777)}

In [None]:
# Score every held-out (customer, product) pair with the SVD model.
# The third tuple slot is the true-rating field of a Surprise test tuple;
# 0 is a placeholder because only the estimate (`.est`) is read afterwards.
test_pairs = zip(test_data['CustomerID'], test_data['ProductID'])
predicts_cf = svd.test(
    [(customer_id, product_id, 0) for customer_id, product_id in test_pairs]
)

In [None]:
# Attach the SVD estimate to each held-out row. `assign` builds a new frame
# instead of writing a column into the frame returned by train_test_split,
# which can trigger pandas' SettingWithCopyWarning.
test_data = test_data.assign(prediction_cf=[p.est for p in predicts_cf])

In [None]:
test_data

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate,prediction_cf
226,103629,419,5,2018/01/03 14:37:58,4.537304
91803,103972,672,5,2021/08/09 22:36:42,3.540445
60235,103412,377,3,2020/03/14 03:36:00,1.783306
83444,103321,596,4,2021/03/18 05:36:48,2.007712
102646,103879,121,5,2022/03/07 01:01:24,4.912100
...,...,...,...,...,...
87210,15018,445,4,2021/05/21 08:36:29,3.936789
32546,103355,572,4,2019/02/11 03:36:27,3.946070
27162,103524,119,5,2018/12/02 06:37:58,4.688416
88282,103424,294,4,2021/06/09 15:37:10,4.331129


In [None]:
# Score the training (customer, product) pairs as well. This overwrites
# `predicts_cf`, which until now held the test-set predictions.
# As before, 0 is only a placeholder for the true-rating slot.
train_pairs = zip(train_data['CustomerID'], train_data['ProductID'])
predicts_cf = svd.test(
    [(customer_id, product_id, 0) for customer_id, product_id in train_pairs]
)

In [None]:
train_data

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate
16354,103836,250,4,2018/07/15 23:36:37
96004,103614,92,5,2021/10/23 20:36:51
56137,103308,671,5,2020/01/14 02:37:14
63158,103498,521,2,2020/04/28 15:36:47
86239,103544,162,2,2021/05/05 03:37:45
...,...,...,...,...
6265,103416,87,1,2018/03/15 02:36:30
54886,103505,61,5,2019/12/25 23:36:53
76820,103761,405,1,2020/11/30 09:35:57
860,103806,420,4,2018/01/11 08:36:20


In [None]:
# Attach the SVD estimate to each training row. `assign` builds a new frame
# instead of writing a column into the frame returned by train_test_split,
# which can trigger pandas' SettingWithCopyWarning.
train_data = train_data.assign(prediction_cf=[p.est for p in predicts_cf])

Evaluate SVD Model

RMSE = 1.0359623215487868


In [None]:
# Square root of the mean squared error, i.e. the RMSE, of the SVD
# predictions against the actual ratings on the held-out test rows.
mean_squared_error(test_data['Rate'], test_data['prediction_cf'])**0.5

1.0359623215487868

MAE = 0.7176224448659594

In [None]:
# Mean absolute error of the SVD predictions on the held-out test rows.
mean_absolute_error(test_data['Rate'], test_data['prediction_cf'])

0.7176224448659594

In [None]:
# Stack the training and test rows (both now carry `prediction_cf`) into one
# frame, training rows first; the original row labels are kept.
# Note that the training rows' predictions are in-sample estimates.
final = pd.concat([train_data,test_data])

In [None]:
final

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate,prediction_cf
16354,103836,250,4,2018/07/15 23:36:37,3.938392
96004,103614,92,5,2021/10/23 20:36:51,4.473828
56137,103308,671,5,2020/01/14 02:37:14,4.917095
63158,103498,521,2,2020/04/28 15:36:47,2.088471
86239,103544,162,2,2021/05/05 03:37:45,3.239390
...,...,...,...,...,...
87210,15018,445,4,2021/05/21 08:36:29,3.936789
32546,103355,572,4,2019/02/11 03:36:27,3.946070
27162,103524,119,5,2018/12/02 06:37:58,4.688416
88282,103424,294,4,2021/06/09 15:37:10,4.331129


# Compare results from collaborative filtering and SVD

SVD Suggested top items for Customer Id : 3

In [None]:
# Take customer 3's rows from `final`, rank them by the SVD estimate and show
# the catalogue rows of the ten highest-scoring products.
# `final` only contains customer/product pairs that already have a rating, so
# these are products customer 3 has rated, shown in `products` order.
customer_rows = final[final['CustomerID'] == 3]
ranked_rows = customer_rows.sort_values('prediction_cf', ascending=False)
top_predicted_ids = ranked_rows.head(10)['ProductID']
products[products.Id.isin(top_predicted_ids)]

Unnamed: 0,Id,Name,UnitPrice
72,73,Ống Hút Kim Loại Bằng Thép Không Gỉ,2.0
119,120,Quai vải,2.0
175,176,Set inox straw,1.0
272,273,Diamond North South Ring 1/2 ct tw Round-cut 1...,1099.0
327,328,Natural Sapphire Ring 1/8 ct tw Diamonds 10K W...,899.99
447,448,Beaded Cable Chain Necklace 14K Yellow Gold 16...,399.99
487,488,Lab-Created Ruby & White Lab-Created Sapphire ...,179.99
511,512,Diamond Solitaire Necklace 1/4 ct tw Round-cut...,629.0
572,573,Hoop Earrings 14K Yellow Gold,149.99
682,683,Mission: Impossible - Fallout,2.0


Collaborative Filtering Suggested top items for Customer Id: 3

In [None]:
# `example_user` still holds the customer-neighbour list picked in the
# user-based section. Count how many of those neighbours rated each product
# and show the catalogue rows of the ten most-rated products
# (rows appear in `products` order, not ranked by count).
neighbour_counts = matrix.loc[example_user].count()
top10_by_count = neighbour_counts.nlargest(10).index
products[products.Id.isin(top10_by_count)]

Unnamed: 0,Id,Name,UnitPrice
7,8,HP Envy 6-1180ca 15.6-Inch Sleekbook,1460.0
208,209,Diamond Promise Ring 1/5 ct tw Round-cut Sterl...,289.0
211,212,Diamond Promise Ring 1/6 ct tw Round-cut 10K W...,399.99
242,243,Diamond Promise Ring 1/8 ct tw Round-cut Sterl...,179.0
270,271,Diamond Ring 1/15 ct tw Round-cut 10K Rose Gold,469.0
351,352,Black Silicone Women's Wedding Band,23.99
381,382,Lab-Created Emerald MOM Necklace Sterling Silv...,149.99
425,426,Forever Connected Diamond Necklace 1/3 ct tw P...,799.99
443,444,Deer Necklace 1/10 ct tw Diamonds Sterling Sil...,249.0
511,512,Diamond Solitaire Necklace 1/4 ct tw Round-cut...,629.0


In [None]:
# Same neighbour list as above, ranked this time by each product's mean
# rating among the neighbours; show the catalogue rows of the ten best
# (rows appear in `products` order, not ranked by mean rating).
neighbour_means = matrix.loc[example_user].mean()
top10_by_mean = neighbour_means.nlargest(10).index
products[products.Id.isin(top10_by_mean)]

Unnamed: 0,Id,Name,UnitPrice
106,107,Converse đen cổ thấp,22.0
112,113,Jordan Retro,17.0
140,141,Legume Homestay Phòng Bungalow 2 người,17.17
430,431,Disney Treasures Winnie the Pooh Diamond Neckl...,419.99
458,459,Lab-Created Emerald Necklace 1/6 ct tw Diamond...,599.99
474,475,Diamond Cross Necklace 1/4 ct tw Round-cut 10K...,619.0
492,493,Lab-Created Opal and White Topaz Necklace Ster...,249.99
573,574,Hoop Earrings 14K Yellow Gold 18mm,79.99
613,614,Le Vian Denim Ombre Hoop Earrings 14K Vanilla ...,1609.99
637,638,OPPO Reno6 Z 5G,254.0


Since there is no straightforward way to measure and evaluate collaborative filtering other than domain knowledge, we used SVD to generate recommendations and evaluated those. There are some similarities between the SVD and collaborative filtering outputs, but there isn't any strong correlation.