Steps -

Read and explore the given dataset.  ( Rename column/add headers, plot histograms, find data characteristics)
Take a subset of the dataset to make it less sparse/denser. (For example, keep only the users who have given 50 or more ratings.)
Split the data randomly into train and test dataset. ( For example, split it in 70/30 ratio)
Build Popularity Recommender model.
Build Collaborative Filtering model.
Evaluate both the models. ( Once the model is trained on the training data, it can be used to compute the error (RMSE) on predictions made on the test data.)
Get top - K ( K = 5) recommendations. Since our goal is to recommend new products to each user based on his/her habits, we will recommend 5 new products.
Summarise your insights.


In [129]:
%matplotlib inline
import sklearn as sns
import pandas as pd

from sklearn.model_selection import train_test_split
import numpy as np
import time
from sklearn.externals import joblib
import Recommenders as Recommenders
import Evaluation as Evaluation

In [87]:
# Load the raw ratings file; `names` supplies the column headers,
# so the first line of the file is read as data.
df_ratings = pd.read_csv(
    "ratings_Electronics.csv",
    names=['user_id', 'productId', 'ratings', 'timestamp'],
)

In [88]:
# Preview the first rows to check that the column names line up with the data
df_ratings.head()

Unnamed: 0,user_id,productId,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [89]:
# (rows, columns) of the raw ratings frame
df_ratings.shape

(7824482, 4)

In [90]:
# The timestamp column is not used anywhere in the visible analysis, so drop it.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df_ratings = df_ratings.drop(columns='timestamp', errors='ignore')

In [91]:
# Confirm that only user_id, productId and ratings remain
df_ratings.head()

Unnamed: 0,user_id,productId,ratings
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


In [92]:
# Count missing values per column
df_ratings.isna().sum()

user_id      0
productId    0
ratings      0
dtype: int64

In [93]:
# isnull() is an alias of isna(), so this repeats the missing-value count of the previous cell
df_ratings.isnull().sum()

user_id      0
productId    0
ratings      0
dtype: int64

In [94]:
# Number of unique products in the dataset
df_ratings.productId.nunique()

476002

In [95]:
# Number of distinct rating values that occur in the data
df_ratings.ratings.nunique()

5

In [96]:
# Column dtypes, row count and memory footprint of the frame
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 3 columns):
user_id      object
productId    object
ratings      float64
dtypes: float64(1), object(2)
memory usage: 179.1+ MB


In [97]:
# Number of *distinct* rating values each user has given
# (nunique counts different values, not how many ratings the user made)
df_ratings.groupby('user_id')['ratings'].nunique()

user_id
A00000262KYZUE4J55XGL    1
A000063614T1OE0BUSKUT    1
A00009182QVLSWIGHLS1B    1
A00009661LC9LQPGKJ24G    1
A00010809P09NUU6ZP6H     1
A00014061C2IZNE0YEILY    1
A000145014WOTZJ5NSKOR    1
A00015222LZ55IJSVL5IX    1
A00015228CUPGPF957DS     1
A0001528BGUBOEVR6T5U     2
A00018041RRVMCICCAP79    1
A000186437REL8X2RW8UW    1
A000187635I595IAVSQLH    1
A00019466UY2KR1IPXQN     1
A0002012T7HVDB2EF4RH     1
A0002032ZFQKDVHYKGWR     1
A0002550196XWX0PEOZND    1
A00027081JC8NE8X6TD5     1
A00027561NC7JTXEP3EOD    1
A00028781NF0U7YEN9U19    1
A00029263J863WSR0TDRS    1
A000294826HEMAY5L3K1H    1
A00029825UMZ6N0ETLHY     1
A00033481VZEEGYXEN32T    2
A00037441I8XOQJSUWCAG    1
A00038802J7X43YTW44TD    2
A000428226SAAAIBK8I36    1
A0004478EF5NFPHLGCWG     1
A00045341JXVKNK93M6JE    1
A00059641RDIAMDC7IJRZ    1
                        ..
AZZYJH0XNZ896            1
AZZYK2BT6EU8V            2
AZZYKX2KZ0Q82            1
AZZYMQZHES0KT            1
AZZYO4XQYE89O            1
AZZYQ6753ZHTK       

In [98]:
# Attach to every row the number of products rated by that row's user,
# so that rows can be filtered by user activity.
df_ratings['tot_rating'] = (
    df_ratings
    .groupby('user_id')['productId']
    .transform('count')
)

In [99]:
# Check the new per-user rating count column
df_ratings.head()

Unnamed: 0,user_id,productId,ratings,tot_rating
0,AKM1MP6P0OYPR,132793040,5.0,2
1,A2CX7LUOHB2NDG,321732944,5.0,4
2,A2NWSAGRHCP8N5,439886341,1.0,1
3,A2WNBOD3WNDNKT,439886341,3.0,1
4,A1GI0U4ZRJA8WN,439886341,1.0,1


In [100]:
# Keep only ratings made by users with at least MIN_RATINGS_PER_USER ratings,
# which makes the user-item data denser.
# .copy() makes the subset an independent frame, so later in-place edits to it
# do not operate on a slice of df_ratings (avoids SettingWithCopyWarning).
MIN_RATINGS_PER_USER = 50
df_ratings2 = df_ratings[df_ratings["tot_rating"] >= MIN_RATINGS_PER_USER].copy()

In [101]:
# Size of the full frame, for comparison with the filtered subset
df_ratings.shape

(7824482, 4)

In [102]:
# Size of the subset restricted to users with 50 or more ratings
df_ratings2.shape

(125871, 4)

In [103]:
# Preview the filtered subset (every row should show tot_rating >= 50)
df_ratings2.head()

Unnamed: 0,user_id,productId,ratings,tot_rating
94,A3BY5KCNQZXV5U,594451647,5.0,50
118,AT09WGFUM934H,594481813,3.0,110
177,A32HSNCNPRUMTR,970407998,1.0,72
178,A17HMM1M7T9PJ1,970407998,4.0,151
492,A3CLWR1UUZT6TG,972683275,5.0,58


In [104]:
# tot_rating was only needed to filter users; remove it before modelling.
# Reassigning avoids mutating a filtered slice in place (SettingWithCopyWarning),
# errors='ignore' makes the cell safe to re-run, and the redundant axis=1 is
# dropped because columns= already selects the axis.
df_ratings2 = df_ratings2.drop(columns='tot_rating', errors='ignore')

In [105]:
# Confirm that tot_rating was removed from the subset
df_ratings2.head()

Unnamed: 0,user_id,productId,ratings
94,A3BY5KCNQZXV5U,594451647,5.0
118,AT09WGFUM934H,594481813,3.0
177,A32HSNCNPRUMTR,970407998,1.0
178,A17HMM1M7T9PJ1,970407998,4.0
492,A3CLWR1UUZT6TG,972683275,5.0


In [106]:
# Array of the distinct user ids left after filtering; indexed below to pick sample users
users = df_ratings2.user_id.unique()

In [107]:
# Number of distinct users in the filtered subset
len(users)

1540

In [108]:
# Random 70/30 split of the filtered ratings; the fixed random_state makes it reproducible
train_ratings, test_ratings = train_test_split(
    df_ratings2,
    test_size=0.30,
    random_state=0,
)

### Create an instance of popularity based recommender class

In [130]:
# Instantiate the popularity recommender from the project-local Recommenders module
rating_pop_recom = Recommenders.popularity_recommender_py()

In [131]:
# (leftover, truncated draft of the create() call made in the next cell; this cell can be deleted)

In [132]:
# Fit the popularity model on the training split, naming the user and item columns.
# NOTE(review): presumably scores each product by how many users rated it; confirm in Recommenders.
rating_pop_recom.create(train_ratings, 'user_id', 'productId')

In [133]:
# Popularity recommendations for one sample user (the 21st distinct user of the filtered data).
# NOTE(review): the displayed result lists 10 rows, while the task statement asks
# for the top 5. Confirm whether the result should be truncated.
user_id = users[20]
rating_pop_recom.recommend(user_id)

Unnamed: 0,user_id,productId,score,Rank
30847,A341HCMGNZCBIT,B0088CJT4U,133,1.0
30287,A341HCMGNZCBIT,B007WTAJTO,124,2.0
19647,A341HCMGNZCBIT,B003ES5ZUU,122,3.0
8752,A341HCMGNZCBIT,B000N99BBC,114,4.0
30555,A341HCMGNZCBIT,B00829THK0,97,5.0
30559,A341HCMGNZCBIT,B00829TIEK,97,6.0
17384,A341HCMGNZCBIT,B002R5AM7C,94,7.0
31107,A341HCMGNZCBIT,B008DWCRQW,91,8.0
17573,A341HCMGNZCBIT,B002SZEOLG,84,9.0
22744,A341HCMGNZCBIT,B004CLYEDC,82,10.0


Collaborative - Item similarity

In [134]:
# Instantiate the item-similarity (collaborative) recommender from the project-local Recommenders module
rating_collab_model = Recommenders.item_similarity_recommender_py()

In [135]:
# Fit the item-similarity model on the training split, naming the user and item columns
rating_collab_model.create(train_ratings, 'user_id', 'productId')

In [136]:
# Print the products the chosen user rated in the training data,
# then show that user's item-similarity recommendations.
user_id = users[5]
user_items = rating_collab_model.get_user_items(user_id)
# Banner announcing the user's training-set products
print("------------------------------------------------------------------------------------")
print("Training data product for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

# Recommend products for the user using the personalized (item-similarity) model.
# NOTE(review): the result column is labelled "song" although the values are
# product IDs; presumably the label comes from the Recommenders module.
rating_collab_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data product for the user userid: A3TAS1AG6FMBQW:
------------------------------------------------------------------------------------
B000S5Q9CA
B009D79VH4
B00D4MFPLA
B004T9RR0E
B000HPV3RW
B00APUQPHW
B001F7HLRC
B00EVNVRM2
B0067WUVF4
B003GTSHY8
B00DQZQCUI
B00BAEVR4I
B00JXECZIY
B002WE6D44
B001VKQR5U
B003YKG2UK
B008S4TSAI
B007IO38MI
B00G6CLN3Y
B00HWMPSF6
B00JZAB8OI
B0009SHDGC
B0063705PE
B0001OHH0Q
B007HOHV9U
B00002EQC2
B00A1DJO12
B00ATZ9I9U
B007ZTKWFM
B001KN6WL2
B001963NZI
B00DT04I9W
B009SG71MC
B003X26PMO
B00CBCUS1G
0972683275
B00006I5NE
B001M4XCHQ
B002MAPRYU
B0073FE1F0
B00IWQ3Y20
B00HRQB28Y
B008LHUTKA
B0055TEQH4
B004LEAYXY
B00603RTC8
B00IVPU6AA
B00894YWD0
B001U5R9RG
B0090CVJZ4
B008X9Z528
B004NBL9WK
B004HW67MW
B00A6YOUWE
B0019HGU0M
B00HF3X5RU
B003ES5ZUU
B004LSNF04
B00C97AF16
B005CG2AX2
B009NHWVIA
B003CIBCX6
B00934CQ56
B00D6HBVIO
B0061S1INS
B000X23I22
B0011NVMO8
B00A750SCI
B000VZS2EU
B001PI09SE
B

Unnamed: 0,user_id,song,score,rank
0,A3TAS1AG6FMBQW,B008ULPE70,0.014947,1
1,A3TAS1AG6FMBQW,B008X9Z3UC,0.014423,2
2,A3TAS1AG6FMBQW,B008ULPCB8,0.014296,3
3,A3TAS1AG6FMBQW,B008X9ZBVI,0.013497,4
4,A3TAS1AG6FMBQW,B008ULPAT2,0.013336,5
5,A3TAS1AG6FMBQW,B009WZRCO6,0.013163,6
6,A3TAS1AG6FMBQW,B009YQ8BTI,0.01286,7
7,A3TAS1AG6FMBQW,B005QCDY50,0.012751,8
8,A3TAS1AG6FMBQW,B0057UUB1G,0.012215,9
9,A3TAS1AG6FMBQW,B008X9Z8NE,0.011844,10


In [137]:
# Same walkthrough as the previous cell, for a different sample user
user_id = users[7]

user_items = rating_collab_model.get_user_items(user_id)
# Banner announcing the user's training-set products
print("------------------------------------------------------------------------------------")
print("Training data products for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

# Recommend products for the user using the personalized (item-similarity) model.
# NOTE(review): the result column is labelled "song" although the values are
# product IDs; presumably the label comes from the Recommenders module.
rating_collab_model.recommend(user_id)



------------------------------------------------------------------------------------
Training data products for the user userid: A25RTRAPQAJBDJ:
------------------------------------------------------------------------------------
B003YNO0LA
B000JLU2A8
B007W66RCU
B0009IG3U4
B00902SFC4
B00JC5Y6YA
B0006FK400
B00FXPL1XM
B000PAS9IU
B00JY4QCJQ
B003PL0AME
B000BMHERE
B008R6WUZC
B0002DFIOS
B0001ZYAN2
B000PH7X30
B0006DPPW4
B00017O6Q6
B00A2T6X0K
B000TJFEYO
B00BCA41PW
B004CZ9U96
B00009ZOIZ
B00BF9I0JI
0972683275
B005TDWVQY
B000EHHOOE
B0034CL2ZI
B00026BQJ6
B0002KRCHW
B00009UH9J
B005NCNDOU
B0002ZPIXM
B0002CNTU4
B000782SLO
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 35
no. of unique songs in the training set: 38226
Non zero values in cooccurence_matrix :13679


Unnamed: 0,user_id,song,score,rank
0,A25RTRAPQAJBDJ,B0000E2XEB,0.014286,1
1,A25RTRAPQAJBDJ,B00KSBB84S,0.014286,2
2,A25RTRAPQAJBDJ,B004XJ64RM,0.014286,3
3,A25RTRAPQAJBDJ,B00B1V2FA0,0.014286,4
4,A25RTRAPQAJBDJ,B00029X21S,0.014286,5
5,A25RTRAPQAJBDJ,B001RB24LY,0.014286,6
6,A25RTRAPQAJBDJ,B00B11C6HW,0.014286,7
7,A25RTRAPQAJBDJ,B004556RMQ,0.014286,8
8,A25RTRAPQAJBDJ,B005ZCU4YK,0.014286,9
9,A25RTRAPQAJBDJ,B001VIPUFU,0.014286,10


## Quantitative comparison between the models

In [None]:
# Time the whole precision/recall evaluation
start = time.time()

# Share of users to use for the precision/recall calculation
# NOTE(review): 0.05 is presumably a fraction (5% of users); confirm in Evaluation.
user_sample = 0.05

# Instantiate the precision_recall_calculator with the test split, the training
# split and both fitted models.
# Fixed: the first argument was the undefined name `test_ratingsatings` (NameError).
pr = Evaluation.precision_recall_calculator(test_ratings, train_ratings, rating_pop_recom, rating_collab_model)

# Compute precision and recall lists for the popularity and collaborative models
(pop_avg_precision_list, pop_avg_recall_list, collab_avg_precision_list, collab_avg_recall_list) = pr.calculate_measures(user_sample)

# Elapsed wall-clock time in seconds
end = time.time()
print(end - start)