In [2]:
! pip install surprise



In [3]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

%cd '/content/gdrive/My Drive/LDS0_K273_ONLINE_DoThiPhuong/Topic_2/'

Mounted at /content/gdrive
/content/gdrive/My Drive/LDS0_K273_ONLINE_DoThiPhuong/Topic_2


In [4]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly
from surprise.model_selection.validation import cross_validate

In [5]:
df = pd.read_csv('Reviews.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361090 entries, 0 to 361089
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   Unnamed: 0   361090 non-null  int64
 1   id           361090 non-null  int64
 2   customer_id  361090 non-null  int64
 3   product_id   361090 non-null  int64
 4   rating       361090 non-null  int64
dtypes: int64(5)
memory usage: 13.8 MB


In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,customer_id,product_id,rating
0,0,0,709310,10001012,3
1,1,1,10701688,10001012,5
2,2,2,11763074,10001012,5
3,3,3,9909549,10001012,5
4,4,4,1827148,10001012,5


In [8]:
n_ratings = len(df)
n_products = len(df['product_id'].unique())
n_customers = len(df['customer_id'].unique())

In [9]:
display(n_ratings, n_products, n_customers)

361090

4214

251149

In [10]:
df['product_id'].value_counts()

299461      4715
1600005     2629
47321729    2419
405243      2316
8141868     2202
            ... 
73685012       1
1508575        1
1513667        1
54399906       1
56519259       1
Name: product_id, Length: 4214, dtype: int64

In [11]:
reader = Reader()
data = Dataset.load_from_df(df[['customer_id', 'product_id', 'rating']], reader)

In [12]:
# algorithm = SVD()
# results  = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv = 5, verbose = True)

In [13]:
algorithm = [SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering(), BaselineOnly()]
#  KNNBasic(), KNNBaseline(), KNNWithMeans(), KNNWithZScore(),

In [16]:
benchmark = []
for model in algorithm:
    # Perform cross validation
    results = cross_validate(model, data, measures=['RMSE'], cv=5, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(model).split(' ')[0].split('.')[-1]], index=['Model']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Model').sort_values('test_rmse')  

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.96987,2.840784,0.645969
SVD,0.973352,19.935855,0.850277
SVDpp,0.984872,36.628077,1.524906
CoClustering,1.050876,28.42815,0.638937
SlopeOne,1.077203,3.654448,0.804788
NMF,1.122492,42.227506,0.584286


- Trong 6 model trên thì BaselineOnly() cho kết quả tốt nhất (RMSE xấp xỉ SVD va SVDpp nhưng thời gian thực hiện ngắn hơn rất nhiều)
- Em sẽ sử dụng BaselineOnly() kết hợp Tune algorithm parameters with GridSearchCV để tối ưu model.

In [15]:
from surprise.model_selection import GridSearchCV

In [17]:
param_grid = {'bsl_options':{'method': ['als','sgd'],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}}

In [18]:
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimati

In [19]:
bsl_options = {'method': 'als', 'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
algorithm = BaselineOnly(bsl_options=bsl_options)

In [20]:
trainset = data.build_full_trainset()
algorithm.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f8045fe2ed0>

# Recommender for a specific user

In [60]:
# customer_id = 14188390 has some products:
customer_id = 14188390
df_select = df[(df['customer_id'] == customer_id) & (df['rating'] >= 3)]
df_select = df_select.set_index('product_id')
df_select.head(df_select.shape[0])

Unnamed: 0_level_0,Unnamed: 0,id,customer_id,rating
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19419738,60457,60457,14188390,5
208248,68935,68935,14188390,4
24787086,85707,85707,14188390,5
25965194,94456,94456,14188390,3
299461,110293,110293,14188390,5
45163095,180079,180079,14188390,5
595607,284202,284202,14188390,3
998687,361015,361015,14188390,3


In [61]:
df_select.shape

(8, 4)

In [62]:
df_score = df[['product_id']]

In [63]:
df_score

Unnamed: 0,product_id
0,10001012
1,10001012
2,10001012
3,10001012
4,10001012
...,...
361085,9996258
361086,9996258
361087,9996258
361088,9996258


In [64]:
df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(customer_id, x).est) # est: get EstimateScore
df_score = df_score.sort_values(by=['EstimateScore'], ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [65]:
df_score = df_score.drop_duplicates()

In [66]:
df_score.head()

Unnamed: 0,product_id,EstimateScore
237331,53080935,4.641777
305513,68025746,4.641678
324562,73179180,4.633438
319924,71896003,4.630561
313577,70771651,4.627334


In [67]:
def recommender(customer_id):
  df_score = df[['product_id']]
  df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(customer_id, x).est) # est: get EstimateScore
  df_score = df_score.sort_values(by=['EstimateScore'], ascending=False)
  df_score = df_score.drop_duplicates()
  df_score = df_score[df_score.EstimateScore >= 3.0]
  results = df_score.head()
  return results

In [68]:
recommender_14188390 = recommender(14188390)
recommender_14188390

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,product_id,EstimateScore
237331,53080935,4.641777
305513,68025746,4.641678
324562,73179180,4.633438
319924,71896003,4.630561
313577,70771651,4.627334


Sau khi xây dựng model theo 2 cách: ALS (BigData) và BaseLineOnly(Machine Learning) thì em sử dụng ALS để đề xuất cho người dùng vì có RMSE thấp hơn. (ALS là 0.295 và BaseLineOnly là 0.97)