In [2]:
import pandas as pd
import numpy as np
# from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly
from surprise import *
from surprise.model_selection.validation import cross_validate

In [3]:
df = pd.read_csv("Products_ThoiTrangNam_rating_raw.csv", sep='\t')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024482 entries, 0 to 1024481
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   product_id  1024482 non-null  int64 
 1   user_id     1024482 non-null  int64 
 2   user        1024482 non-null  object
 3   rating      1024482 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 31.3+ MB


In [5]:
df.head()

Unnamed: 0,product_id,user_id,user,rating
0,190,1,karmakyun2nd,5
1,190,2,tranquangvinh_vv,5
2,190,3,nguyenquoctoan2005,5
3,190,4,nguyenthuyhavi,5
4,190,5,luonganh5595,5


In [9]:
n_ratings = len(df)
n_product = len(df['n_product'].unique())
n_user = len(df['user_id'].unique())

In [12]:
display(n_ratings, n_product, n_user)

1024482

31267

650636

In [13]:
df['product_id'].value_counts()

1731      412
177       395
231       391
17194     389
2359      387
         ... 
121331      1
121330      1
121328      1
121326      1
26899       1
Name: product_id, Length: 31267, dtype: int64

In [14]:
# https://surprise.readthedocs.io/en/stable/reader.html
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'product_id', 'rating']], reader)

In [16]:
# https://www.youtube.com/watch?v=8wLKuscyO9I
# Singular value decomposition
algorithm = SVD()
# algorithm = KNNBasic()
# Run 5-fold cross-validation and print results
results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8804  0.8810  0.8804  0.8752  0.8822  0.8798  0.0024  
MAE (testset)     0.5598  0.5602  0.5587  0.5572  0.5593  0.5590  0.0011  
Fit time          35.21   38.91   34.58   38.11   36.48   36.66   1.65    
Test time         3.82    3.98    2.65    4.10    3.69    3.65    0.52    


In [17]:
results

{'test_rmse': array([0.88035568, 0.8809827 , 0.88041841, 0.8751605 , 0.88222695]),
 'test_mae': array([0.55982879, 0.56020666, 0.55873747, 0.55715027, 0.5593167 ]),
 'fit_time': (35.206053733825684,
  38.909791231155396,
  34.57768154144287,
  38.11235499382019,
  36.48407793045044),
 'test_time': (3.8198318481445312,
  3.9762022495269775,
  2.654552936553955,
  4.100021600723267,
  3.6907284259796143)}

In [18]:
# If the results are OK => getting full dataset => fit model
trainset = data.build_full_trainset()
algorithm.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x160d63d7a30>

### Recommender for a specific user

In [20]:
# userId = 190 has seen some movies:
userId = 190
df_select = df[(df['user_id'] == userId) & (df['rating'] >=3)]
df_select = df_select.set_index('product_id')
#df_select = df_select.join(df_title)['Name']
df_select.head(df_select.shape[0])

Unnamed: 0_level_0,user_id,user,rating
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
191,190,522rlt2il_,5


In [21]:
df_select.shape

(1, 3)

In [22]:
df_score = df[["product_id"]]

In [23]:
df_score

Unnamed: 0,product_id
0,190
1,190
2,190
3,190
4,190
...,...
1024477,171107
1024478,171107
1024479,171107
1024480,171107


- Once the model has been evaluated to our satisfaction, then we can re-train the model using the entire training dataset

In [24]:
# The following are the top 5 movies to be recommended to the user with userId
# To recommend products (i.e., movies) to the given user,
# we can sort the list of movies in decreasing order of predicted ratings
# and take the top N movies as recommendations:
df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(userId, x).est) # est: get EstimateScore
df_score = df_score.sort_values(by=['EstimateScore'], ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(userId, x).est) # est: get EstimateScore


In [25]:
df_score = df_score.drop_duplicates()

In [26]:
df_score.head()

Unnamed: 0,product_id,EstimateScore
512241,25183,5.0
530719,25246,5.0
127610,21830,5.0
876059,17123,5.0
127649,21831,5.0


In [27]:
df_score[df_score.EstimateScore>=3]

Unnamed: 0,product_id,EstimateScore
512241,25183,5.000000
530719,25246,5.000000
127610,21830,5.000000
876059,17123,5.000000
127649,21831,5.000000
...,...,...
294684,2352,3.018561
872941,1788,3.013155
134976,211011,3.008413
651581,1311,3.007838
