In [1]:
!pip install surprise



In [2]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [13]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly
from surprise.model_selection.validation import cross_validate
from surprise.model_selection import GridSearchCV

In [4]:
#Load data
df = pd.read_csv("/content/gdrive/MyDrive/LDS02_k271_NguyenMinhQuan/Problem_3_RecomendationSystem_TikiOnlineShopping/review_clean.csv", index_col=0)
df

Unnamed: 0,customer_id,product_id,rating
0,100,419479,5.0
1,10010852,23362701,5.0
2,1001775,47868431,5.0
3,10023820,14033974,5.0
4,10026181,5899427,5.0
...,...,...,...
359446,9977723,53716888,5.0
359447,9979031,41446843,4.0
359448,9981162,491328,5.0
359449,9982441,20907214,5.0


In [5]:
df.describe()

Unnamed: 0,customer_id,product_id,rating
count,359451.0,359451.0,359451.0
mean,9170476.0,24342560.0,4.473725
std,6307652.0,23756600.0,1.018429
min,10.0,54665.0,1.0
25%,2116803.0,1600005.0,4.0
50%,8571090.0,14990450.0,5.0
75%,14480630.0,47321730.0,5.0
max,21013440.0,81964000.0,5.0


We can see that the rating overal is very high. More than 25% data is above 4.0.


In [6]:
product_count = len(df['product_id'].unique())
user_count = len(df['customer_id'].unique())

print(f"Sparsity: {1-(df.shape[0]/user_count/product_count):.2f}")
print(f'Number of users: {user_count}')
print(f'Number of items: {product_count}')

Sparsity: 1.00
Number of users: 251467
Number of items: 4218


# Model selection

In [7]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['customer_id', 'product_id', 'rating']], reader)

In [8]:
benchmark = []
# Iterate over all algorithms
#Can't use algothirm not using matrix factorization, because the ram will break.
algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f5f2e995e10>, <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f5f2e995e90>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7f5f2e995f10>, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7f5f2e995f50>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7f5f2e995f90>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f5f2e995e10>
Done:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f5f2e995e10> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f5f2e995e90>
Done:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f5f2e995e90> 


Starting:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7f5f2e995f10>
Done:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7f5f2e995f10> 


Starting:  

In [9]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVD,0.976265,0.687525,17.130901,1.636527
SVDpp,0.987413,0.668141,33.949724,2.935912
CoClustering,1.051899,0.68609,30.6297,1.440106
SlopeOne,1.075939,0.69453,3.161861,1.467828
NMF,1.121707,0.833697,34.894507,1.42724


We can see the SDD algothirm git the best score.

# Hyper tunning paramater

In [15]:
# define grid for testing
param_grid = {
    "n_epochs": [10, 30, 100],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.02, 0.05, 0.1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], refit=True, cv=3,n_jobs=-1)

gs.fit(data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

BEST RMSE: 	 0.9705212454096683
BEST MAE: 	 0.6718153015474219
BEST params: 	 {'n_epochs': 100, 'lr_all': 0.002, 'reg_all': 0.1}


In [17]:
from datetime import datetime
print(training_parameters)
reader = Reader(rating_scale=(1, 5))

print("\n\n\t\t STARTING\n\n")
start = datetime.now()

print("> Loading data...")
data = Dataset.load_from_df(df[['customer_id', 'product_id', 'rating']], reader)
print("> OK")

print("> Creating trainset...")
trainset = data.build_full_trainset()
print("> OK")


startTraining = datetime.now()
print("> Training...")

algo = SVD(n_epochs = training_parameters['n_epochs'], lr_all = training_parameters['lr_all'], reg_all = training_parameters['reg_all'])

algo.fit(trainset)

endTraining = datetime.now()
print("> OK \t\t It Took: ", (endTraining-startTraining).seconds, "seconds")

end = datetime.now()
print (">> DONE \t\t It Took", (end-start).seconds, "seconds" )

{'n_epochs': 100, 'lr_all': 0.002, 'reg_all': 0.1}


		 STARTING


> Loading data...
> OK
> Creating trainset...
> OK
> Training...
> OK 		 It Took:  120 seconds
>> DONE 		 It Took 122 seconds


In [18]:
## SAVING TRAINED MODEL
from surprise import dump
filePath = '/content/gdrive/MyDrive/LDS02_k271_NguyenMinhQuan/Problem_3_RecomendationSystem_TikiOnlineShopping/model/suprise_model.pkl'
dump.dump(filePath, algo=algo)

In [22]:
df.head()

Unnamed: 0,customer_id,product_id,rating
0,100,419479,5.0
1,10010852,23362701,5.0
2,1001775,47868431,5.0
3,10023820,14033974,5.0
4,10026181,5899427,5.0


In [26]:
def recommend(userId, top_n):
  #Select id of product was not bought by user
  product_bought = df[df['customer_id'] == userId]
  list_product = df['product_id'].unique()
  cond = ~np.isin(list_product, product_bought['product_id'])
  product_not_bought = np.where(cond, list_product, list_product)

  recommend_product = pd.DataFrame(product_not_bought, columns=['product_id'])
  recommend_product['estimation_score'] = recommend_product['product_id'].apply(lambda x: algo.predict(userId, x).est)
  return recommend_product.nlargest(top_n, 'estimation_score')

In [20]:
type(df['product_id'].unique())

numpy.ndarray

In [28]:
recommend(10023820, 100)

Unnamed: 0,product_id,estimation_score
396,52769271,5.000000
726,71896003,5.000000
774,1464043,5.000000
951,44012580,5.000000
1311,50421562,5.000000
...,...,...
2216,62737062,4.925184
3100,15350656,4.923183
773,70765393,4.922720
2713,13764494,4.921804
