# Data

In [64]:
import os
import pandas as pd

In [110]:
df = pd.read_csv("transaction")
df.head()

Unnamed: 0.1,Unnamed: 0,id,user_id,restaurant_id,created_at
0,8246,1,3FC1CB1AEED7BE57,4F45F749D78D0EBE,2022-06-24 00:00:14.168
1,15705,2,EB3C5BB1F0E1D977,1A811BB0FAE6DA51,2022-06-24 00:00:14.668
2,22017,3,B174219506B1BF3A,49043A7EFB6BB580,2022-06-24 00:00:33.152
3,15615,4,FD33853938DD7ECE,2CBBCC8BCD69688B,2022-06-24 00:00:50.873
4,7070,5,D6AD85685EEED5AC,0445647F26E9DDFD,2022-06-24 00:00:53.170


In [71]:
print('shape:', df.shape)
print('unique restaurant_id:', len(pd.unique(df['restaurant_id'])))
print(df['restaurant_id'].value_counts())

shape: (30000, 5)
unique restaurant_id: 12598
A7FD8C847677D6B0    59
65A73F3D990FF244    54
B36C9625A8B576B0    47
53054BDA3701C783    43
0DD4153705CBC8DB    42
                    ..
D2F8D0767C4A84EE     1
B2F3FA9053256E6C     1
EC5588EB9FFA4C79     1
647D587400A9BF55     1
7F30B27851B47722     1
Name: restaurant_id, Length: 12598, dtype: int64


In [72]:
df = df[['user_id', 'restaurant_id']]
print(df.shape)
df.head()

(30000, 2)


Unnamed: 0,user_id,restaurant_id
0,3FC1CB1AEED7BE57,4F45F749D78D0EBE
1,EB3C5BB1F0E1D977,1A811BB0FAE6DA51
2,B174219506B1BF3A,49043A7EFB6BB580
3,FD33853938DD7ECE,2CBBCC8BCD69688B
4,D6AD85685EEED5AC,0445647F26E9DDFD


In [55]:
df_frequency = df.groupby(df.columns.tolist(), as_index=False).size()
print(df_frequency['size'].sum())
df_frequency.sort_values(by=['size'], ascending=False)

30000


Unnamed: 0,user_id,restaurant_id,size
25138,D77101ABBD55A46E,1231622E0D4745F7,4
20410,AF1859F2CB3D57D8,7D952972CD43E190,3
19804,AA288DC7E3C46779,A7FD8C847677D6B0,3
3163,1B26A8D907C9C331,D3E7D5ADD2CDD274,3
13668,75860D1893D29DFC,EFAB3E40F4EF201E,3
...,...,...,...
9974,554C3C0195523F9F,0E19B5CAA99B0A6A,1
9973,554C02BCCA408E30,9046073717134017,1
9972,554A852A0EFF95F2,D30CEC8BA2D99FC7,1
9971,5546D6851F843251,3F161CC8E74671A7,1


In [61]:

# copy the data
df_max_scaled = df_frequency.copy()
  
# apply normalization techniques by Column 1
column = 'size'
df_max_scaled[column] = df_max_scaled[column] /df_max_scaled[column].abs().max()
  
# view normalized data
display(df_max_scaled)

Unnamed: 0,user_id,restaurant_id,size
0,0000475F1EFEF93F,30FF5B178F133C2E,0.25
1,000155B04C8D8CC8,1684A4CC1DCAEBE6,0.25
2,0003E1FDB847FCE8,4F202F894DEF5D45,0.25
3,0003E1FDB847FCE8,E24561053B2E55EF,0.25
4,0005C2C51293545B,63FFF7BB8DDD856D,0.25
...,...,...,...
29884,FFF526DEA3D970F6,821C587D566DBD5F,0.25
29885,FFF90B1B03567F23,EAD3408E1A579D53,0.25
29886,FFFB8AD268144C72,569DB6764A5181A4,0.25
29887,FFFB8AD268144C72,A660BB1CC57C3D28,0.25


# Model

In [91]:
#Import packages
from surprise import NormalPredictor, Reader, Dataset, accuracy, SVD, SVDpp, KNNBasic, CoClustering, SlopeOne
from surprise.model_selection import cross_validate, KFold, GridSearchCV, train_test_split
import numpy as np


In [92]:
def create_user_ratings_df(data):
    df = data.groupby(data.columns.tolist(), as_index=False).size().reset_index()
    
    df = df.rename({'size':'freq'}, axis=1)
    return df

def surprise_df(data):
    
    scale = (data.freq.min(), data.freq.max())
    reader = Reader(rating_scale=scale)

    df = Dataset.load_from_df(data[['user_id',
                                    'restaurant_id',
                                    'freq']], reader)
    
    return df


In [93]:
user_ratings_df = create_user_ratings_df(df)
user_ratings_df.sort_values(by=['freq'], ascending=False)
# user_ratings_df.head()

Unnamed: 0,index,user_id,restaurant_id,freq
25138,25138,D77101ABBD55A46E,1231622E0D4745F7,4
20410,20410,AF1859F2CB3D57D8,7D952972CD43E190,3
19804,19804,AA288DC7E3C46779,A7FD8C847677D6B0,3
3163,3163,1B26A8D907C9C331,D3E7D5ADD2CDD274,3
13668,13668,75860D1893D29DFC,EFAB3E40F4EF201E,3
...,...,...,...,...
9974,9974,554C3C0195523F9F,0E19B5CAA99B0A6A,1
9973,9973,554C02BCCA408E30,9046073717134017,1
9972,9972,554A852A0EFF95F2,D30CEC8BA2D99FC7,1
9971,9971,5546D6851F843251,3F161CC8E74671A7,1


In [94]:
user_ratings_matrix = surprise_df(user_ratings_df)
train_set, test_set = train_test_split(user_ratings_matrix, test_size=0.2, random_state=19)

In [100]:
kf = KFold(n_splits=5, shuffle=True, random_state=19)
def model_framework(train_data):
    #store the rmse values for each fold in the k-fold loop 
    normp_rmse, svd_rmse, knn_rmse, co_rmse, slope_rmse = [],[],[],[],[]

    for trainset, testset in kf.split(train_data):
        
        #baseline
        normp = NormalPredictor()
        normp.fit(trainset)
        normp_pred = normp.test(testset)
        normp_rmse.append(accuracy.rmse(normp_pred,verbose=False))
        
        #svd
        svd = SVD(n_factors=30, n_epochs=50,biased=True, lr_all=0.005, reg_all=0.4, verbose=False)
        svd.fit(trainset)
        svd_pred = svd.test(testset)
        svd_rmse.append(accuracy.rmse(svd_pred,verbose=False))
        
        #knn
        knn = KNNBasic(k=40,sim_options={'name': 'cosine', 'user_based': False}, verbose=False) 
        knn.fit(trainset)
        knn_pred = knn.test(testset)
        knn_rmse.append(accuracy.rmse(knn_pred,verbose=False))
        
        #co_clustering
        co = CoClustering(n_cltr_u=3,n_cltr_i=3,n_epochs=20)         
        co.fit(trainset)
        co_pred = co.test(testset)
        co_rmse.append(accuracy.rmse(co_pred,verbose=False))

    
    mean_rmses = [np.mean(normp_rmse),
                  np.mean(svd_rmse),
                  np.mean(knn_rmse),
                  np.mean(co_rmse)]
    
    model_names = ['baseline','svd','knn','coclustering']
    compare_df = pd.DataFrame(mean_rmses, columns=['RMSE'], index=model_names)
    
    return compare_df


In [101]:
comparison_df = model_framework(user_ratings_matrix)
comparison_df.head()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  co.fit(trainset)


Unnamed: 0,RMSE
baseline,0.078618
svd,0.064345
knn,0.065445
coclustering,0.069087


In [104]:
def gridsearch(data, model, param_grid):
    gs = GridSearchCV(model, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)

    new_params = gs.best_params['rmse']
    best_score = gs.best_score['rmse']

    print("Best score:", best_score)
    print("Best params:", new_params)
    
    return new_params, best_score

In [108]:
svd_param_grid = {'n_factors': [5, 10, 15, 20, 25, 30, 40, 50, 100],
                  'n_epochs': [5, 10, 15, 20, 25, 30, 40, 50],       
                  'lr_all': [0.002,0.005,0.01],
                  'reg_all':[0.02,0.1, 0.4]}

svd_params, svd_score = gridsearch(user_ratings_matrix, SVD, svd_param_grid)

Best score: 0.06328566100154029
Best params: {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}


In [111]:
svd_param_grid = {'n_factors': [1, 2, 3, 4, 5, 10, 15],
                  'n_epochs': [10, 15, 20, 25, 30],       
                  'lr_all': [0.001, 0.002,0.005, 0.01],
                  'reg_all':[0.02, 0.1, 0.4]}

svd_params, svd_score = gridsearch(user_ratings_matrix, SVD, svd_param_grid)

Best score: 0.06414152965422937
Best params: {'n_factors': 2, 'n_epochs': 20, 'lr_all': 0.001, 'reg_all': 0.4}


In [112]:
from surprise import KNNWithMeans

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs_KNW = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3, joblib_verbose=0)
gs_KNW.fit(user_ratings_matrix)

print("Best score:", gs_KNW.best_params['rmse'])
print("Best params:", gs_KNW.best_score['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi