In [1]:
import os
import pandas as pd
from surprise import NormalPredictor, Reader, Dataset, accuracy, SVD, KNNBasic, KNNWithMeans, CoClustering, dump
from surprise.model_selection import cross_validate, KFold, GridSearchCV, train_test_split
import numpy as np

# Data

In [2]:
df = pd.read_csv("transaction")
df.head()

Unnamed: 0.1,Unnamed: 0,id,user_id,restaurant_id,created_at
0,8246,1,3FC1CB1AEED7BE57,4F45F749D78D0EBE,2022-06-24 00:00:14.168
1,15705,2,EB3C5BB1F0E1D977,1A811BB0FAE6DA51,2022-06-24 00:00:14.668
2,22017,3,B174219506B1BF3A,49043A7EFB6BB580,2022-06-24 00:00:33.152
3,15615,4,FD33853938DD7ECE,2CBBCC8BCD69688B,2022-06-24 00:00:50.873
4,7070,5,D6AD85685EEED5AC,0445647F26E9DDFD,2022-06-24 00:00:53.170


In [3]:
print('shape:', df.shape)
print('unique user_id:', len(pd.unique(df['user_id'])))
print('unique restaurant_id:', len(pd.unique(df['restaurant_id'])))
print(df['restaurant_id'].value_counts())

shape: (30000, 5)
unique user_id: 28437
unique restaurant_id: 12598
A7FD8C847677D6B0    59
65A73F3D990FF244    54
B36C9625A8B576B0    47
53054BDA3701C783    43
0DD4153705CBC8DB    42
                    ..
D2F8D0767C4A84EE     1
B2F3FA9053256E6C     1
EC5588EB9FFA4C79     1
647D587400A9BF55     1
7F30B27851B47722     1
Name: restaurant_id, Length: 12598, dtype: int64


In [4]:
df = df[['user_id', 'restaurant_id']]
print(df.shape)
df.head()

(30000, 2)


Unnamed: 0,user_id,restaurant_id
0,3FC1CB1AEED7BE57,4F45F749D78D0EBE
1,EB3C5BB1F0E1D977,1A811BB0FAE6DA51
2,B174219506B1BF3A,49043A7EFB6BB580
3,FD33853938DD7ECE,2CBBCC8BCD69688B
4,D6AD85685EEED5AC,0445647F26E9DDFD


In [5]:
df_frequency = df.groupby(df.columns.tolist(), as_index=False).size()
print(df_frequency['size'].sum())
df_frequency.sort_values(by=['size'], ascending=False)

30000


Unnamed: 0,user_id,restaurant_id,size
25138,D77101ABBD55A46E,1231622E0D4745F7,4
20410,AF1859F2CB3D57D8,7D952972CD43E190,3
19804,AA288DC7E3C46779,A7FD8C847677D6B0,3
3163,1B26A8D907C9C331,D3E7D5ADD2CDD274,3
13668,75860D1893D29DFC,EFAB3E40F4EF201E,3
...,...,...,...
9974,554C3C0195523F9F,0E19B5CAA99B0A6A,1
9973,554C02BCCA408E30,9046073717134017,1
9972,554A852A0EFF95F2,D30CEC8BA2D99FC7,1
9971,5546D6851F843251,3F161CC8E74671A7,1


In [6]:
df_max_scaled = df_frequency.copy()

column = 'size'
df_max_scaled[column] = df_max_scaled[column] /df_max_scaled[column].abs().max()

display(df_max_scaled)

Unnamed: 0,user_id,restaurant_id,size
0,0000475F1EFEF93F,30FF5B178F133C2E,0.25
1,000155B04C8D8CC8,1684A4CC1DCAEBE6,0.25
2,0003E1FDB847FCE8,4F202F894DEF5D45,0.25
3,0003E1FDB847FCE8,E24561053B2E55EF,0.25
4,0005C2C51293545B,63FFF7BB8DDD856D,0.25
...,...,...,...
29884,FFF526DEA3D970F6,821C587D566DBD5F,0.25
29885,FFF90B1B03567F23,EAD3408E1A579D53,0.25
29886,FFFB8AD268144C72,569DB6764A5181A4,0.25
29887,FFFB8AD268144C72,A660BB1CC57C3D28,0.25


In [7]:
def create_user_ratings_df(data):
    df = data.groupby(data.columns.tolist(), as_index=False).size().reset_index()
    
    df = df.rename({'size':'freq'}, axis=1)
    return df

def surprise_df(data):
    scale = (data.freq.min(), data.freq.max())
    reader = Reader(rating_scale=scale)

    df = Dataset.load_from_df(data[['user_id',
                                    'restaurant_id',
                                    'freq']], reader)
    
    return df


In [8]:
user_ratings_df = create_user_ratings_df(df)
user_ratings_df.sort_values(by=['freq'], ascending=False)
# user_ratings_df.head()

Unnamed: 0,index,user_id,restaurant_id,freq
25138,25138,D77101ABBD55A46E,1231622E0D4745F7,4
20410,20410,AF1859F2CB3D57D8,7D952972CD43E190,3
19804,19804,AA288DC7E3C46779,A7FD8C847677D6B0,3
3163,3163,1B26A8D907C9C331,D3E7D5ADD2CDD274,3
13668,13668,75860D1893D29DFC,EFAB3E40F4EF201E,3
...,...,...,...,...
9974,9974,554C3C0195523F9F,0E19B5CAA99B0A6A,1
9973,9973,554C02BCCA408E30,9046073717134017,1
9972,9972,554A852A0EFF95F2,D30CEC8BA2D99FC7,1
9971,9971,5546D6851F843251,3F161CC8E74671A7,1


In [9]:
user_ratings_matrix = surprise_df(user_ratings_df)

# Model

In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=19)
def model_framework(train_data):
    #store the rmse values for each fold in the k-fold loop 
    normp_rmse, svd_rmse, knn_rmse, co_rmse, slope_rmse = [],[],[],[],[]

    for trainset, testset in kf.split(train_data):
        
        #baseline
        normp = NormalPredictor()
        normp.fit(trainset)
        normp_pred = normp.test(testset)
        normp_rmse.append(accuracy.rmse(normp_pred,verbose=False))
        
        #svd
        svd = SVD(n_factors=30, n_epochs=50,biased=True, lr_all=0.005, reg_all=0.4, verbose=False)
        svd.fit(trainset)
        svd_pred = svd.test(testset)
        svd_rmse.append(accuracy.rmse(svd_pred,verbose=False))
        
        #knn
        knn = KNNBasic(k=40,sim_options={'name': 'cosine', 'user_based': False}, verbose=False) 
        knn.fit(trainset)
        knn_pred = knn.test(testset)
        knn_rmse.append(accuracy.rmse(knn_pred,verbose=False))
        
        #co_clustering
        co = CoClustering(n_cltr_u=3,n_cltr_i=3,n_epochs=20)         
        co.fit(trainset)
        co_pred = co.test(testset)
        co_rmse.append(accuracy.rmse(co_pred,verbose=False))

    
    mean_rmses = [np.mean(normp_rmse),
                  np.mean(svd_rmse),
                  np.mean(knn_rmse),
                  np.mean(co_rmse)]
    
    model_names = ['baseline','svd','knn','coclustering']
    compare_df = pd.DataFrame(mean_rmses, columns=['RMSE'], index=model_names)
    
    return compare_df


In [11]:
comparison_df = model_framework(user_ratings_matrix)
comparison_df.head()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  co.fit(trainset)


Unnamed: 0,RMSE
baseline,0.078586
svd,0.064294
knn,0.065445
coclustering,0.06844


In [12]:
def gridsearch(data, model, param_grid):
    gs = GridSearchCV(model, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)

    new_params = gs.best_params['rmse']
    best_score = gs.best_score['rmse']

    print("Best score:", best_score)
    print("Best params:", new_params)
    
    return new_params, best_score

In [13]:
svd_param_grid = {'n_factors': [5, 10, 15, 20, 25, 30, 40, 50, 100],
                  'n_epochs': [5, 10, 15, 20, 25, 30, 40, 50],       
                  'lr_all': [0.002, 0.005, 0.01],
                  'reg_all':[0.02, 0.1, 0.4]}

svd_params, svd_score = gridsearch(user_ratings_matrix, SVD, svd_param_grid)

Best score: 0.06356661891897102
Best params: {'n_factors': 5, 'n_epochs': 25, 'lr_all': 0.002, 'reg_all': 0.1}


In [40]:
svd_param_grid = {'n_factors': [5],
                    'n_epochs': [5],       
                    'lr_all': [0.001],
                    'reg_all':[0.02]
                }

svd_params, svd_score = gridsearch(user_ratings_matrix, SVD, svd_param_grid)

Best score: 0.0637888102385935
Best params: {'n_factors': 5, 'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.02}


In [27]:
svd_param_grid = {'n_factors': [5, 15, 25, 35, 50],
                  'n_epochs': [5, 15, 25, 35, 50],       
                  'lr_all': [0.001, 0.002, 0.005, 0.01],
                  'reg_all':[0.02, 0.1, 0.4]}

svd_params, svd_score = gridsearch(user_ratings_matrix, SVD, svd_param_grid)

Best score: 0.06372009709708035
Best params: {'n_factors': 5, 'n_epochs': 25, 'lr_all': 0.001, 'reg_all': 0.1}


In [None]:
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs_KNW = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3, joblib_verbose=0)
gs_KNW.fit(user_ratings_matrix)

print("Best score:", gs_KNW.best_params['rmse'])
print("Best params:", gs_KNW.best_score['rmse'])

# Final Model

In [16]:
def final_model_test(params, train_set, test_set):
    svd = SVD(n_factors=params['n_factors'], 
                    n_epochs=params['n_epochs'],
                    lr_all=params['lr_all'], 
                    reg_all=params['reg_all'])

    svd.fit(train_set)
    
    predictions = svd.test(test_set)
    rmse = accuracy.rmse(predictions,verbose=False)
            
    return predictions, rmse

def final_model(params, train_set):
    svd = SVD(n_factors=params['n_factors'], 
                    n_epochs=params['n_epochs'],
                    lr_all=params['lr_all'], 
                    reg_all=params['reg_all'])

    svd.fit(train_set)

    return svd

In [17]:
params = {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}
train_set, test_set = train_test_split(user_ratings_matrix, test_size=0.2, random_state=19)
final_predictions, model_rmse = final_model_test(params, train_set, test_set)

In [34]:
print(model_rmse)

0.06314764616936226


In [18]:
results = pd.DataFrame(final_predictions, columns=['user_id', 'restaurant_id', 'freq', 'model_est', 'details'])     
results['err'] = abs(results.model_est - results.freq)
results.head()

Unnamed: 0,user_id,restaurant_id,freq,model_est,details,err
0,3CAB8B730E5CB641,7EA8E2E3759C0A15,1.0,1.003722,{'was_impossible': False},0.003722
1,3D5113549F2CDE7A,086011E5A4B48367,1.0,1.004437,{'was_impossible': False},0.004437
2,6EA8ABCB3252F1B1,1FBCF633F42E994C,1.0,1.001672,{'was_impossible': False},0.001672
3,9832DB619D94B783,7D7F5DA24EDA91B4,1.0,1.004236,{'was_impossible': False},0.004236
4,18CF6C59A63F654B,D94BC121EE0709F0,1.0,1.003168,{'was_impossible': False},0.003168


In [19]:
params = {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.1}
trainset = user_ratings_matrix.build_full_trainset()
final_model = final_model(params, trainset)

# Recommend 

In [20]:
predict_result = final_model.predict('554C3C0195523F9F', 'A3951E5FC2AC4A64')
print(predict_result)

user: 554C3C0195523F9F item: A3951E5FC2AC4A64 r_ui = None   est = 1.03   {'was_impossible': False}


In [21]:
res_id_list = pd.unique(df['restaurant_id'])
print(len(res_id_list))
print(res_id_list[:3])

12598
['4F45F749D78D0EBE' '1A811BB0FAE6DA51' '49043A7EFB6BB580']


In [22]:
user_interest = dict()
for res_id in res_id_list:
    predict_result = final_model.predict('554C3C0195523F9F', res_id)
    user_interest[res_id] = predict_result

In [23]:
import pprint
pprint.pprint(user_interest)

{'0000478C5135FB88': Prediction(uid='554C3C0195523F9F', iid='0000478C5135FB88', r_ui=None, est=1.0285707918390603, details={'was_impossible': False}),
 '00019D3A753C584F': Prediction(uid='554C3C0195523F9F', iid='00019D3A753C584F', r_ui=None, est=1, details={'was_impossible': False}),
 '00056BF323DDB5B7': Prediction(uid='554C3C0195523F9F', iid='00056BF323DDB5B7', r_ui=None, est=1.0186934433988308, details={'was_impossible': False}),
 '0009900BA071F707': Prediction(uid='554C3C0195523F9F', iid='0009900BA071F707', r_ui=None, est=1, details={'was_impossible': False}),
 '000A13855F4BC304': Prediction(uid='554C3C0195523F9F', iid='000A13855F4BC304', r_ui=None, est=1.0111112739770138, details={'was_impossible': False}),
 '000CAFE43368EAA0': Prediction(uid='554C3C0195523F9F', iid='000CAFE43368EAA0', r_ui=None, est=1.0123474787230855, details={'was_impossible': False}),
 '001313DFD58862A0': Prediction(uid='554C3C0195523F9F', iid='001313DFD58862A0', r_ui=None, est=1, details={'was_impossible': Fal

In [24]:
saved_model_name = 'model.pickle'
dump.dump(saved_model_name, None, final_model)

In [25]:
pred, loaded_model = dump.load(saved_model_name)

user_interest = dict()
for res_id in res_id_list:
    predict_result = loaded_model.predict('554C3C0195523F9F', res_id)
    user_interest[res_id] = predict_result

pprint.pprint(user_interest)

{'0000478C5135FB88': Prediction(uid='554C3C0195523F9F', iid='0000478C5135FB88', r_ui=None, est=1.0285707918390603, details={'was_impossible': False}),
 '00019D3A753C584F': Prediction(uid='554C3C0195523F9F', iid='00019D3A753C584F', r_ui=None, est=1, details={'was_impossible': False}),
 '00056BF323DDB5B7': Prediction(uid='554C3C0195523F9F', iid='00056BF323DDB5B7', r_ui=None, est=1.0186934433988308, details={'was_impossible': False}),
 '0009900BA071F707': Prediction(uid='554C3C0195523F9F', iid='0009900BA071F707', r_ui=None, est=1, details={'was_impossible': False}),
 '000A13855F4BC304': Prediction(uid='554C3C0195523F9F', iid='000A13855F4BC304', r_ui=None, est=1.0111112739770138, details={'was_impossible': False}),
 '000CAFE43368EAA0': Prediction(uid='554C3C0195523F9F', iid='000CAFE43368EAA0', r_ui=None, est=1.0123474787230855, details={'was_impossible': False}),
 '001313DFD58862A0': Prediction(uid='554C3C0195523F9F', iid='001313DFD58862A0', r_ui=None, est=1, details={'was_impossible': Fal