In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import SVD, SlopeOne, NMF, KNNWithMeans
from surprise.accuracy import rmse
from surprise import Dataset, accuracy
from surprise.model_selection import train_test_split
import os
import time
from collections import defaultdict

In [2]:
# import dataset

restaurant_df = pd.read_csv('restaurant.csv')
rate_df = pd.read_csv('user_reviews.csv')

In [3]:
restaurant_df.head(5)

Unnamed: 0,restaurant_id,restaurant_name,neighborhood,review_count,price,type,average_rating
0,_IFMCrheTi12RqiR7jvJUg,Little Rey,Westside / Home Park,225,2.0,mexican,4.0
1,0qJXSTFBnNoh5lMD3LgE0Q,Bacchanalia,Westside / Home Park,660,4.0,american,4.5
2,0ukhgBeLATOo8b8lDobRGg,Roasters,Westside / Home Park,369,2.0,american,4.0
3,14nIlMhxTXWnJg0Glrr-PQ,Top Spice,Morningside / Lenox Park,471,2.0,asian,4.0
4,1i63faxXI1TQ7pNlLp3IPQ,Cypress Street Pint & Plate,Westside / Home Park,1244,2.0,american,4.0


In [4]:
rate_df.head(5)

Unnamed: 0,restaurant_name,restaurant_id,user_id,friends,number_reviews,photos,area_AL,elite_user,date,rating,rating_mean,standard_dev
0,Fox Bros. Bar-B-Q,u-4wti774tFcYRLuQrnHEg,__1kMkvHH-kWVeokwZSFXw,115,3,,1,0,3/18/2016,5,4.268636,
1,South City Kitchen Midtown,eG-UO83g_5zDk70FIJbm2w,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,4.336328,
2,The Vortex Bar And Grill - Midtown,Z2qMwUhnGt_2pA9uQbS7Uw,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,3.946784,
3,Fat Matt's Rib Shack,ALYQ-uM_uMkKbkXlhWcgbQ,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,5,4.178538,
4,Cypress Street Pint & Plate,1i63faxXI1TQ7pNlLp3IPQ,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,4.028112,


In [5]:
print('restaurant: ', restaurant_df.shape)
print('user_reviews: ', rate_df.shape)

restaurant:  (97, 7)
user_reviews:  (71432, 12)


In [6]:
# generate different size of datasets
rate_df_1 = rate_df[0:2000]
rate_df_2 = rate_df[0:20000]
rate_df_3 = rate_df[0:70000]

In [7]:
print('user_reviews_1: ', rate_df_1.shape)
print('user_reviews_2: ', rate_df_2.shape)
print('user_reviews_3: ', rate_df_3.shape)

user_reviews_1:  (2000, 12)
user_reviews_2:  (20000, 12)
user_reviews_3:  (70000, 12)


In [8]:
# data merge

full_df = rate_df_3.merge(restaurant_df, how='left', left_on=['restaurant_id'], right_on=['restaurant_id'])
full_df.head()

Unnamed: 0,restaurant_name_x,restaurant_id,user_id,friends,number_reviews,photos,area_AL,elite_user,date,rating,rating_mean,standard_dev,restaurant_name_y,neighborhood,review_count,price,type,average_rating
0,Fox Bros. Bar-B-Q,u-4wti774tFcYRLuQrnHEg,__1kMkvHH-kWVeokwZSFXw,115,3,,1,0,3/18/2016,5,4.268636,,Fox Bros. Bar-B-Q,Poncey - Highland,3690,2.0,meat,4.5
1,South City Kitchen Midtown,eG-UO83g_5zDk70FIJbm2w,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,4.336328,,South City Kitchen Midtown,Westside / Home Park,2563,2.0,american,4.5
2,The Vortex Bar And Grill - Midtown,Z2qMwUhnGt_2pA9uQbS7Uw,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,3.946784,,The Vortex Bar And Grill - Midtown,Westside / Home Park,1710,2.0,fastfood,4.0
3,Fat Matt's Rib Shack,ALYQ-uM_uMkKbkXlhWcgbQ,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,5,4.178538,,Fat Matt's Rib Shack,Westside / Home Park,2107,2.0,meat,4.0
4,Cypress Street Pint & Plate,1i63faxXI1TQ7pNlLp3IPQ,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,4.028112,,Cypress Street Pint & Plate,Westside / Home Park,1244,2.0,american,4.0


In [9]:
# data Analysis

num_of_rate_per_user = full_df.groupby(by='user_id')['rating'].count().sort_values(ascending=True)
num_of_rate_per_user.head()

user_id
UvGT0JSSvPVMHQmIrx0E1w    1
cEFXYRhTsvfOu7s3K-GSoQ    1
cEHSnPusrGSR7uqks8q07Q    1
cEKG-JvH8BNJrncIHwAdCA    1
cEOOLhNWQ-2qySQZK06toQ    1
Name: rating, dtype: int64

In [10]:
full_df = full_df[['user_id', 'restaurant_name_x', 'rating']]
full_df.head()

Unnamed: 0,user_id,restaurant_name_x,rating
0,__1kMkvHH-kWVeokwZSFXw,Fox Bros. Bar-B-Q,5
1,__48dJJcPvNgqUlEozwtpw,South City Kitchen Midtown,4
2,__48dJJcPvNgqUlEozwtpw,The Vortex Bar And Grill - Midtown,4
3,__48dJJcPvNgqUlEozwtpw,Fat Matt's Rib Shack,5
4,__48dJJcPvNgqUlEozwtpw,Cypress Street Pint & Plate,4


In [11]:
print(full_df.shape)

(70000, 3)


In [12]:
#Getting the new dataframe which contains users who has given 2 or more ratings

data=full_df.groupby("user_id").filter(lambda x:x['rating'].count() >=2)
data.head(10)

Unnamed: 0,user_id,restaurant_name_x,rating
1,__48dJJcPvNgqUlEozwtpw,South City Kitchen Midtown,4
2,__48dJJcPvNgqUlEozwtpw,The Vortex Bar And Grill - Midtown,4
3,__48dJJcPvNgqUlEozwtpw,Fat Matt's Rib Shack,5
4,__48dJJcPvNgqUlEozwtpw,Cypress Street Pint & Plate,4
8,__bMs0nf3_hnhitK91gT4A,South City Kitchen Midtown,4
9,__bMs0nf3_hnhitK91gT4A,Atlanta Breakfast Club,5
10,__bMs0nf3_hnhitK91gT4A,Herban Fix - Vegan Kitchen,2
18,_02XN3yATdWwfMIbsGhMuQ,Slutty Vegan,5
19,_02XN3yATdWwfMIbsGhMuQ,26 Thai Kitchen & Bar,4
20,_02XN3yATdWwfMIbsGhMuQ,Sweet Georgia's Juke Joint,2


In [13]:
print(data.shape)

(33201, 3)


In [14]:
# Build collaborative filtering Models!
# Codes referenced "https://www.kaggle.com/malikasif123/amazon-reviews-recommendation-system"
# & "https://www.kaggle.com/podsyp/anime-recommendations-with-surprise"

In [15]:
#Reading the dataset using Surprise package for User Based Collaborative Filtering

reader = Reader(rating_scale=(1, 5))
data_reader_SVD = Dataset.load_from_df(data,reader)

In [16]:
# 5-fold cross validation

%time
algo = SVD(biased=False)
cross_validate(algo, data_reader_SVD, measures=['RMSE'], cv=5, verbose=False)

Wall time: 0 ns


{'test_rmse': array([1.81158657, 1.82081852, 1.83711494, 1.79867415, 1.82313441]),
 'fit_time': (1.9777064323425293,
  2.0355517864227295,
  2.2360143661499023,
  1.9657394886016846,
  1.9757134914398193),
 'test_time': (0.05385637283325195,
  0.05585074424743652,
  0.05186152458190918,
  0.07280540466308594,
  0.10471940040588379)}

In [17]:
trainset, testset = train_test_split(data_reader_SVD, test_size=0.25)

In [18]:
# Fitting and evaluating Collobarative filtering
start_time = time.time()
def accuracy_over_n_queries(n = 10):
    for i in range(n):
        predictions = algo.fit(trainset).test(testset)
        accuracy.rmse(predictions)  
    return predictions   
predictions = accuracy_over_n_queries(10)

print(time.time() - start_time)

RMSE: 1.9620
RMSE: 1.9246
RMSE: 1.8917
RMSE: 1.9242
RMSE: 1.8958
RMSE: 1.9022
RMSE: 1.8611
RMSE: 1.8584
RMSE: 1.8621
RMSE: 1.8894
22.22751522064209


In [115]:
#checking prediction

predictions[0]

Prediction(uid='dFKnDJNWx-B_E_ByZwa3jg', iid='Top Spice', r_ui=4.0, est=1, details={'was_impossible': False})

In [116]:
def get_Iu(uid):
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: 
        return 0
    
def get_Ui(iid):
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df_ = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_['Iu'] = df_.uid.apply(get_Iu)
df_['Ui'] = df_.iid.apply(get_Ui)
df_['err'] = abs(df_.est - df_.rui)
df_.sort_values(by='uid', ascending=True)
df_.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,dFKnDJNWx-B_E_ByZwa3jg,Top Spice,4.0,1.0,{'was_impossible': False},1,55,3.0
1,5NUQ5BKcRpFe4JhZ5Dm33w,Nuevo Laredo Cantina,5.0,1.114087,{'was_impossible': False},1,102,3.885913
2,BOh3_pP0Di-txZomArm4yg,True Food Kitchen - Temporarily Closed,5.0,2.089089,{'was_impossible': False},5,110,2.910911
3,9GVHc84vDE5s5JJ2kknhkA,Nuevo Laredo Cantina,3.0,1.464628,{'was_impossible': False},3,102,1.535372
4,CYqu48u1kHLtVcFtxn5Ctw,Der Biergarten,3.0,1.0,{'was_impossible': False},1,58,2.0


In [117]:
#creating function to get top 5 Product Recommendation for each user.

def get_top_n(predictions, n=5):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [118]:
top_n = get_top_n(predictions, n=3)

In [119]:
# Print the recommended items for first 50 user

count=0
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
    if(count>49):
        break
    count=count+1

dFKnDJNWx-B_E_ByZwa3jg ['Top Spice']
5NUQ5BKcRpFe4JhZ5Dm33w ['Antico Pizza', 'Nuevo Laredo Cantina']
BOh3_pP0Di-txZomArm4yg ['26 Thai Kitchen & Bar', 'True Food Kitchen - Temporarily Closed']
9GVHc84vDE5s5JJ2kknhkA ['Nuevo Laredo Cantina', 'Twisted Soul Cookhouse & Pours']
CYqu48u1kHLtVcFtxn5Ctw ['Der Biergarten']
_pkwfwQx9--_sueH4yJ6hg ['Fox Bros. Bar-B-Q', 'Atlanta Breakfast Club']
7gbbdcrC9dUj87yH2UE7Gw ['Grub Burger Bar']
5P9PkVwCt3_dwgAKnNAu2A ['True Food Kitchen - Temporarily Closed']
azCJxAOCHtJXEwmiORKKew ['Antico Pizza']
8MlBmpm6S2nxeQ5eCgl3rA ['South City Kitchen Midtown', "Gus's World Famous Fried Chicken"]
d4MIsICnmeWPdlVOKf7svg ['Farm Burger', 'Fox Bros. Bar-B-Q', 'FLIP burger boutique']
cMpJ-KeY2l4D7ZiSEk_tjA ["Mary Mac's Tea Room", "Fat Matt's Rib Shack"]
1xrSKBsEsCPb94w4ZvjpbQ ['Purnima Bangladeshi Cuisine', 'True Food Kitchen - Temporarily Closed', 'Amalfi Pizza']
AYwIrU1-VKQ8cYnjm3W0LA ['The Optimist']
cLJJoqtSwCkFM6DW3XFAbg ['True Food Kitchen - Temporarily Closed']


In [120]:
import json

# writing
json.dump(top_n, open("rate_df_3.csv", 'w'))