In [99]:
# import modules
import numpy as np
import pandas as pd
import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
import time

In [4]:
# import dataset

restaurant_df = pd.read_csv('restaurant.csv')
rate_df = pd.read_csv('user_reviews.csv')

In [5]:
restaurant_df.head()

Unnamed: 0,restaurant_id,restaurant_name,neighborhood,review_count,price,type,average_rating
0,_IFMCrheTi12RqiR7jvJUg,Little Rey,Westside / Home Park,225,2.0,mexican,4.0
1,0qJXSTFBnNoh5lMD3LgE0Q,Bacchanalia,Westside / Home Park,660,4.0,american,4.5
2,0ukhgBeLATOo8b8lDobRGg,Roasters,Westside / Home Park,369,2.0,american,4.0
3,14nIlMhxTXWnJg0Glrr-PQ,Top Spice,Morningside / Lenox Park,471,2.0,asian,4.0
4,1i63faxXI1TQ7pNlLp3IPQ,Cypress Street Pint & Plate,Westside / Home Park,1244,2.0,american,4.0


In [6]:
rate_df.head(5)

Unnamed: 0,restaurant_name,restaurant_id,user_id,friends,number_reviews,photos,area_AL,elite_user,date,rating,rating_mean,standard_dev
0,Fox Bros. Bar-B-Q,u-4wti774tFcYRLuQrnHEg,__1kMkvHH-kWVeokwZSFXw,115,3,,1,0,3/18/2016,5,4.268636,
1,South City Kitchen Midtown,eG-UO83g_5zDk70FIJbm2w,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,4.336328,
2,The Vortex Bar And Grill - Midtown,Z2qMwUhnGt_2pA9uQbS7Uw,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,3.946784,
3,Fat Matt's Rib Shack,ALYQ-uM_uMkKbkXlhWcgbQ,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,5,4.178538,
4,Cypress Street Pint & Plate,1i63faxXI1TQ7pNlLp3IPQ,__48dJJcPvNgqUlEozwtpw,105,318,1219.0,0,0,8/23/2014,4,4.028112,


In [7]:
print('restaurant: ', restaurant_df.shape)
print('user_reviews: ', rate_df.shape)

restaurant:  (97, 7)
user_reviews:  (71432, 12)


In [8]:
# generate different size of datasets
rate_df_1 = rate_df[0:2000]
rate_df_2 = rate_df[0:20000]
rate_df_3 = rate_df[0:70000]

In [9]:
print('user_reviews_1: ', rate_df_1.shape)
print('user_reviews_2: ', rate_df_2.shape)
print('user_reviews_3: ', rate_df_3.shape)

user_reviews_1:  (2000, 12)
user_reviews_2:  (20000, 12)
user_reviews_3:  (70000, 12)


In [10]:
# data Analysis

num_of_rate_per_user = rate_df_3.groupby(by='user_id')['rating'].count().sort_values(ascending=True)
num_of_rate_per_user.head(10)

user_id
UvGT0JSSvPVMHQmIrx0E1w    1
cEFXYRhTsvfOu7s3K-GSoQ    1
cEHSnPusrGSR7uqks8q07Q    1
cEKG-JvH8BNJrncIHwAdCA    1
cEOOLhNWQ-2qySQZK06toQ    1
cEg5z0laRpLnn-0iEUHUUw    1
cEgtc2023TXJrfYv49OMXw    1
cEoUXy-o7d5idvmL_642Jg    1
cE0R1C-Aacg4R3gfxLy48Q    1
cEtOc0LaFcpjQ8a8yg-yeA    1
Name: rating, dtype: int64

In [11]:
full_df = rate_df_3[['user_id', 'restaurant_name', 'rating']]
full_df.head()

Unnamed: 0,user_id,restaurant_name,rating
0,__1kMkvHH-kWVeokwZSFXw,Fox Bros. Bar-B-Q,5
1,__48dJJcPvNgqUlEozwtpw,South City Kitchen Midtown,4
2,__48dJJcPvNgqUlEozwtpw,The Vortex Bar And Grill - Midtown,4
3,__48dJJcPvNgqUlEozwtpw,Fat Matt's Rib Shack,5
4,__48dJJcPvNgqUlEozwtpw,Cypress Street Pint & Plate,4


In [12]:
print(full_df.shape)

(70000, 3)


In [90]:
#Getting the new dataframe which contains users who has given 2 or more ratings

data=full_df.groupby("user_id").filter(lambda x:x['rating'].count() >=2)
data.sort_values(by='user_id', ascending=True)
data.head(10)

Unnamed: 0,user_id,restaurant_name,rating
1,__48dJJcPvNgqUlEozwtpw,South City Kitchen Midtown,4
2,__48dJJcPvNgqUlEozwtpw,The Vortex Bar And Grill - Midtown,4
3,__48dJJcPvNgqUlEozwtpw,Fat Matt's Rib Shack,5
4,__48dJJcPvNgqUlEozwtpw,Cypress Street Pint & Plate,4
8,__bMs0nf3_hnhitK91gT4A,South City Kitchen Midtown,4
9,__bMs0nf3_hnhitK91gT4A,Atlanta Breakfast Club,5
10,__bMs0nf3_hnhitK91gT4A,Herban Fix - Vegan Kitchen,2
18,_02XN3yATdWwfMIbsGhMuQ,Slutty Vegan,5
19,_02XN3yATdWwfMIbsGhMuQ,26 Thai Kitchen & Bar,4
20,_02XN3yATdWwfMIbsGhMuQ,Sweet Georgia's Juke Joint,2


In [91]:
print(data.shape)

(33201, 3)


In [92]:
df_user=rate_df_3[['user_id', 'number_reviews', 'elite_user', 'rating']].groupby("user_id").filter(lambda x:x['rating'].count() >=2)
df_user = df_user.drop_duplicates(subset='user_id', keep="first")

df_user.head(10)

Unnamed: 0,user_id,number_reviews,elite_user,rating
1,__48dJJcPvNgqUlEozwtpw,318,0,4
8,__bMs0nf3_hnhitK91gT4A,241,1,4
18,_02XN3yATdWwfMIbsGhMuQ,158,1,5
24,_0aVQ4FlWgbsYt5weko4RA,18,0,5
32,_0cWueK8TnVQgzDAHYzmqA,342,0,4
35,_0Fy-RA342PdGvLjsaPmtQ,148,0,3
39,_0p-IcWltylzqAH4CioExA,400,1,4
54,_1cGXPU3pUKKyjUEsU92qQ,17,0,5
62,_1wSwNKhS4WOYqOksOE1zQ,171,0,4
64,_1wTD-D_I-UjofcTHgANDA,168,0,4


In [93]:
df_user.shape

(10286, 4)

In [94]:
# Codes referenced "https://making.lyst.com/lightfm/docs/examples/dataset.html" &
# https://www.kaggle.com/niyamatalmass/lightfm-hybrid-recommendation-system/execution#LightFM-Python-Library"!!!

In [95]:
def generate_int_id(dataframe, id_col_name):
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': id_col_name})

In [96]:
# generating unique integer id for users and restaurants

restaurant_df = generate_int_id(restaurant_df, 'restaurant_int')
df_user = generate_int_id(df_user, 'user_int')
data = generate_int_id(data, 'restaurant_int')
data = generate_int_id(data, 'user_int')

In [97]:
restaurant_df.head()

Unnamed: 0,restaurant_id,restaurant_name,neighborhood,review_count,price,type,average_rating,restaurant_int,restaurant_int.1
0,_IFMCrheTi12RqiR7jvJUg,Little Rey,Westside / Home Park,225,2.0,mexican,4.0,0,0
1,0qJXSTFBnNoh5lMD3LgE0Q,Bacchanalia,Westside / Home Park,660,4.0,american,4.5,1,1
2,0ukhgBeLATOo8b8lDobRGg,Roasters,Westside / Home Park,369,2.0,american,4.0,2,2
3,14nIlMhxTXWnJg0Glrr-PQ,Top Spice,Morningside / Lenox Park,471,2.0,asian,4.0,3,3
4,1i63faxXI1TQ7pNlLp3IPQ,Cypress Street Pint & Plate,Westside / Home Park,1244,2.0,american,4.0,4,4


In [98]:
data.head()

Unnamed: 0,user_id,restaurant_name,rating,restaurant_int,user_int
0,__48dJJcPvNgqUlEozwtpw,South City Kitchen Midtown,4,0,0
1,__48dJJcPvNgqUlEozwtpw,The Vortex Bar And Grill - Midtown,4,1,1
2,__48dJJcPvNgqUlEozwtpw,Fat Matt's Rib Shack,5,2,2
3,__48dJJcPvNgqUlEozwtpw,Cypress Street Pint & Plate,4,3,3
4,__bMs0nf3_hnhitK91gT4A,South City Kitchen Midtown,4,4,4


In [25]:
from scipy.sparse import coo_matrix
import sys

u= data['user_int']
s= data['restaurant_int']
t= data['rating']

lu= u.nunique()
ls= s.nunique()

matrix= coo_matrix((t,(u,s)), shape=(lu, ls))
print(sys.getsizeof(matrix))

48


In [161]:
# Build lifhtFM collaborative Filtering model

In [114]:
# Default "Learning to Rank" Models with LightFM

model = LightFM(learning_rate=0.05, loss='warp')

In [115]:
from lightfm.cross_validation import random_train_test_split
train, test= random_train_test_split(matrix, test_percentage=0.2)

In [121]:
from lightfm.evaluation import auc_score, precision_at_k
start_time = time.time()
scores=[]
for e in range(10):
    model.fit_partial(train, epochs=10, num_threads=4)
    auc_train= auc_score(model, train, num_threads=4).mean()
    auc_test= auc_score(model, test, num_threads=4).mean()
    scores.append((auc_train, auc_test))
    
scores = np.array(scores)
print(time.time() - start_time)

586.130300283432


In [132]:
print(scores)

[[0.9999733  0.09988473]
 [0.9999754  0.0998848 ]
 [0.9999767  0.09988488]
 [0.99997765 0.09988491]
 [0.99997866 0.09988503]
 [0.9999801  0.09988509]
 [0.99998105 0.09988516]
 [0.99998176 0.09988522]
 [0.99998266 0.09988538]
 [0.999983   0.09988534]]


In [157]:
# prediction 

prediction= model.predict(np.array(data['user_int']), np.array(data['restaurant_int']))
preds= pd.DataFrame(zip(prediction, data['user_id'],data['restaurant_name'].tolist()), columns=['preds', 'user_id', 'restaurant_name'])
preds= preds.sort_values('preds', ascending= False)

In [158]:
#creating function to get top 5 Product Recommendation for each user.

preds

Unnamed: 0,preds,user_id,restaurant_name
31154,2.720504,xToLPRBZE9gSDS16Cf68FQ,Barcelona Inman Park
11127,2.685642,F1dzz6HTID6PLyaZeLD10Q,Antico Pizza
8401,2.679242,cjzfJV84KRuvWrht9oyNKQ,Ginya Izakaya
13113,2.677332,gvOCruiobHFJdF3rxTRgpg,Sweet Georgia's Juke Joint
26096,2.672404,SzQLyMhyfvVe2EwlHgA1pg,True Food Kitchen - Temporarily Closed
...,...,...,...
25306,-0.750487,SdLpyQrCi9uHBHAUukbmrQ,Two Urban Licks
5307,-0.751567,91TK74T3vZnIiL9GHY_QTQ,5Church Atlanta
19010,-0.756937,mfCtTH4SsOHEWNjWWKrgRw,Cypress Street Pint & Plate
21720,-0.772339,OvslvzLFz1boCXm2weA_Fw,Cafe Agora
