# Yelp Data Challenge 

## Part IV - Restaurant Recommender

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [2]:
df = pd.read_csv('yelp_dataset/last_3_years_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-02-14,0,VETXTwMw6qxzOVDlXfe6Tg,5,went for dinner tonight. Amazing my husband ha...,0,ymlnR8UeFvB4FZL56tCZsA
1,6,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-12-04,0,S8-8uZ7fa5YbjnEtaW15ng,5,This was an amazing dinning experience! ORDER ...,0,9pSSL6X6lFpY3FCRLEH3og
2,14,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2016-08-22,1,1nK5w0VNfDlnR3bOz13dJQ,5,My husband and I went there for lunch on a Sat...,1,gm8nNoA3uB4In5o_Hxpq3g
3,15,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2016-09-13,0,N1Z93BthdJ7FT2p5S22jIA,3,Went for a nice anniversary dinner. Researched...,0,CEtidlXNyQzgJSdF1ubPFw
4,27,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2016-08-08,0,ir-EVhHyWna7KqYWtj660g,5,Hands down the best meal and service I have ev...,0,9_BhDyzJYf2JwTD9TyXJ4g


## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [4]:
# Get business_id, user_id, stars for recommender
columns_selected = [u'business_id',u'user_id',u'stars']
df_recommend = df[columns_selected]

In [5]:
df_recommend.head(10)

Unnamed: 0,business_id,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,ymlnR8UeFvB4FZL56tCZsA,5
1,--9e1ONYQuAa-CB_Rrw7Tw,9pSSL6X6lFpY3FCRLEH3og,5
2,--9e1ONYQuAa-CB_Rrw7Tw,gm8nNoA3uB4In5o_Hxpq3g,5
3,--9e1ONYQuAa-CB_Rrw7Tw,CEtidlXNyQzgJSdF1ubPFw,3
4,--9e1ONYQuAa-CB_Rrw7Tw,9_BhDyzJYf2JwTD9TyXJ4g,5
5,--9e1ONYQuAa-CB_Rrw7Tw,cZVQGCZ_fHtTdfiyGVJPdg,5
6,--9e1ONYQuAa-CB_Rrw7Tw,rhP3yu4Sxpj9OXgeY0VpLg,5
7,--9e1ONYQuAa-CB_Rrw7Tw,AZcrPTUbzOEvy-gr-2jZCw,5
8,--9e1ONYQuAa-CB_Rrw7Tw,li2cBZl60vgqihDJJG7jeA,2
9,--9e1ONYQuAa-CB_Rrw7Tw,GQWk8vgYGlN9hp0XP0V05w,5


In [6]:
df_recommend['stars'].value_counts()

5    269295
4    114025
1     71878
3     59248
2     44441
Name: stars, dtype: int64

In [7]:
df_recommend['business_id'].nunique()

4929

In [9]:
user_reviews = df_recommend.groupby('user_id').business_id.nunique()

In [10]:
user_reviews.describe()

count    249410.000000
mean          2.240836
std           4.584291
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         652.000000
Name: business_id, dtype: float64

In [11]:
user_reviews.quantile(q=0.95)

6.0

#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyways?

In [42]:
# To be implemented
df_copy = df.groupby(['user_id']).size().reset_index(name='counts')

In [52]:
counts_condition = df_copy['counts'] >= 66
select_user_id = set(df_copy[counts_condition]['user_id'].values)

In [53]:
def keep(row):
    if row['user_id'] in select_user_id:
        val = 1
    else:
        val = 0
    return val
df['keep'] = df.apply(keep,axis = 1)

In [54]:
condition = df['keep'] == 1
df_final = df[condition]

In [55]:
df_final.shape

(18765, 14)

In [56]:
df.shape

(558887, 14)

In [57]:
df_final.reset_index()

Unnamed: 0.1,index,Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id,keep
0,67,321,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,1,2017-07-09,0,bPV5ArkAOS106QrvRZlb9A,4,Went for restaurant week lunch. Decent cocktai...,0,_7PfR6Tvh2xTbiVi1GELoQ,1
1,96,417,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,37,2018-02-21,23,BvPfUYKwBC-pe2gLGEll0w,4,Decided to celebrate V Day here as we haven't ...,40,LNPyE5X1K5B_flcTCH9StQ,1
2,221,788,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,1,2017-07-02,2,jF5o9G-bSLpy3W-tJmFfRw,5,I think this place can serve bologna sandwiche...,1,D3fOi-5w4intrUTWXzIQew,1
3,228,813,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-10-10,0,bgKXpVT25v8sNQqtVdYWGg,5,"Fantastic food. Great ambiance, service, and ...",2,JaqcCU3nxReTW2cBLHounA,1
4,254,880,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2018-06-30,0,IYhkUW6YOr3kB2Aky06_uw,5,Didn't expect to enjoy dining here as much as ...,0,NrpzAH3EoNhWUR8OysUhBQ,1
5,292,999,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2018-03-06,0,8uDohBge3oaAE68_5RiEUw,5,"Had a corporate lunch here, service and food a...",3,8e2KqFt1W67MA3ah09IYbw,1
6,323,1092,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,1,2015-11-30,1,eA3hnRdUFMkA5UJyIMJd0Q,4,My husband and I ate here for Thanksgiving din...,1,3nIuSCZk5f_2WWYMLN7h3w,1
7,403,1341,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,5,2017-04-23,3,2Sk-2jlmv0jj4RjGiGPbEw,4,Ask ten local 'foodies' to name their favorite...,5,U4INQZOPSUaj8hMjLlZ3KA,1
8,457,1509,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,6,2017-06-29,3,ezRKl2jzWD2bViAR8TjcLw,5,Back to Delmonico three more times for great L...,7,s2o_JsABvrZVm_T03qrBUw,1
9,575,1665,-1m9o3vGRA8IBPNvNqKLmA,Bavette's Steakhouse & Bar,"African, Restaurants, Bars, American (New), Co...",4.5,2,2018-01-14,1,9PBqbv620RoofjTF0mfrDA,3,Excited to go to the new bavette at the Montec...,4,8e2KqFt1W67MA3ah09IYbw,1


#### Create utility matrix from records

In [19]:
# To be implemented
df_utility = pd.pivot_table(data=df_final, 
                            values='stars', 
                            index='user_id', 
                            columns='business_id', 
                            fill_value=0)

In [20]:
df_utility.shape

(193, 3297)

In [21]:
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,-Bv-HHUs8aHzDrdWcZHn8w,-C8sSrFqaCxp51pyo-fQLQ,...,zjvnqTjBp56NhMp1GrlO5g,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zr42_UsWfaIF-rcp37OpwA,zsQk990PubOHjr1YcLkQFw,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg,zx_j6OuuHHa2afVoAZuLpA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-EJorVxe7h2GSxdiRyMmDA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
-xDW3gYiYaoeVASXywTPgw,0,0,0,0,0,4,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
06o1DmiBoiyxI2q3v2QRbg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0Md3sO3sRR3GE4eQLtqZbw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0hZfE2He1YssM_wEvKhfcA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df_mat = df_utility.as_matrix()

In [23]:
df_mat.shape,df_utility.shape

((193, 3297), (193, 3297))

In [24]:
from scipy import sparse
ratings_mat = sparse.csr_matrix(df_mat)

In [25]:
ratings_mat

<193x3297 sparse matrix of type '<type 'numpy.int64'>'
	with 18765 stored elements in Compressed Sparse Row format>

## 2. Item-Item similarity recommender

### Let's reuse the ItemItemRecommender class derived from previous exercise

Hint: we need to make modification to accommodate the dense numpy array

In [26]:
# To be implemented
utility_mat = ratings_mat

In [28]:
# Item-Item Similarity Matrix
from sklearn.metrics.pairwise import cosine_similarity
item_sim_mat = cosine_similarity(utility_mat.T)

In [29]:
least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)

# Neighborhoods
neighborhood_size = 75
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [30]:
neighborhoods.shape

(3297, 75)

In [60]:
# Make a prediction for one lucky user
# Let's pick a lucky user
user_id = 99

In [61]:
from time import time
n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]

start_time = time()
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    out[item_to_rate] = ratings_mat[user_id, relevant_items] * \
        item_sim_mat[item_to_rate, relevant_items] / \
        item_sim_mat[item_to_rate, relevant_items].sum()


pred_ratings = np.nan_to_num(out)
print(pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))

  del sys.path[0]


[4.         4.29995501 4.         ... 4.27821747 4.30668705 4.65757359]
Execution time: 1.766231 seconds


In [62]:
# Get final recommendation
# Recommend n choices
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

# Find items that have been rated by user
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[1608, 213, 2749, 2805, 2645, 2472, 870, 1704, 1187, 686]

In [63]:
business_id_lst = df_final['business_id'].unique()
business_id_dict = {k: v for v, k in enumerate(business_id_lst)}
df_final['business_id_map'] = df_final.apply(lambda x: business_id_dict[x['business_id']], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [64]:
business_id_map_top_n = unrated_items_by_pred_rating[:n]
rec_business_name_lst = []
for id_map in business_id_map_top_n:
    #print id_map
    rec_business_name_lst.append(df_final[df_final['business_id_map'] == id_map]['name'].unique()[0])

rec_business_name_lst

['Aloha Kitchen',
 "Pt's Gold",
 "Freed's Bakery",
 'Tiabi Coffee & Waffle',
 'The Westin Las Vegas Hotel & Spa',
 'Burger King',
 "Rocco's NY Pizzeria",
 "Denny's",
 'Yui Edomae Sushi',
 "Tony Roma's"]

## 3. Matrix Factorization recommender

##### Compare two of the methods demoed in Practice Class: sklearn NMF, sklearn TruncatedSVD, or GraphLab
##### *Extra points for using GraphLab

##### NMF

In [74]:
from sklearn.decomposition import NMF

def fit_nmf(M,k):
    nmf = NMF(n_components=k)
    nmf.fit(M)
    W = nmf.transform(M);
    H = nmf.components_;
    err = nmf.reconstruction_err_
    return W,H,err

# decompose
W,H,err = fit_nmf(ratings_mat,169)
print(err)
print(W.shape,H.shape)

136.2899470356833
((193, 169), (169, 3297))


In [75]:
# reconstruct
ratings_mat_fitted = W.dot(H)
errs = np.array((ratings_mat-ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat.todense()).flatten()).squeeze()>0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()
print(mse)
print(average_abs_err)

0.7096680222896242
0.30028216737130964


In [76]:
# get recommendations for one user
user_id = 99
n = 10

pred_ratings = ratings_mat_fitted[user_id,:]
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[193, 1209, 1713, 623, 499, 139, 2235, 1086, 1574, 3245]

In [77]:
business_id_map_top_n_NVM = unrated_items_by_pred_rating[:n]
rec_business_name_lst_NVM = []
for id_map in business_id_map_top_n_NVM:
    #print id_map
    rec_business_name_lst_NVM.append(df_final[df_final['business_id_map'] == id_map]['name'].unique()[0])

rec_business_name_lst_NVM

['Pressed For Juice',
 'Good Fella Korean Bistro',
 'Sambalatte',
 'Veggie House',
 'Outback Steakhouse',
 "Braddah's Island Style",
 'Soho Sushi Burrito',
 "Glazier's Food Marketplace",
 'Greens and Proteins',
 'Metro Diner']

In [78]:
### check errors
# truth
ratings_true = ratings_mat[user_id, items_rated_by_this_user].todense()
# prediction
ratings_pred = pred_ratings[items_rated_by_this_user]
print(list(zip(np.array(ratings_true).squeeze(),ratings_pred)))
err_one_user = ratings_true-ratings_pred
print(err_one_user)
print(abs(err_one_user).mean())

[(4, 3.99406189862474), (5, 4.994557445513163), (4, 3.9968363068251307), (4, 3.996836306825182), (5, 4.975136334594472), (4, 3.99683630682502), (5, 5.012164626444427), (4, 3.9922907447975957), (5, 4.98334402952425), (4, 3.981965302646765), (5, 4.976082771446907), (4, 4.001944425015252), (3, 3.0411281838214106), (4, 4.001112873976805), (4, 3.9800672447100918), (4, 3.9966675419981845), (5, 4.996045383531561), (4, 4.087913960247911), (5, 4.990436824062891), (5, 4.989916111026723), (5, 4.972342595652921), (5, 4.975596086131759), (4, 3.9962146673043417), (5, 4.988973737465841), (5, 4.943141315388916), (4, 3.9941216783716698), (4, 3.98756412191207), (5, 5.01268421818481), (4, 4.022031963769828), (4, 3.989854298448471), (3, 2.9923629268605123), (4, 3.9968363068248403), (4, 3.9968363069961734), (5, 4.992330012917099), (5, 4.99137848518365), (4, 3.9749739814754443), (4, 3.957697025156115), (5, 4.994674377472508), (5, 4.979368889855697), (5, 4.995363639018239), (5, 5.104533578074602), (5, 4.9689

#####  UVD/SVD

In [91]:
from sklearn.decomposition import TruncatedSVD

def fit_uvd(M,k):
    # use TruncatedSVD to realize UVD
    svd = TruncatedSVD(n_components=k, n_iter=7, random_state=0)
    svd.fit(M)

    V = svd.components_
    U = svd.transform(M) # effectively, it's doing: U = M.dot(V.T)
    # we can ignore svd.singular_values_ for our purpose
    
    # why we can do this?
    # recall: 
    # SVD start from u*s*v=M => u*s=M*v.T, where M*v.T is our transformation above to get U in UVD
    # so the above U is effectively u*s in SVD
    # that's why U*V = u*s*v = M our original matrix
    # there are many ways to understand it!
    # here we by-passed singular values.
    
    return U,V, svd

# decompose
U,V,svd = fit_uvd(ratings_mat,169)

In [92]:
print(U.shape,V.shape)

((193, 169), (169, 3297))


In [93]:
# reconstruct
ratings_mat_fitted = U.dot(V) # U*V


# recall: U = M.dot(V.T), then this is M.dot(V.T).dot(V)
# original M is transformed to new space, then transformed back
# this is another way to understand it!

# calculate errs
errs = np.array((ratings_mat-ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat.todense()).flatten()).squeeze()>0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()
print(mse)
print(average_abs_err)

0.157601336155406
0.2263427952674526


In [94]:
# compare with another way to reconstruct matrix
# with the above "tranformed to the new space and back" language
# without the UV language, we can do:

# reconstruct M with inverse_transform
ratings_mat_fitted_2 = svd.inverse_transform(svd.transform(ratings_mat))
ratings_mat_fitted = U.dot(V)
print(sum(sum(ratings_mat_fitted - ratings_mat_fitted_2)))
# they are just equivalent!!

0.0


In [95]:
# get recommendations for one user
user_id = 100
n = 10

pred_ratings = ratings_mat_fitted[user_id,:]
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[1488, 2373, 1273, 1392, 1405, 2364, 2626, 2371, 2598, 324]

In [96]:
business_id_map_top_n_SVD = unrated_items_by_pred_rating[:n]
rec_business_name_lst_SVD = []
for id_map in business_id_map_top_n_SVD:
    #print id_map
    rec_business_name_lst_SVD.append(df_final[df_final['business_id_map'] == id_map]['name'].unique()[0])

rec_business_name_lst_SVD

['Neighbors',
 'Texas de Brazil',
 'Me Gusta Tacos',
 'Yum Cha',
 'Chabuya',
 'JINYA Ramen Bar',
 'Mr Mamas',
 'Thai BBQ',
 'Eat.',
 'Bacon Bar']

In [97]:
### check errors
# truth
ratings_true = ratings_mat[user_id, items_rated_by_this_user].todense()
# prediction
ratings_pred = pred_ratings[items_rated_by_this_user]
print(list(zip(np.array(ratings_true).squeeze(),ratings_pred)))
err_one_user = ratings_true-ratings_pred
print(err_one_user)
print(abs(err_one_user).mean())

[(4, 3.780028587361997), (4, 3.8459564222324993), (4, 3.8976853613597315), (3, 2.838685183418121), (2, 2.013675638151647), (5, 4.776765160733703), (3, 2.989826726142336), (5, 4.774054398161947), (5, 4.786973320519841), (5, 4.876130856191551), (4, 3.9730113308835366), (3, 2.882251064936978), (5, 5.2040103566734075), (4, 4.321589764319972), (5, 4.802862395957291), (1, 0.991244825652536), (3, 2.944641985162512), (2, 1.9926633145518273), (3, 2.822081595912416), (5, 4.8167994466281545), (4, 3.8739164792669243), (4, 4.030123604661083), (4, 3.9366995751423617), (5, 4.855615340602418), (4, 3.9791500881132875), (1, 1.0037749644870053), (2, 2.0192903781805676), (4, 3.8260962086866352), (1, 0.9745363322762598), (5, 4.616359586552055), (3, 3.067805237894542), (3, 3.0083299422111613), (5, 5.125342421911702), (3, 2.909394079049325), (5, 5.092192533583065), (4, 3.9703605175658554), (5, 4.961595132196953), (3, 3.0984148353204946), (4, 3.8718675221238352), (4, 3.9133526112136656), (4, 3.818067129715308