For Yelpers living in Toronto, which restaurants should we recommend when they travel to Las Vegas, US? We want a recommendation model that can recommend restaurants to travelers based on their previous activity on the Yelp platform.

# Set Up

In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import pprint
import geopy.distance

# Open a client with pymongo's default connection settings and select the
# `yelp` database that the queries below read from.
client = MongoClient()
db = client["yelp"]

# Names of the databases on the server (not used in the cells shown here).
dblist = client.list_database_names()

# Pretty-printer with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

# Find Reviews for Restaurants in Toronto

## Find all the "business_id"s for Toronto Restaurants

In [2]:
# Filter: every business document whose city is Toronto and state is ON.
# NOTE(review): the filter matches on city/state only -- nothing here restricts
# it to restaurants, although the section title says "Toronto Restaurants".
# Confirm whether a category filter is intended.
toronto = dict(city="Toronto", state="ON")

# Projection: keep only `business_id` and suppress Mongo's `_id`.
proj = dict(business_id=1, _id=0)

toronto_biz_cursor = db.business.find(filter=toronto, projection=proj)


In [3]:
# Materialise the Toronto business ids into a one-column DataFrame.
# A pymongo cursor is exhausted after a single pass, so re-running this cell on
# its own would silently build an empty frame. Rewinding first makes the cell
# repeatable: a rewound cursor re-issues its query when iterated.
df_toronto_biz = pd.DataFrame(list(toronto_biz_cursor.rewind()))


In [4]:
# Preview the first rows; the frame should hold a single `business_id` column.
df_toronto_biz.head()

Unnamed: 0,business_id
0,zSpQmEBvRe3IhTUlMSA6HQ
1,C9oCPomVP0mtKa8z99E3gg
2,C9keC4mWuXdl2mYFHZXudQ
3,NDuUMJfrWk52RA-H-OtrpA
4,SP_YXIEwkFPPl_9anCYmpQ


In [5]:
# Collapse the id column into a set so the membership test applied to the
# review table is a hash lookup.
set_toronto_biz_id = set(df_toronto_biz.loc[:, 'business_id'])
# NOTE(review): echoing the set prints every id it contains; consider showing
# only its size.
set_toronto_biz_id

{'Xyaw8gVBFDdn6rp_NtLhnw',
 'dLIk8-OHVzClGqtb1M91KA',
 'XVa-4GVmeFaNA0Lb1GxHZQ',
 '3DTfc6rVaCrxVm0mbSgBBw',
 'hVVi6XHWC-7RVZOvK1WpOw',
 'Kn72wEO-qb5ecyFoFpTNNQ',
 'hSep-C-1JSC8c_8tR96etQ',
 'v_hmblN_GhNRUHYfC31weA',
 'HZqISq3oXnibkQQ6_pGRQQ',
 'ipyL1DElUA6RbxTfNLMilw',
 '_AiqOS8io_reYZri1OeP1g',
 'UrhZGd0D65hvDJHN3Bz4Kg',
 'JZHvuoPccfil0esnfhlRcQ',
 '8hZghcw4QYJiRB9emn2Rbg',
 'CJb52A28AcHQNMBP8RwDUA',
 'PQ6pDjHP0P28D3CY5UNxDA',
 'JilhqcbhDWsig-tRPs9d2g',
 'Q3BUdZTMw8KRd-3QYtTyAg',
 'FXIygv0F-w6nFICgd_9wCg',
 'bmDUty0odSexELXaEG4ELA',
 'tMMEhYse_vbL-hBaI9pIug',
 '91gV85a7SWXlQkAqg_M2aA',
 'SZtQTQDYIx3eL4GmQ7wfiQ',
 'K_m97yXG81jQZn5zaR0q6Q',
 'sQRbbL_-MQ-MTjdD9VuGbA',
 'jsCDq6Pe-wEWxfBjRyh9ZA',
 'bDtsc7WTwEJZFuvjXy8wng',
 '-Py4QGu7Z2MaN7IZkwKQfg',
 'uAPMb111euz5jJF8_VeO2A',
 'XpZ1i9GLShFmJO6voy5qEQ',
 '1soAAfE7ur81Yrt397_PbA',
 'SmJFJtcCpnZ802BAERUipA',
 '4pueeOSFah1oSst6NkFCGQ',
 'oTUhDAMPbhR5T6y_AO11hw',
 'Ewm7SE3w242jbEEAJJIr0Q',
 'q0PKLCgApK5XynhaZAY3EA',
 'v2GdUAoJ_W4c45b2ZYquWA',
 

## Find all the reviews for Toronto Restaurants

In [6]:
# Cursor over every review document (empty filter), projected down to the
# three fields used below: who rated, which business, and the star rating.
review_cursor = db.review.find(
    filter={},
    projection={"user_id": 1, "business_id": 1, "stars": 1, "_id": 0},
)

In [7]:
# Load all projected reviews into memory.
# The cursor is rewound first so that re-running this cell alone does not
# produce an empty frame from an already-exhausted cursor.
df_review = pd.DataFrame(list(review_cursor.rewind()))

In [8]:
# Flag reviews whose business belongs to the Toronto id set.
# `Series.isin` performs the same membership test as a row-wise
# `apply(lambda row: ...)`, but vectorised: it avoids building a Series object
# for every row and also behaves sensibly when the frame is empty.
df_review['is_in_toronto'] = df_review['business_id'].isin(set_toronto_biz_id)

In [9]:
# Keep only the flagged (Toronto) reviews, discard the helper flag column, and
# renumber the rows from zero.
df_toronto_review = (
    df_review.loc[df_review['is_in_toronto']]
    .drop(columns=['is_in_toronto'])
    .reset_index(drop=True)
)

In [10]:
# Summarise row count, dtypes and null counts of the Toronto review table.
df_toronto_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525422 entries, 0 to 525421
Data columns (total 3 columns):
user_id        525422 non-null object
business_id    525422 non-null object
stars          525422 non-null float64
dtypes: float64(1), object(2)
memory usage: 12.0+ MB


In [11]:
# Show the first ten Toronto reviews (user, business, star rating).
df_toronto_review.head(n=10)

Unnamed: 0,user_id,business_id,stars
0,TpyOT5E16YASd7EWjLQlrw,AakkkTuGZA2KBodKi2_u8A,1.0
1,NJlxGtouq06hhC7sS2ECYw,YvrylyuWgbP90RgMqZQVnQ,5.0
2,_N7Ndn29bpll_961oPeEfw,y-Iw6dZflNix4BdwIyTNGA,3.0
3,DbccYu3OppWKl21OanZnTg,YSUcHqlKMPHHJ_cTrqtNrA,1.0
4,54kpqrxF9DEPpwa51hO_Bw,jzveTy7ogH7cg9axZ78ENg,4.0
5,PFNZVn73upq3oZDG2KnAqA,dZVMp70AuSa4dQPvx3J4ow,3.0
6,nNVznu_dQsn43SrakB4Ldw,r395lMbm1ihAQ2sMOcFKow,5.0
7,VrMaL32wWNed_DjOcsO3Ng,ulg83kbgFzXOawlIkfpLww,2.0
8,scitRtsLa4QP9S1LZUIVWQ,dLxT3-EwXkrI9AXoW6HCGg,1.0
9,yT_QCcnq-QGipWWuzIpvtw,i2qKNktpvKOWrREXvMoyKQ,4.0


In [30]:
# Number of reviews per Toronto business. The count is taken over the `stars`
# column, so the resulting single column is still named `stars` even though it
# holds review counts rather than ratings.
df_toronto_business_review_count = (
    df_toronto_review
    .groupby('business_id')['stars']
    .count()
    .to_frame()
)
df_toronto_business_review_count.head()

Unnamed: 0_level_0,stars
business_id,Unnamed: 1_level_1
--DaPTJW3-tB1vP-PfdTEg,49
--SrzpvFLwP_YFwB_Cetow,44
--kinfHwmtdjz03g8B8z8Q,8
-03HVYxkeYWaafEpNJo1SA,5
-0CCHBui57tZ_1y_14X-5Q,6


## Find Correlations

In [13]:
# User x business ratings matrix: one row per user, one column per business,
# NaN where the user has no review for that business. When a user reviewed the
# same business more than once, pivot_table's default aggregation (mean) applies.
# NOTE(review): this is a dense frame; the recorded `.info()` output reports
# roughly 15.9 GB, so it needs a machine with plenty of memory.
mat_ratings = pd.pivot_table(
    df_toronto_review,
    values='stars',
    index='user_id',
    columns='business_id',
)


In [14]:
# Report the matrix dimensions (users x businesses) and its memory footprint.
mat_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113164 entries, --7gjElmOrthETJ8XqzMBw to zzyMMeUZzKAy7KQhM7lU2w
Columns: 18904 entries, --DaPTJW3-tB1vP-PfdTEg to zzvlwkcNR1CCqOPXwuvz2A
dtypes: float64(18904)
memory usage: 15.9+ GB


In [15]:
# Peek at the first rows; most cells are NaN because each user rates few businesses.
mat_ratings.head()

business_id,--DaPTJW3-tB1vP-PfdTEg,--SrzpvFLwP_YFwB_Cetow,--kinfHwmtdjz03g8B8z8Q,-03HVYxkeYWaafEpNJo1SA,-0CCHBui57tZ_1y_14X-5Q,-0DwB6Swi349EKfbBAOF7A,-0M3o2uWBnQZwd3hmfEwuw,-0NhdsDJsdarxyDPR523ZQ,-0RRiWDtfnS16AKCtfvBZg,-0aOudcaAyac0VJbMX-L1g,...,zyzDmvnDCAqqIuaThbOFHA,zz-f4Xrs1OGOhybeQaYgFQ,zz3CqZhNx2rQ_Yp6zHze-A,zzH3E5DA8eMzFwv2k5izyw,zzMcX99BPGSOFMZ4boINSQ,zzUj3ej4vm_DtvRxNvWDEw,zzYoocdehksv9_tg80a97w,zze6IysT7bJFS8gvi6fZ2A,zzf3RkMI1Y2E1QaZqeU8yA,zzvlwkcNR1CCqOPXwuvz2A
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--7gjElmOrthETJ8XqzMBw,,,,,,,,,,,...,,,,,,,,,,
--Br-QsbO9ad5GbZxVGxaw,,,,,,,,,,,...,,,,,,,,,,
--BumyUHiO_7YsHurb9Hkw,,,,,,,,,,,...,,,,,,,,,,
--C93xIlmjtgQfSOIpcQSA,,,,,,,,,,,...,,,,,,,,,,
--DKDJlRHfsvufdGSk_Sdw,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# List the user ids that label the matrix rows (used to pick a sample user).
mat_ratings.index


Index(['--7gjElmOrthETJ8XqzMBw', '--Br-QsbO9ad5GbZxVGxaw',
       '--BumyUHiO_7YsHurb9Hkw', '--C93xIlmjtgQfSOIpcQSA',
       '--DKDJlRHfsvufdGSk_Sdw', '--EPvMywZ-82a4uzxSwsfg',
       '--EVSb3jbKVL3WJ5NUCuCA', '--GwB-sktmoAOPBsbAaiow',
       '--KQJPdrU0Md97DiOliDzw', '--Qh8yKWAvIP4V4K8ZPfHA',
       ...
       'zzZzP5RhVxASpV_8-s7qvQ', 'zzaq5Fn1U2Feut3dGxidNg',
       'zzgMuJgxmToqcJ5iu1TngQ', 'zzltQbbtriSWWdNINMzJUw',
       'zzmhLxcZ4XZQyz95c_KbOA', 'zzo--VpSQh8PpsGVeMC1dQ',
       'zzoNLUFml9D-FFSkDoGg2g', 'zztkCqqgR6VntYbqio4UTQ',
       'zzwTJAgONYCTeJia4te47g', 'zzyMMeUZzKAy7KQhM7lU2w'],
      dtype='object', name='user_id', length=113164)

In [17]:
# Hard-coded sample user, taken from the row index of `mat_ratings`.
test_user_id = "zzaq5Fn1U2Feut3dGxidNg"

In [18]:
# Ratings given by the sample user: take that user's row and keep only the
# businesses that actually have a rating.
test_businesses = mat_ratings.loc[test_user_id, :].dropna()
test_businesses

business_id
JCqB36SVDjTDsXp-4gBn9Q    5.0
fGurvC5BdOfd5MIuLUQYVA    2.0
jn7h4Uv-RKHflDHsAfqoDA    5.0
yiU-3I-amn5_ACEzuWHglw    4.0
Name: zzaq5Fn1U2Feut3dGxidNg, dtype: float64

In [19]:
# Target business for the similarity search (hard-coded id).
bid = "JCqB36SVDjTDsXp-4gBn9Q"

# Column of ratings for the target business, indexed by user id; NaN for
# users who did not rate it.
b_stars = mat_ratings[bid]
b_stars


user_id
--7gjElmOrthETJ8XqzMBw   NaN
--Br-QsbO9ad5GbZxVGxaw   NaN
--BumyUHiO_7YsHurb9Hkw   NaN
--C93xIlmjtgQfSOIpcQSA   NaN
--DKDJlRHfsvufdGSk_Sdw   NaN
                          ..
zzo--VpSQh8PpsGVeMC1dQ   NaN
zzoNLUFml9D-FFSkDoGg2g   NaN
zztkCqqgR6VntYbqio4UTQ   NaN
zzwTJAgONYCTeJia4te47g   NaN
zzyMMeUZzKAy7KQhM7lU2w   NaN
Name: JCqB36SVDjTDsXp-4gBn9Q, Length: 113164, dtype: float64

In [20]:
# Correlate every business's ratings column with the target business's ratings.
# Each pairwise correlation only uses users who rated BOTH businesses, so rows
# where the target rating is missing can never contribute. Restricting the
# matrix to the target's raters up front yields the same values while scanning
# far fewer rows. (`b_stars` is a column of `mat_ratings`, so its non-null
# index labels are all valid row labels.)
# Businesses with too few shared raters come out as NaN; numpy may emit
# RuntimeWarnings for those columns.
similar_to_biz = mat_ratings.loc[b_stars.dropna().index].corrwith(b_stars)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [32]:
# Turn the correlation Series into a one-column frame and drop businesses for
# which no correlation could be computed.
# `to_frame(name=...)` always labels the column `correlation`, whereas
# `pd.DataFrame(series, columns=[...])` only does so while the Series is unnamed.
# Chaining `dropna()` avoids mutating the frame in place.
corr_biz = similar_to_biz.to_frame(name='correlation').dropna()


In [33]:
# Attach each business's review count (column `stars` of the count frame) to
# its correlation, matching on the business_id index.
# Selecting only `correlation` before joining keeps the cell idempotent: a plain
# `corr_biz.join(...)` fails on a second run because `stars` already exists.
corr_biz = corr_biz[['correlation']].join(df_toronto_business_review_count)
corr_biz.head()

Unnamed: 0_level_0,correlation,stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
--SrzpvFLwP_YFwB_Cetow,1.0,44
-76didnxGiiMO80BjSpYsQ,0.05976143,154
-9dmhyBvepc08KPEHlEM0w,9.960365e-17,638
-9u2uFwhlj3Yq4mG5IweoA,0.0,157
-BbnAc9YEO6pjvJGEtFbVQ,0.7181848,251


In [34]:
# Check how many businesses have a usable correlation and a review count.
corr_biz.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1254 entries, --SrzpvFLwP_YFwB_Cetow to zzUj3ej4vm_DtvRxNvWDEw
Data columns (total 2 columns):
correlation    1254 non-null float64
stars          1254 non-null int64
dtypes: float64(1), int64(1)
memory usage: 69.4+ KB


In [40]:
# `stars` in this frame is the per-business review count joined in above, not a
# rating: require at least 100 reviews.
has_enough_reviews = corr_biz['stars'] >= 100

# Keep strong correlations strictly between 0.9 and 0.99.
# NOTE(review): the 0.99 cap presumably filters out the target business itself
# and perfect correlations that arise from very few shared raters -- confirm.
strongly_correlated = (corr_biz['correlation'] > 0.9) & (corr_biz['correlation'] < 0.99)

corr_biz.loc[has_enough_reviews & strongly_correlated, :]


Unnamed: 0_level_0,correlation,stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-ICGmF2qUVKdvOehVNgPbg,0.942809,290
2S9t90Q0qZWCV7X_u0rxZQ,0.911765,250
5aeOewSy4RiI8sLLWpeNGA,0.986117,317
HUYEadSbGSQNHXFmT2Ujjw,0.918559,415
HnbWx7Q8P4-MXbxMpb-qVw,0.942809,147
IRF1hPi3GuxJ-EpoEpzI7Q,0.944911,159
KoZVFpkJzoNt4DCxi5Fzww,0.944911,103
MH4xIHZ5-ajRMW3ZhUvtxA,0.944911,128
QoXT0qI6_3WeHImUuLAyjg,0.927173,118
RXaxKQJtExnRZX1inf8Nyg,0.96225,189
