For Yelpers living in Toronto, which restaurants do you recommend them, when they travel to Las Vegas, US? We want a recommendation model, which can recommend restaurant for travelers, based on their previous actions in Yelp platform.

# Set Up

In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import pprint
import geopy.distance

client = MongoClient()
dblist = client.list_database_names()
db = client.yelp
pp = pprint.PrettyPrinter(indent=4)

# Find Reviews for Restaurants in Toronto

## Find all the "business_id"s for Toronto Restaurants

In [2]:
toronto = {"city":"Toronto", "state":"ON"}
proj = {"business_id":1, "_id":0}
toronto_biz_cursor = db.business.find(toronto, proj)


In [3]:
df_toronto = pd.DataFrame(list(toronto_biz_cursor))
set_toronto_biz_id = set(df_toronto['business_id'])

## Find all the reviews for Toronto Restaurants

In [4]:
review_cursor = db.review.find({}, {"user_id":1, "business_id":1, "stars":1, "_id":0})

In [5]:
df_review = pd.DataFrame(list(review_cursor))

In [6]:
df_review['is_in_toronto'] = df_review.apply(lambda row: row['business_id'] in set_toronto_biz_id, axis=1)

In [7]:
df_toronto_review = df_review[df_review['is_in_toronto']]

In [8]:
df_toronto_review.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 525422 entries, 11 to 6685880
Data columns (total 4 columns):
user_id          525422 non-null object
business_id      525422 non-null object
stars            525422 non-null float64
is_in_toronto    525422 non-null bool
dtypes: bool(1), float64(1), object(2)
memory usage: 16.5+ MB


In [9]:
df_toronto_review.head()

Unnamed: 0,user_id,business_id,stars,is_in_toronto
11,TpyOT5E16YASd7EWjLQlrw,AakkkTuGZA2KBodKi2_u8A,1.0,True
14,NJlxGtouq06hhC7sS2ECYw,YvrylyuWgbP90RgMqZQVnQ,5.0,True
20,_N7Ndn29bpll_961oPeEfw,y-Iw6dZflNix4BdwIyTNGA,3.0,True
32,DbccYu3OppWKl21OanZnTg,YSUcHqlKMPHHJ_cTrqtNrA,1.0,True
45,54kpqrxF9DEPpwa51hO_Bw,jzveTy7ogH7cg9axZ78ENg,4.0,True


## Find Users Who Have Reviewed 20+ Restaurants in Toronto

In [10]:
user_reviews = df_toronto_review.groupby('user_id')['business_id'].apply(list)
# user_reviews = df_toronto_review.groupby(['user_id', 'business_id'], as_index=False)['stars'].apply(list)
# b = pd.DataFrame(a)
# b.loc["zztkCqqgR6VntYbqio4UTQ", "27vF5LESbv4kFQfHZUDCnw"]


In [11]:
df_user_review = pd.DataFrame(user_reviews)

In [12]:
df_user_review.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113164 entries, --7gjElmOrthETJ8XqzMBw to zzyMMeUZzKAy7KQhM7lU2w
Data columns (total 1 columns):
business_id    113164 non-null object
dtypes: object(1)
memory usage: 1.7+ MB


In [13]:
df_user_review.head()

Unnamed: 0_level_0,business_id
user_id,Unnamed: 1_level_1
--7gjElmOrthETJ8XqzMBw,"[9HAfloFDDOH0f8fmA5nkaw, IiG1_hV_TyQgLzh2j8Znc..."
--Br-QsbO9ad5GbZxVGxaw,"[x6PA-2j7LpZAYFo2VojmQQ, 6HjDDG6sVISBMX_SFksPmQ]"
--BumyUHiO_7YsHurb9Hkw,"[NoT7u2QNEvsVbH9MIzAzlw, mUUTJY9_BqmAYxypHiqWr..."
--C93xIlmjtgQfSOIpcQSA,"[9HWdRtNS0q4_UkEvL14IfA, 9HWdRtNS0q4_UkEvL14IfA]"
--DKDJlRHfsvufdGSk_Sdw,[-dSfSU0Nwwh2b0aLo5ifyw]


In [14]:
df_user_review.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113164 entries, --7gjElmOrthETJ8XqzMBw to zzyMMeUZzKAy7KQhM7lU2w
Data columns (total 1 columns):
business_id    113164 non-null object
dtypes: object(1)
memory usage: 1.7+ MB


In [15]:
df_user_review['num_toronto_reviews'] = df_user_review.apply(lambda row: len(row['business_id']), axis=1)

In [16]:
df_user_review.head()

Unnamed: 0_level_0,business_id,num_toronto_reviews
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
--7gjElmOrthETJ8XqzMBw,"[9HAfloFDDOH0f8fmA5nkaw, IiG1_hV_TyQgLzh2j8Znc...",6
--Br-QsbO9ad5GbZxVGxaw,"[x6PA-2j7LpZAYFo2VojmQQ, 6HjDDG6sVISBMX_SFksPmQ]",2
--BumyUHiO_7YsHurb9Hkw,"[NoT7u2QNEvsVbH9MIzAzlw, mUUTJY9_BqmAYxypHiqWr...",50
--C93xIlmjtgQfSOIpcQSA,"[9HWdRtNS0q4_UkEvL14IfA, 9HWdRtNS0q4_UkEvL14IfA]",2
--DKDJlRHfsvufdGSk_Sdw,[-dSfSU0Nwwh2b0aLo5ifyw],1


In [17]:
df_user_review.columns

Index(['business_id', 'num_toronto_reviews'], dtype='object')

In [18]:
df_toronto_experts = df_user_review[df_user_review['num_toronto_reviews'] > 20]

In [19]:
df_toronto_experts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3925 entries, --BumyUHiO_7YsHurb9Hkw to zyg4-MFtfPWmwucVazSjfw
Data columns (total 2 columns):
business_id            3925 non-null object
num_toronto_reviews    3925 non-null int64
dtypes: int64(1), object(1)
memory usage: 92.0+ KB


# Make Recommendations for Toronto

In [20]:
df_few_toronto_reviews = df_user_review[df_user_review['num_toronto_reviews'] <= 20]

In [21]:
df_few_toronto_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109239 entries, --7gjElmOrthETJ8XqzMBw to zzyMMeUZzKAy7KQhM7lU2w
Data columns (total 2 columns):
business_id            109239 non-null object
num_toronto_reviews    109239 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [22]:
df_few_toronto_reviews.head(10)

Unnamed: 0_level_0,business_id,num_toronto_reviews
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
--7gjElmOrthETJ8XqzMBw,"[9HAfloFDDOH0f8fmA5nkaw, IiG1_hV_TyQgLzh2j8Znc...",6
--Br-QsbO9ad5GbZxVGxaw,"[x6PA-2j7LpZAYFo2VojmQQ, 6HjDDG6sVISBMX_SFksPmQ]",2
--C93xIlmjtgQfSOIpcQSA,"[9HWdRtNS0q4_UkEvL14IfA, 9HWdRtNS0q4_UkEvL14IfA]",2
--DKDJlRHfsvufdGSk_Sdw,[-dSfSU0Nwwh2b0aLo5ifyw],1
--EPvMywZ-82a4uzxSwsfg,[EswKVP74LXou2F4wyavNUQ],1
--EVSb3jbKVL3WJ5NUCuCA,[uIuJGLPWli4dbomvANcuiQ],1
--GwB-sktmoAOPBsbAaiow,"[tMxzAzHSFJWjSu6CEjwtJQ, gz1FhM5W1ld-dwgYV5xRZQ]",2
--KQJPdrU0Md97DiOliDzw,[eQel7bUz75j0AVKnfsillg],1
--RYvmB6UYRyZQqXkBv4eQ,"[MsUI2BgJjptqa1YEAqBXuA, ZMbY3xmTJ3EB2f1ZFyB5Dg]",2
--UOvCH5qEgdNQ8lzR8QYQ,"[UrV2WIaiFHRI8xxOZ00g0w, ZCrK07xb6w5Vi1vathV0N...",10


## Find Reviewers (with 20+ Toronto Reviews) who have reviewed the restaurants reviewed by the test user

In [26]:
test_user = "--7gjElmOrthETJ8XqzMBw"
test_user_businesses = df_few_toronto_reviews.loc[test_user]['business_id']

In [27]:
test_user_businesses

['9HAfloFDDOH0f8fmA5nkaw',
 'IiG1_hV_TyQgLzh2j8Zncg',
 'UxWH8zRYIBgs6Q2oykvRdw',
 'byN9qRmL5YhKzYQtAFDulw',
 'GTNhbajbPNao5ITndlYy6Q',
 'brNYDrnZhjZjbef9iXQVQw']

user_id
--BumyUHiO_7YsHurb9Hkw    [NoT7u2QNEvsVbH9MIzAzlw, mUUTJY9_BqmAYxypHiqWr...
--Qh8yKWAvIP4V4K8ZPfHA    [zrYpLdnGKA_EmOhgRCy_vg, 6551kEXeYF2NCwlodvk1E...
--YhjyV-ce1nFLYxP49C5A    [S2yp22ExErM1wtpUgPC3TQ, SEG56Wws3Wb2AXyz-eohl...
-1wbglcr6x1qrUbqP1YAIA    [ibNxVcSE7peefvDUGEFvzQ, A-QlS6_vpJI_Yi67AyNJL...
-2kCxY7_aw5hOz7fJnGMbQ    [e59uNE07Gf-yrIl4rX3KZw, Nxg73OigmRQQq0d1pKtkU...
                                                ...                        
zwDazUH1n6KLmWmJXJj7Rg    [e59uNE07Gf-yrIl4rX3KZw, n9BrA1zIwlbiP1hLAc24j...
zwmThlRiY1LHfJ_7zNfGqg    [W9wFhYz0hj9mQ_5vHfk31A, UC2dfjZg8mrRh10maovQf...
zy-bRNv9Wp3fmNM4TD9kIw    [ibNxVcSE7peefvDUGEFvzQ, _zV-v1iMDIs4bjVuN_fL5...
zyYWUdaodH0h1jCZAvFRPg    [BSchC5THOB0MFmzJTEqAnQ, OIdOJaNS8M624F58XGV3P...
zyg4-MFtfPWmwucVazSjfw    [3a7Qby_IX7sU7O6ZsQZeOQ, yG2PdMFn6y3QDw4Uq4b5l...
Name: business_id, Length: 3925, dtype: object