# 1. Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import os
import zipfile
import json
import copy
import warnings
warnings.filterwarnings('ignore')

# 2. Scrape Yelp Categories

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# specify the URL
url = 'https://docs.developer.yelp.com/docs/resources-categories'

# make the HTTP request and fetch the information from the specified URL
# a timeout keeps a hung connection from stalling the notebook indefinitely
results = requests.get(url, timeout=30)

# fail right here with the HTTP status instead of parsing an error page further down
results.raise_for_status()

# requests.get(url) returns a Response object called 'results'
# this Response object has attributes that contain all of the information returned from the HTTP request
# https://docs.python-requests.org/en/latest/user/quickstart/

# convert the webpage returned from 'results' to a workable Python object with BeautifulSoup
bs_object = BeautifulSoup(results.text, 'html.parser')

print(type(results))
print(type(bs_object))

<class 'requests.models.Response'>
<class 'bs4.BeautifulSoup'>


In [4]:
# Signature: find(name, attrs, recursive, string, **kwargs) -> first matching tag, or None
# name -> name of the tag to access
# attrs = dict object to specify which tags to access with matching attributes

name = 'ul'
attrs = {'class':'bullet-list-round category-list'}
categories = bs_object.find(name=name, attrs=attrs)

# find() returns None when nothing matches; stop here with a clear message
# instead of failing later when the tree is walked
if categories is None:
  raise ValueError('No <ul> with class "{}" found at {}'.format(attrs['class'], url))

In [5]:
# find all relevant direct children

def find_direct_children(parent):
  """Return the immediate <li> and <ul> children of ``parent``, in document order.

  Children that are plain strings (e.g. text between tags) and tags with any
  other name are skipped. Only ``parent.children`` is inspected, so deeper
  descendants are never returned.
  """
  wanted_tags = ('li', 'ul')
  return [
    node
    for node in parent.children
    if not isinstance(node, str) and node.name in wanted_tags
  ]

In [6]:
def flatten_branch(subcategories):
  """Collect the text of every <li> under ``subcategories`` into one flat list.

  ``subcategories`` is an iterable of tags. An <li> contributes its ``.text``;
  a <ul> is expanded recursively through ``find_direct_children`` so nested
  levels end up in the same list, in document order. Any other tag is ignored.
  """
  labels = []  # fresh list per call; recursive results are merged into it
  for node in subcategories:
    tag = node.name
    if tag == 'li':
      labels.append(node.text)
    elif tag == 'ul':
      labels += flatten_branch(find_direct_children(node))

  return labels
        

In [7]:
# Flatten the category tree into [root label, [descendant labels]] pairs.
flattened_tree = []
parents = []
parent=None

for child in find_direct_children(categories):
  # Remember the most recent <li>: its text becomes the root label for the <ul> that follows it.
  if child.name=='li':
    parent=child
    parents.append(parent)
    
  # A <ul> holds the subcategories of the preceding <li>; only roots that have one
  # (depth > 1) are added to flattened_tree.
  # NOTE(review): assumes an <li> always precedes each <ul>; otherwise parent is still None here.
  elif child.name=='ul':
    subcategories = flatten_branch(find_direct_children(child))
    flattened_tree.append([parent.text, subcategories])



# 3. Build Dictionary

In [8]:
# Map every subcategory alias to its root category: alias -> [root alias, root name].
branches = dict()

# Raw strings keep the regex escapes (\w, \s, \() away from Python's own string
# escaping; the patterns themselves are unchanged. Compiled once, outside the loops.
name_pattern = re.compile(r'^[\w\s]+')             # leading run of word characters / whitespace
parent_code_pattern = re.compile(r'(?<=\()\w+')    # first word right after an opening parenthesis
# Lowercase letters, digits and underscores only -- presumably so a parenthesised
# title fragment such as "(New)" is skipped and the alias is found instead (TODO confirm).
child_code_pattern = re.compile(r'(?<=\()[a-z\d\_]+')

for branch in flattened_tree:
  parent = branch[0]
  parent_name = name_pattern.findall(parent)[0].strip()
  parent_code = parent_code_pattern.findall(parent)[0].strip()

  children = branch[1]
  for child in children:
    child_code = child_code_pattern.findall(child)[0].strip()
    branches[child_code]=[parent_code, parent_name]

In [9]:
len(branches)

1542

In [10]:
# sanity check
branches['hookah_bars']

['nightlife', 'Nightlife']

In [11]:
# sanity check
branches['newamerican']

['restaurants', 'Restaurants']

In [12]:
# sanity check
branches['3dprinting']

['localservices', 'Local Services']

# 4. Import Datasets

In [13]:
det_list_df = pd.read_csv('./train_test_data_nashville.csv', dtype={'Unnamed: 0':str})

In [14]:
det_list_df.head()

Unnamed: 0.1,Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,price
0,72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,104.616438
1,431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,351.986301
2,329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,127.887671
3,1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,133.876712
4,632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,163.739726


In [15]:
det_list_df.shape

(6738, 12)

In [16]:
det_list_df=det_list_df.rename(columns={'Unnamed: 0': 'id'})

## 4.1 Import Fetched Yelp Businesses

In [17]:
# Archives written by the earlier processing / fetch steps,
# e.g. "Yelp Fetched 2023_03_23.zip". The dot is escaped and the name is anchored
# at the end so only real .zip files match.
archive_pattern = re.compile(r'(AirBnB Processed|Yelp Fetched)\s\d{4}_\d{2}_\d{2}\.zip$')

# sorted() makes the processing (and printed) order reproducible across machines
for file in sorted(os.listdir(os.getcwd())):
  if not archive_pattern.match(file):
    continue

  # Opening the archive is inside the try as well: a corrupt file raises there,
  # not in extractall(). Only archive / filesystem errors are caught, so
  # unrelated problems (e.g. KeyboardInterrupt) still surface.
  try:
    with zipfile.ZipFile(file, 'r') as zip_ref:
      zip_ref.extractall('./')
    print('Unzipped: {}'.format(file))
  except (zipfile.BadZipFile, OSError) as err:
    print('Failed to unzip {}: {}'.format(file, err))

Unzipped: Yelp Fetched 2023_03_23.zip
Unzipped: Yelp Fetched 2023_03_25.zip
Unzipped: Yelp Fetched 2023_03_26.zip


In [18]:
# processed_listings = pd.DataFrame()

# for file in os.listdir(os.getcwd()):
#   if os.path.isdir(file) and re.match('AirBnB Processed\s\d{4}_\d{2}_\d{2}', file):
#     for sub_file in os.listdir(os.getcwd()+'/'+file):
#       if re.match('processed_airbnb_batch_\d_\d{4}_\d{2}_\d{2}\s\d{2}_\d{2}_\d{2}.csv', sub_file):
#         print(sub_file)
#         found_file = re.findall('processed_airbnb_batch_\d_\d{4}_\d{2}_\d{2}\s\d{2}_\d{2}_\d{2}.csv', sub_file)[0]
#         cached_listings = pd.read_csv(os.getcwd()+'/'+file+'/'+sub_file, dtype={'AirBnB_id':str})
#         if processed_listings.empty:
#           processed_listings = cached_listings
#         else:
#           processed_listings = pd.concat([processed_listings, cached_listings])

In [19]:
# processed_listings['AirBnB_id'].nunique()

In [20]:
# det_list_df.shape

In [21]:
# Load every fetched Yelp batch CSV from the unzipped "Yelp Fetched <date>" folders
# into a single frame.
fetched_dir_pattern = re.compile(r'Yelp Fetched\s\d{4}_\d{2}_\d{2}')
# \d+ also accepts batch numbers of 10 and above; the dot before "csv" is escaped
batch_file_pattern = re.compile(r'yelp_fetched_batch_\d+_\d{4}_\d{2}_\d{2}\s\d{2}_\d{2}_\d{2}\.csv$')

fetched_frames = []

# sorted() gives a reproducible load order; os.listdir() order is arbitrary
for file in sorted(os.listdir(os.getcwd())):
  if os.path.isdir(file) and fetched_dir_pattern.match(file):
    batch_dir = os.path.join(os.getcwd(), file)
    for sub_file in sorted(os.listdir(batch_dir)):
      if batch_file_pattern.match(sub_file):
        print(sub_file)
        # keep the listing id as text so it is never coerced to a number
        cached_businesses = pd.read_csv(os.path.join(batch_dir, sub_file), dtype={'AirBnB_id':str})
        if not cached_businesses.empty:
          fetched_frames.append(cached_businesses)

# Concatenate once instead of growing the frame inside the loop (which copies all
# previous rows on every iteration). ignore_index gives the combined frame a unique
# row index rather than repeating each file's 0..n-1 labels.
if fetched_frames:
  yelp_businesses = pd.concat(fetched_frames, ignore_index=True)
else:
  yelp_businesses = pd.DataFrame()

yelp_fetched_batch_2_2023_03_25 19_55_18.csv
yelp_fetched_batch_1_2023_03_25 19_55_18.csv
yelp_fetched_batch_3_2023_03_25 19_55_19.csv
yelp_fetched_batch_4_2023_03_25 19_55_19.csv
yelp_fetched_batch_5_2023_03_25 19_55_19.csv
yelp_fetched_batch_0_2023_03_25 19_55_18.csv
yelp_fetched_batch_1_2023_03_23 19_43_00.csv
yelp_fetched_batch_3_2023_03_26 15_56_57.csv
yelp_fetched_batch_2_2023_03_26 15_56_56.csv
yelp_fetched_batch_5_2023_03_26 15_56_57.csv
yelp_fetched_batch_1_2023_03_26 15_56_56.csv
yelp_fetched_batch_0_2023_03_26 15_56_56.csv
yelp_fetched_batch_4_2023_03_26 15_56_57.csv


In [22]:
yelp_businesses.shape

(134760, 27)

# 5. Get Categories from Businesses
* [Search businesses](https://docs.developer.yelp.com/reference/v3_business_search)

## 5.1 Get Sample

In [23]:
cols=[
  'id', 
  'name',
  'review_count',
  'categories',
  'rating',
  'coordinates.latitude',
  'coordinates.longitude',
  'location.address1',
  'location.city',
  'location.state',
  'distance',
  'AirBnB_id'
]

sample_business = yelp_businesses.sample()[cols]
sample_business

Unnamed: 0,id,name,review_count,categories,rating,coordinates.latitude,coordinates.longitude,location.address1,location.city,location.state,distance,AirBnB_id
9024,xlMQBBt9wrtahdqiRDcVSg,The Stillery,3074,"[{'alias': 'newamerican', 'title': 'American (...",4.5,36.16225,-86.7757,113 2nd Ave N,Nashville,TN,1821.108969,541925379681878334


In [24]:
sample_business.dtypes

id                        object
name                      object
review_count               int64
categories                object
rating                   float64
coordinates.latitude     float64
coordinates.longitude    float64
location.address1         object
location.city             object
location.state            object
distance                 float64
AirBnB_id                 object
dtype: object

In [25]:
sample_business.categories.values[0]

"[{'alias': 'newamerican', 'title': 'American (New)'}, {'alias': 'bars', 'title': 'Bars'}]"

## 5.2 Correct Cast Type for Categories Column

In [26]:
import ast

def str_list_to_list(row):
   """Parse the stringified ``categories`` cell of a row into a Python list.

   The cell is expected to hold the text of a Python literal such as
   "[{'alias': 'bars', 'title': 'Bars'}]" and is parsed with
   ``ast.literal_eval`` (literals only, no code execution).

   Returns the parsed value for a non-empty string, the value itself if it is
   already a list, and an empty list for a missing cell (NaN / None) or a
   blank string. Malformed text still raises from ``ast.literal_eval``.
   """
   raw = row.categories
   if isinstance(raw, list):
      # already parsed, e.g. when the frame was not loaded from CSV
      return raw
   if not isinstance(raw, str) or not raw.strip():
      # a missing value comes through as a non-string (NaN); treat as "no categories"
      return []
   return ast.literal_eval(raw)

In [27]:
# parse each row's stringified categories; the function is passed directly, no lambda wrapper needed
yelp_businesses['categories_list'] = yelp_businesses.apply(str_list_to_list, axis=1)

In [28]:
sample = yelp_businesses.sample()
sample.categories_list.values[0]

[{'alias': 'italian', 'title': 'Italian'},
 {'alias': 'pizza', 'title': 'Pizza'},
 {'alias': 'bars', 'title': 'Bars'}]

In [29]:
type(sample.categories_list.values[0])

list

## 5.3 Get Categories for Each Business

In [30]:
def get_categories(row):
  """Return the ``alias`` of every category in ``row['categories_list']``.

  ``row['categories_list']`` is an iterable of dicts that each carry an
  ``'alias'`` key; the aliases come back as a list in the same order.
  """
  return [entry['alias'] for entry in row['categories_list']]

In [31]:
# one list of category aliases per business row
yelp_businesses['categories_alias'] = yelp_businesses.apply(get_categories, axis=1)

In [32]:
sample = yelp_businesses.sample()
sample.categories.values[0]

"[{'alias': 'southern', 'title': 'Southern'}, {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'}, {'alias': 'tradamerican', 'title': 'American (Traditional)'}]"

In [33]:
sample.categories_list.values[0]

[{'alias': 'southern', 'title': 'Southern'},
 {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'},
 {'alias': 'tradamerican', 'title': 'American (Traditional)'}]

In [34]:
sample.categories_alias.values[0]

['southern', 'breakfast_brunch', 'tradamerican']

## 5.4 Map Each Category to Main Category

In [35]:
def get_main_category(row, category_map=None):
  """Return the primary root categories that a business's aliases belong to.

  Parameters
  ----------
  row : mapping-like
    Must provide ``row['categories_alias']``, an iterable of category aliases.
  category_map : dict, optional
    Maps alias -> [root alias, root name]. Defaults to the notebook-level
    ``branches`` dictionary.

  Returns
  -------
  list
    Distinct root aliases restricted to the primary categories, in the order
    they are first encountered. Aliases missing from the map, or whose root
    is not a primary category, are skipped.
  """
  if category_map is None:
    category_map = branches

  primary_cat = ('active', 'arts', 'food', 'nightlife', 'restaurants', 'shopping')

  # A dict is used as an insertion-ordered set: the result no longer depends on
  # string hashing, so repeated runs give the same order.
  main_cat = {}
  for cat in row['categories_alias']:
    entry = category_map.get(cat)
    if entry is not None and entry[0] in primary_cat:
      main_cat[entry[0]] = None

  return list(main_cat)

In [36]:
# one list of primary root categories per business row
yelp_businesses['categories_main'] = yelp_businesses.apply(get_main_category, axis=1)

In [37]:
sample = yelp_businesses.sample()
sample.categories.values[0]

"[{'alias': 'southern', 'title': 'Southern'}, {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'}, {'alias': 'tradamerican', 'title': 'American (Traditional)'}]"

In [38]:
sample.categories_list.values[0]

[{'alias': 'southern', 'title': 'Southern'},
 {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'},
 {'alias': 'tradamerican', 'title': 'American (Traditional)'}]

In [39]:
sample.categories_alias.values[0]

['southern', 'breakfast_brunch', 'tradamerican']

In [40]:
sample.categories_main

15422    [restaurants]
Name: categories_main, dtype: object

## 5.5 Count Nearby Entertainment

In [41]:
yelp_businesses['distance_miles'] = yelp_businesses['distance']*0.000621371 # meters to miles

In [42]:
# Distance bands in miles, consumed by count_nearby_ent.
# A tuple (a, b) is a bounded band; a bare number is an open-ended "beyond" band.
# Note that (4) is simply the int 4 -- parentheses alone do not create a tuple.
# Bounds are read as (a,b], (a, inf)
# OR [a,b), [a, inf) depending on the flip argument of count_nearby_ent.

# intervals = [(0,1), (1,5), (5)] 
intervals = [(0,2), (2,4), (4)]
# intervals = [(0,2), (2,5), (5)]

In [43]:
def count_nearby_ent(df, interval, flip=False):
  """Add one 0/1 indicator column per distance band to a copy of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain a numeric ``distance_miles`` column. It is not modified.
  interval : iterable
    The distance bands. A tuple ``(a, b)`` produces a column ``within_a_b``;
    any non-tuple value ``a`` produces an open-ended column ``beyond_a``.
  flip : bool, default False
    ``False`` -> bands are ``(a, b]`` and ``(a, inf)``.
    ``True``  -> bands are ``[a, b)`` and ``[a, inf)``.

  Returns
  -------
  tuple
    ``(df_updated, interval_cols)``: the copied frame with the new columns,
    and the list of new column names in band order.

  Notes
  -----
  The bands are taken from the ``interval`` argument. The previous version
  ignored the argument and iterated the notebook-level ``intervals`` instead.
  Missing distances compare as False and therefore get 0 in every band.
  """
  df_updated = df.copy(deep=True)
  distances = df_updated['distance_miles']
  interval_cols = []

  for bounds in interval:
    if isinstance(bounds, tuple):
      lower_bound = bounds[0]
      upper_bound = bounds[1]
      col = 'within_{}_{}'.format(lower_bound, upper_bound)
      if flip:
        # [a, b)
        in_band = (distances >= lower_bound) & (distances < upper_bound)
      else:
        # (a, b]
        in_band = (distances > lower_bound) & (distances <= upper_bound)

    # open-ended band: everything at / past the last threshold
    else:
      lower_bound = bounds
      col = 'beyond_{}'.format(lower_bound)
      if flip:
        # [a, inf)
        in_band = distances >= lower_bound
      else:
        # (a, inf)
        in_band = distances > lower_bound

    # vectorised comparison instead of a per-row lambda; stored as 0/1 integers
    df_updated[col] = in_band.astype('int64')
    interval_cols.append(col)

  return (df_updated, interval_cols)
  

In [44]:
yelp_businesses_updated, interval_cols = count_nearby_ent(yelp_businesses, intervals, True)

In [45]:
interval_cols

['within_0_2', 'within_2_4', 'beyond_4']

In [46]:
yelp_businesses_updated.sample(n=5)[['id', 'name', 'AirBnB_id', 'categories_main', 'distance', 'distance_miles']+interval_cols]

Unnamed: 0,id,name,AirBnB_id,categories_main,distance,distance_miles,within_0_2,within_2_4,beyond_4
9197,Jzb2IQSwPfULWzdl9Bq5fg,Monell's Dining & Catering,660006100239755718,[restaurants],2760.410221,1.715239,1,0,0
3294,2OXCAbz3x_MrQSYj1zarIA,Pinewood Social,48826258,"[restaurants, nightlife, arts]",1457.936874,0.90592,1,0,0
241,GXFMD0Z4jEVZBCsbPf4CTQ,Hattie B’s Hot Chicken - Nashville - Midtown,765369948212991090,[restaurants],9863.294878,6.128765,0,0,1
15088,Gi4ex2Tq7ggRlpN0S7k5Xg,Jeni's Splendid Ice Creams,17072555,[food],915.15969,0.568654,1,0,0
2515,wUZXm4KN2wIqlfkvXVZXUw,Crema Coffee Roasters,37559742,[food],2716.351409,1.687862,1,0,0


In [50]:
exploded_df = yelp_businesses_updated.explode('categories_main')

In [51]:
# Sum every distance-band indicator per (listing, main category).
# The string name 'sum' selects pandas' built-in aggregation; passing the numpy
# function np.sum to agg() is deprecated in recent pandas releases.
agg_dict = {col: 'sum' for col in interval_cols}

# rows whose categories_main is missing after explode() fall out of the groupby,
# because groupby drops NaN keys by default
grouped_df = exploded_df.groupby(['AirBnB_id', 'categories_main']).agg(agg_dict).reset_index()

In [52]:
grouped_df.head(n=6)

Unnamed: 0,AirBnB_id,categories_main,within_0_2,within_2_4,beyond_4
0,10017234,active,1,0,0
1,10017234,food,1,2,0
2,10017234,nightlife,0,3,1
3,10017234,restaurants,8,4,5
4,10036680,arts,1,0,0
5,10036680,food,0,2,0


In [53]:
# sanity check
yelp_businesses_updated[yelp_businesses_updated['AirBnB_id']=='10017234']['categories'].values

array(["[{'alias': 'chicken_wings', 'title': 'Chicken Wings'}, {'alias': 'chickenshop', 'title': 'Chicken Shop'}, {'alias': 'sandwiches', 'title': 'Sandwiches'}]",
       "[{'alias': 'thai', 'title': 'Thai'}, {'alias': 'asianfusion', 'title': 'Asian Fusion'}, {'alias': 'noodles', 'title': 'Noodles'}]",
       "[{'alias': 'mexican', 'title': 'Mexican'}, {'alias': 'bars', 'title': 'Bars'}]",
       "[{'alias': 'cafes', 'title': 'Cafes'}, {'alias': 'vegetarian', 'title': 'Vegetarian'}, {'alias': 'vegan', 'title': 'Vegan'}]",
       "[{'alias': 'southern', 'title': 'Southern'}, {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'}, {'alias': 'tradamerican', 'title': 'American (Traditional)'}]",
       "[{'alias': 'bbq', 'title': 'Barbeque'}]",
       "[{'alias': 'newamerican', 'title': 'American (New)'}, {'alias': 'bars', 'title': 'Bars'}]",
       "[{'alias': 'mideastern', 'title': 'Middle Eastern'}, {'alias': 'kebab', 'title': 'Kebab'}, {'alias': 'turkish', 'title': 'Turkish'}]",


In [54]:
yelp_businesses_updated[yelp_businesses_updated['AirBnB_id']=='10017234']['categories_main'].values

array([list(['restaurants']), list(['restaurants']),
       list(['restaurants', 'nightlife']), list(['restaurants']),
       list(['restaurants']), list(['restaurants']),
       list(['restaurants', 'nightlife']), list(['restaurants']),
       list(['restaurants']), list(['food']),
       list(['restaurants', 'nightlife']), list(['active']),
       list(['restaurants']), list(['restaurants']),
       list(['restaurants']), list(['food']), list(['restaurants']),
       list(['restaurants', 'food']), list(['restaurants']),
       list(['restaurants', 'nightlife'])], dtype=object)

## 5.6 Pivot and Merge

In [55]:
# Build one wide frame per distance band: a row per listing and a count column per
# main category, named "<category>_<band>".
pivots = []

# Same six categories that get_main_category keeps. Reindexing to this list
# guarantees every column exists (filled with 0) even if a category never occurs
# in the data; casting/renaming a missing column would otherwise raise KeyError.
primary_categories = ['active', 'arts', 'food', 'nightlife', 'restaurants', 'shopping']

# interval_cols already holds the band column names produced by count_nearby_ent,
# so the names are not rebuilt from `intervals` here.
for col in interval_cols:
  pivot_df = (
      grouped_df
      .pivot(index='AirBnB_id', columns='categories_main', values=col)
      .reindex(columns=primary_categories)
      .fillna(0)                 # listing has no business of that category in this band
      .astype(int)
      .add_suffix('_' + col)     # e.g. food -> food_within_0_2
      .reset_index()
  )

  pivot_df.columns.name = None   # drop the leftover 'categories_main' axis label
  pivots.append(pivot_df)


In [56]:
pivots[0].dtypes

AirBnB_id                 object
active_within_0_2          int64
arts_within_0_2            int64
food_within_0_2            int64
nightlife_within_0_2       int64
restaurants_within_0_2     int64
shopping_within_0_2        int64
dtype: object

In [57]:
pivots[0].shape

(6738, 7)

In [58]:
pivots[0].head()

Unnamed: 0,AirBnB_id,active_within_0_2,arts_within_0_2,food_within_0_2,nightlife_within_0_2,restaurants_within_0_2,shopping_within_0_2
0,10017234,1,0,1,0,8,0
1,10036680,0,1,0,1,4,0
2,10056974,0,0,0,0,0,0
3,1006989,0,0,2,2,8,0
4,1009550,0,1,0,1,3,0


In [59]:
pivots[1].dtypes

AirBnB_id                 object
active_within_2_4          int64
arts_within_2_4            int64
food_within_2_4            int64
nightlife_within_2_4       int64
restaurants_within_2_4     int64
shopping_within_2_4        int64
dtype: object

In [60]:
pivots[1].shape

(6738, 7)

In [61]:
pivots[1].head()

Unnamed: 0,AirBnB_id,active_within_2_4,arts_within_2_4,food_within_2_4,nightlife_within_2_4,restaurants_within_2_4,shopping_within_2_4
0,10017234,0,0,2,3,4,0
1,10036680,0,0,2,2,7,0
2,10056974,0,0,6,4,10,0
3,1006989,0,1,1,2,9,0
4,1009550,0,0,2,3,8,0


In [62]:
pivots[2].dtypes

AirBnB_id               object
active_beyond_4          int64
arts_beyond_4            int64
food_beyond_4            int64
nightlife_beyond_4       int64
restaurants_beyond_4     int64
shopping_beyond_4        int64
dtype: object

In [63]:
pivots[2].shape

(6738, 7)

In [64]:
pivots[2].head()

Unnamed: 0,AirBnB_id,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4
0,10017234,0,0,0,1,5,0
1,10036680,0,0,0,1,6,0
2,10056974,0,0,0,1,6,0
3,1006989,0,0,0,0,0,0
4,1009550,0,0,0,1,6,0


In [65]:
# Combine the per-band pivots into a single frame keyed by listing id.
merged_df = pd.DataFrame()

for df in pivots:
  # The first pivot seeds the result; each later one is inner-joined on AirBnB_id.
  # reset_index() keeps the previous row index as a column ('index', then 'level_0'),
  # which a later cell removes again.
  merged_df = df if merged_df.empty else merged_df.merge(df, how='inner', on='AirBnB_id').reset_index()


In [66]:
merged_df.shape

(6738, 21)

In [67]:
merged_df.dtypes

level_0                    int64
index                      int64
AirBnB_id                 object
active_within_0_2          int64
arts_within_0_2            int64
food_within_0_2            int64
nightlife_within_0_2       int64
restaurants_within_0_2     int64
shopping_within_0_2        int64
active_within_2_4          int64
arts_within_2_4            int64
food_within_2_4            int64
nightlife_within_2_4       int64
restaurants_within_2_4     int64
shopping_within_2_4        int64
active_beyond_4            int64
arts_beyond_4              int64
food_beyond_4              int64
nightlife_beyond_4         int64
restaurants_beyond_4       int64
shopping_beyond_4          int64
dtype: object

In [68]:
# Drop the helper columns that reset_index() added during the merge loop.
# They are selected by name rather than by position, so the cell is safe to re-run
# (a positional [2:] would cut off AirBnB_id the second time) and does not depend
# on how many pivots were merged.
index_helper_cols = {'level_0', 'index'}
cols = [column for column in merged_df.columns if column not in index_helper_cols]
merged_df = merged_df[cols]
merged_df.head()

Unnamed: 0,AirBnB_id,active_within_0_2,arts_within_0_2,food_within_0_2,nightlife_within_0_2,restaurants_within_0_2,shopping_within_0_2,active_within_2_4,arts_within_2_4,food_within_2_4,nightlife_within_2_4,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4
0,10017234,1,0,1,0,8,0,0,0,2,3,4,0,0,0,0,1,5,0
1,10036680,0,1,0,1,4,0,0,0,2,2,7,0,0,0,0,1,6,0
2,10056974,0,0,0,0,0,0,0,0,6,4,10,0,0,0,0,1,6,0
3,1006989,0,0,2,2,8,0,0,1,1,2,9,0,0,0,0,0,0,0
4,1009550,0,1,0,1,3,0,0,0,2,3,8,0,0,0,0,1,6,0


# 6. Export Results

In [69]:
# Write the per-listing entertainment counts next to the notebook.
# NOTE(review): to_csv is called with its default index setting, so the row index is
# presumably written as an extra unnamed first column -- confirm that readers of
# this file expect it.
merged_df.to_csv('./nearby_entertainment.csv')

In [70]:
# sanity check
main_df = det_list_df.merge(merged_df, how='left', left_on='id', right_on='AirBnB_id')

In [71]:
main_df.shape

(6738, 31)

In [72]:
ent_count = main_df[cols[1:]].sum(axis=1)
ent_count.head()

0    30
1    26
2    26
3    25
4    26
dtype: int64

In [73]:
ent_count[ent_count<20]

Series([], dtype: int64)