# Part 1.5: Cleaning new Yelp API dataset

Due to limits (missing major cities and price values) in the Yelp academic dataset, we have collected additional Yelp data using their Yelp Fusion API. Please refer to notebooks "Yelp_zipcode_api_pull.ipynb" 

**This notebook performs the following:**
1. Loads preliminarily cleaned data from csv file (see "Restaurant_data_merge_and_clean.ipynb")
2. Captures all unique categories
3. Filters categories for restaurant or bar related establishments (primary activity should be to serve food and drink)
4. Exports processed data to a local SQL for further processing (data is grouped by zip code using SQL)
5. Imports processed data from SQL and performs count of diversity of restaurant types by zip code

In [1]:
# Dependencies
import os
import pandas as pd
import numpy as np
import re
import ast

import dataframe_image as dfi

In [2]:
# Load preprocessed data from API scraping
# api_data_df = pd.read_csv('../Restaurants_and_House_Prices/Final/Resources_final/yelp_api_final.csv')
# Correct data (foreign zip codes removed by merging with US housing data):
api_data_df = pd.read_csv('../Restaurants_and_House_Prices/Final/Resources_final/yelp_housing_merge.csv')

In [3]:
api_data_df

Unnamed: 0.1,Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,price,delivery,pickup,restaurant_reservation,price_value_1.0,price_value_2.0,price_value_3.0,price_value_4.0
0,0,60657.0,Chicago,IL,Cook County,507204.0,41.936430,-87.661410,354,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",2.0,1,1,0,0,1,0,0
1,1,60657.0,Chicago,IL,Cook County,507204.0,41.940150,-87.653860,212,4.5,"[{'alias': 'wine_bars', 'title': 'Wine Bars'},...",2.0,1,0,0,0,1,0,0
2,2,60657.0,Chicago,IL,Cook County,507204.0,41.937420,-87.648300,60,5.0,"[{'alias': 'pizza', 'title': 'Pizza'}]",0.0,1,0,0,0,0,0,0
3,3,60657.0,Chicago,IL,Cook County,507204.0,41.942829,-87.649185,593,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",2.0,1,1,0,0,1,0,0
4,4,60657.0,Chicago,IL,Cook County,507204.0,41.947133,-87.646892,35,5.0,"[{'alias': 'mexican', 'title': 'Mexican'}]",0.0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240584,243462,7083.0,Union,NJ,Union County,414081.0,40.696799,-74.251397,339,4.0,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",2.0,1,0,0,0,1,0,0
240585,243463,7083.0,Union,NJ,Union County,414081.0,40.697124,-74.268196,258,3.5,"[{'alias': 'chicken_wings', 'title': 'Chicken ...",2.0,0,1,0,0,1,0,0
240586,243464,7083.0,Union,NJ,Union County,414081.0,40.697392,-74.268416,43,4.5,"[{'alias': 'vietnamese', 'title': 'Vietnamese'}]",0.0,1,1,0,0,0,0,0
240587,243465,7083.0,Union,NJ,Union County,414081.0,40.696692,-74.269194,605,4.0,"[{'alias': 'cafes', 'title': 'Cafes'}, {'alias...",2.0,1,1,0,0,1,0,0


In [4]:
# Explore
api_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240589 entries, 0 to 240588
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              240589 non-null  int64  
 1   postal_code             240589 non-null  float64
 2   City                    240589 non-null  object 
 3   State                   240589 non-null  object 
 4   CountyName              240589 non-null  object 
 5   2021                    240589 non-null  float64
 6   latitude                240583 non-null  float64
 7   longitude               240583 non-null  float64
 8   review_count            240589 non-null  int64  
 9   rating                  240589 non-null  float64
 10  categories              240589 non-null  object 
 11  price                   240589 non-null  float64
 12  delivery                240589 non-null  int64  
 13  pickup                  240589 non-null  int64  
 14  restaurant_reservati

In [5]:
# Drop previous index and price columns
api_data_df.drop(columns=['Unnamed: 0', 'price'],axis=1, inplace=True)

In [6]:
api_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240589 entries, 0 to 240588
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   postal_code             240589 non-null  float64
 1   City                    240589 non-null  object 
 2   State                   240589 non-null  object 
 3   CountyName              240589 non-null  object 
 4   2021                    240589 non-null  float64
 5   latitude                240583 non-null  float64
 6   longitude               240583 non-null  float64
 7   review_count            240589 non-null  int64  
 8   rating                  240589 non-null  float64
 9   categories              240589 non-null  object 
 10  delivery                240589 non-null  int64  
 11  pickup                  240589 non-null  int64  
 12  restaurant_reservation  240589 non-null  int64  
 13  price_value_1.0         240589 non-null  int64  
 14  price_value_2.0     

In [7]:
# Explore null values in postal_code column
api_data_df.postal_code.isna().sum()

0

In [8]:
len(api_data_df)

240589

In [9]:
# Explore postal codes
api_data_df.postal_code.value_counts()

11354.0    230
11211.0    223
10013.0    210
10003.0    208
10019.0    205
          ... 
55956.0      1
47983.0      1
74932.0      1
84339.0      1
31801.0      1
Name: postal_code, Length: 13988, dtype: int64

In [11]:
# Explore states
api_data_df.State.value_counts()
# Seems correct - US states only

CA    23056
NY    22705
TX    22637
FL    11960
OH    11943
PA    10983
IL    10316
MI     7572
NC     7162
GA     7108
IN     7058
VA     6343
TN     6291
MO     5940
WA     4969
AZ     4954
CO     4676
AL     4458
WI     4366
KY     4314
MN     4196
SC     4171
LA     3607
OK     3538
OR     3489
MD     3392
IA     3344
AR     3129
KS     3032
NV     2476
NE     2313
MS     2298
WV     1875
UT     1533
NM     1257
DE     1138
DC      977
NJ      678
ND      676
HI      674
ID      621
MT      615
SD      613
AK      603
CT      562
WY      510
VT      434
MA       24
NH        2
RI        1
Name: State, dtype: int64

### Create dummies for ratings

In [12]:
# Create new column
api_data_df['stars']=api_data_df['rating']

In [13]:
api_data_df.head()

Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,delivery,pickup,restaurant_reservation,price_value_1.0,price_value_2.0,price_value_3.0,price_value_4.0,stars
0,60657.0,Chicago,IL,Cook County,507204.0,41.93643,-87.66141,354,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",1,1,0,0,1,0,0,4.0
1,60657.0,Chicago,IL,Cook County,507204.0,41.94015,-87.65386,212,4.5,"[{'alias': 'wine_bars', 'title': 'Wine Bars'},...",1,0,0,0,1,0,0,4.5
2,60657.0,Chicago,IL,Cook County,507204.0,41.93742,-87.6483,60,5.0,"[{'alias': 'pizza', 'title': 'Pizza'}]",1,0,0,0,0,0,0,5.0
3,60657.0,Chicago,IL,Cook County,507204.0,41.942829,-87.649185,593,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",1,1,0,0,1,0,0,4.0
4,60657.0,Chicago,IL,Cook County,507204.0,41.947133,-87.646892,35,5.0,"[{'alias': 'mexican', 'title': 'Mexican'}]",1,1,0,0,0,0,0,5.0


In [14]:
api_expand_df = pd.get_dummies(api_data_df,columns=['stars'])

In [15]:
api_expand_df.head()

Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,...,stars_0.0,stars_1.0,stars_1.5,stars_2.0,stars_2.5,stars_3.0,stars_3.5,stars_4.0,stars_4.5,stars_5.0
0,60657.0,Chicago,IL,Cook County,507204.0,41.93643,-87.66141,354,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0,0,0,0,0,0,0,1,0,0
1,60657.0,Chicago,IL,Cook County,507204.0,41.94015,-87.65386,212,4.5,"[{'alias': 'wine_bars', 'title': 'Wine Bars'},...",...,0,0,0,0,0,0,0,0,1,0
2,60657.0,Chicago,IL,Cook County,507204.0,41.93742,-87.6483,60,5.0,"[{'alias': 'pizza', 'title': 'Pizza'}]",...,0,0,0,0,0,0,0,0,0,1
3,60657.0,Chicago,IL,Cook County,507204.0,41.942829,-87.649185,593,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0,0,0,0,0,0,0,1,0,0
4,60657.0,Chicago,IL,Cook County,507204.0,41.947133,-87.646892,35,5.0,"[{'alias': 'mexican', 'title': 'Mexican'}]",...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Export to csv
api_expand_df.to_csv('../Processed_Data/corrected_yelp_api_expand1.csv', index=False)

In [17]:
# Save dataframe image (in case needed for presentations)
dfi.export(api_expand_df.head(), "Images/corrected_yelp_api_expand1.png")

### Parse out categories

In [18]:
# Initial exploration of categories (run sample method multiple times)
api_expand_df['categories'].sample(20)

204079    [{'alias': 'tapas', 'title': 'Tapas Bars'}, {'...
202615           [{'alias': 'italian', 'title': 'Italian'}]
196111    [{'alias': 'delis', 'title': 'Delis'}, {'alias...
93460                [{'alias': 'pizza', 'title': 'Pizza'}]
76441            [{'alias': 'seafood', 'title': 'Seafood'}]
11680     [{'alias': 'chickenshop', 'title': 'Chicken Sh...
49501     [{'alias': 'diners', 'title': 'Diners'}, {'ali...
3701             [{'alias': 'mexican', 'title': 'Mexican'}]
163039               [{'alias': 'pizza', 'title': 'Pizza'}]
153146    [{'alias': 'grocery', 'title': 'Grocery'}, {'a...
24717            [{'alias': 'mexican', 'title': 'Mexican'}]
67432               [{'alias': 'bbq', 'title': 'Barbeque'}]
140607    [{'alias': 'ramen', 'title': 'Ramen'}, {'alias...
47193     [{'alias': 'thai', 'title': 'Thai'}, {'alias':...
6880      [{'alias': 'hotdogs', 'title': 'Fast Food'}, {...
9948      [{'alias': 'indpak', 'title': 'Indian'}, {'ali...
2956             [{'alias': 'mexican', '

In [19]:
# Make function to capture alias column (This code previous tested in other notebooks)
# Also converts to string and gets rid of unwanted string charaters

def convert_cat(data):
    cat_dict = pd.DataFrame({'all_cat':[ast.literal_eval(data)]})
    result1 = cat_dict['all_cat'].apply(lambda x: [d['alias'] for d in x]).astype(str)
    result2 = result1.str.cat(sep=",")
    y = re.sub('\[|\]| |\'', "", result2)
    return y
    

In [20]:
# Make a new dataframe
categories_expanded_df = api_expand_df.copy()

In [21]:
# Create new column for aliases
categories_expanded_df['alias_categories'] = categories_expanded_df["categories"].apply(lambda x: convert_cat(x))

In [22]:
# Check new dataframe
categories_expanded_df.head()

Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,...,stars_1.0,stars_1.5,stars_2.0,stars_2.5,stars_3.0,stars_3.5,stars_4.0,stars_4.5,stars_5.0,alias_categories
0,60657.0,Chicago,IL,Cook County,507204.0,41.93643,-87.66141,354,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0,0,0,0,0,0,1,0,0,"bars,newamerican"
1,60657.0,Chicago,IL,Cook County,507204.0,41.94015,-87.65386,212,4.5,"[{'alias': 'wine_bars', 'title': 'Wine Bars'},...",...,0,0,0,0,0,0,0,1,0,"wine_bars,italian,tapasmallplates"
2,60657.0,Chicago,IL,Cook County,507204.0,41.93742,-87.6483,60,5.0,"[{'alias': 'pizza', 'title': 'Pizza'}]",...,0,0,0,0,0,0,0,0,1,pizza
3,60657.0,Chicago,IL,Cook County,507204.0,41.942829,-87.649185,593,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0,0,0,0,0,0,1,0,0,"bars,newamerican,breakfast_brunch"
4,60657.0,Chicago,IL,Cook County,507204.0,41.947133,-87.646892,35,5.0,"[{'alias': 'mexican', 'title': 'Mexican'}]",...,0,0,0,0,0,0,0,0,1,mexican


In [23]:
categories_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240589 entries, 0 to 240588
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   postal_code             240589 non-null  float64
 1   City                    240589 non-null  object 
 2   State                   240589 non-null  object 
 3   CountyName              240589 non-null  object 
 4   2021                    240589 non-null  float64
 5   latitude                240583 non-null  float64
 6   longitude               240583 non-null  float64
 7   review_count            240589 non-null  int64  
 8   rating                  240589 non-null  float64
 9   categories              240589 non-null  object 
 10  delivery                240589 non-null  int64  
 11  pickup                  240589 non-null  int64  
 12  restaurant_reservation  240589 non-null  int64  
 13  price_value_1.0         240589 non-null  int64  
 14  price_value_2.0     

In [24]:
# Explore categories
categories_expanded_df['alias_categories'].sample(20)

117466                                     mexican
105714    tradamerican,sportsbars,breakfast_brunch
175031                                     chinese
41099                                      mexican
127099                                       steak
96330                      wine_bars,beerbar,tapas
43249                          dimsum,cocktailbars
20852                              mexican,hotdogs
197635               bowling,lounges,chicken_wings
138005                           sushi,asianfusion
231583                                       pizza
25942                     brasseries,french,coffee
18759                                   newmexican
28577                                        delis
15482                                   salvadoran
115848                                    peruvian
34464                        ramen,bubbletea,sushi
230522        tradamerican,coffee,breakfast_brunch
54149                          salad,mediterranean
48995                       spo

In [25]:
# Make list of ALL categories for ALL businesses (unlikely to need this but just in case)
raw_categories_list = categories_expanded_df['alias_categories'].to_list()

In [26]:
raw_categories_list

['bars,newamerican',
 'wine_bars,italian,tapasmallplates',
 'pizza',
 'bars,newamerican,breakfast_brunch',
 'mexican',
 'wine_bars,mediterranean,newamerican',
 'sandwiches,pubs,newamerican',
 'newamerican,breweries',
 'cajun,seafood,desserts',
 'cafes,venezuelan',
 'breakfast_brunch,coffee,comfortfood',
 'tradamerican,breakfast_brunch,cocktailbars',
 'breakfast_brunch,bagels,salad',
 'sandwiches,burgers,salad',
 'seafood',
 'korean,chicken_wings',
 'french,comfortfood,desserts',
 'newamerican,fishnchips,burgers',
 'korean,asianfusion,breakfast_brunch',
 'italian',
 'ramen',
 'hotdogs,sandwiches,burgers',
 'pizza,pubs,tradamerican',
 'filipino,newamerican,cocktailbars',
 'seafood,wine_bars,beerbar',
 'newamerican,breakfast_brunch',
 'pizza,italian',
 'pubs,irish',
 'breweries,newamerican',
 'bars,pizza,breakfast_brunch',
 'burgers',
 'mediterranean,mideastern',
 'tradamerican,bars,venues',
 'italian',
 'japanese,thai,chinese',
 'mediterranean,wine_bars',
 'vietnamese,cocktailbars,wine_b

In [27]:
# Find all unique categories and count occurances
unique_categories = {}

for index,row in categories_expanded_df.iterrows():
    types_list = categories_expanded_df['alias_categories'][index].split(",")
    for category in types_list:
        if category not in unique_categories:
            unique_categories[category]=1
        elif category in unique_categories:
            unique_categories[category]+=1

In [28]:
len(unique_categories)

520

In [29]:
# Create dataframe of unique categories list with count
all_unique_cat_df = pd.DataFrame.from_dict(unique_categories, orient='index')
all_unique_cat_df.head()

Unnamed: 0,0
bars,13166
newamerican,15980
wine_bars,3741
italian,13440
tapasmallplates,1568


In [30]:
# Rename count column and sort
all_unique_cat_df.rename(columns={0:"Count"}, inplace=True)
all_unique_cat_df.sort_values(by=["Count"], ascending=False, inplace=True)

In [31]:
all_unique_cat_df

Unnamed: 0,Count
tradamerican,34025
pizza,26642
sandwiches,26485
burgers,25779
mexican,25001
...,...
adultentertainment,1
golfequipment,1
headshops,1
surfshop,1


In [32]:
# Make list of all unique categories for manual filtering
unique_categories_list = unique_categories.keys()
unique_categories_list

dict_keys(['bars', 'newamerican', 'wine_bars', 'italian', 'tapasmallplates', 'pizza', 'breakfast_brunch', 'mexican', 'mediterranean', 'sandwiches', 'pubs', 'breweries', 'cajun', 'seafood', 'desserts', 'cafes', 'venezuelan', 'coffee', 'comfortfood', 'tradamerican', 'cocktailbars', 'bagels', 'salad', 'burgers', 'korean', 'chicken_wings', 'french', 'fishnchips', 'asianfusion', 'ramen', 'hotdogs', 'filipino', 'beerbar', 'irish', 'mideastern', 'venues', 'japanese', 'thai', 'chinese', 'vietnamese', 'bbq', 'gastropubs', 'noodles', 'sushi', 'himalayan', 'indpak', 'chickenshop', 'australian', 'brazilian', 'japacurry', 'brewpubs', 'hotdog', 'tex-mex', 'conveyorsushi', 'sportsbars', 'danceclubs', 'modern_european', 'georgian', 'vegan', 'southern', 'turkish', 'kebab', 'steak', 'scandinavian', 'colombian', 'juicebars', 'grocery', 'cupcakes', 'karaoke', 'caribbean', 'creperies', 'falafel', 'greek', 'tacos', 'lounges', 'uzbek', 'latin', 'bakeries', 'hawaiian', 'diners', 'popuprestaurants', 'scottish'

#### Manually filter for restaurant related categories

Primary activity of business should be serving food and drink

Non-food/drink related categories removed: 'venues', 'danceclubs', 'karaoke', 'lounges','musicvenues', 'eventplanning', 'festivals', 'comedyclubs',  'jazzandblues', 'arcades', 'bowling', 'sports_clubs', 'boattours', 'convenience', 'publicmarkets', '', 'tabletopgames', 'golf', 'nightlife', 'poolhalls', 'giftshops', 'social_clubs', 'videogamestores', 'airportlounges', 'bike_repair_maintenance', 'sportswear', 'nonprofit', 'movietheaters', 'dinnertheater', 'markets', 'popupshops', 'museums', 'arts', 'theater', 'culturalcenter',  'organic_stores', 'aquariums', 'vinyl_records', 'hotels', 'furniture', 'florists', 'photographystores', 'dog_parks', 'hardware', 'bookstores', 'cigarbars', 'servicestations', 'galleries', 'mini_golf', 'yoga', 'massage_therapy', 'nutritionists', 'jewelry', 'vacation_rentals', 'bartenders', 'tours', 'pickleball', 'boating', 'marinas', 'gyms', 'thrift_stores', 'laundromat', 'shoppingcenters', 'beautysvc', 'barbers', 'personal_injury', 'workerscomplaw', 'businesslawyers', 'stadiumsarenas', 'countrydancehalls', 'bikerentals', 'surfing', 'churches', 'menscloth', 'autorepair', 'drugstores', 'sportsbetting', 'football', 'eventservices', 'media', 'bedbreakfast', 'axethrowing', 'carwash', 'indoor_playcenter', 'casinos', 'swimmingpools', 'medcenters', 'paintyourownpottery', 'antiques', 'paintandsip', 'playgrounds', 'vintage', 'horseracing', 'suppliesrestaurant', 'tobaccoshops', 'homedecor', 'waterstores', 'amusementparks', 'petadoption', 'wedding_planning', 'djs', 'specialed', 'escapegames', 'localflavor', 'sharedofficespaces', 'pharmacy', 'weightlosscenters', 'spas', 'toys', 'kids_activities', 'countryclubs', 'cabaret', 'veteransorganizations', 'musicvideo', 'hobbyshops', 'autopartssupplies', 'fleamarkets', 'artclasses', 'bikes', 'rodeo', 'facepainting', 'graphicdesign', 'adultentertainment', 'artschools', 'kitchensupplies', 'permanentmakeup', 'hair', 'waxing', 'accessories', 'partyequipmentrentals', 'poolbilliards', 'rafting', 'attractionfarms', 'flowers', 'parks', 'hotelstravel', 'recreation', 'contractors', 'painters', 'resorts', 'driveintheater', 'bocceball', 'artsandcrafts', 'discgolf', 'triviahosts', 'gardening', 'herbsandspices', 'mobilephonerepair', 'battingcages', 'partybusrentals', 'landmarks', 'tennis', 'animalshelters', 'gun_ranges', 'guns_and_ammo', 'collegeuniv', 'historicaltours', 'vitaminssupplements', 'skincare', 'fitness', 'campgrounds', 'framing', 'guesthouses', 'realestate', 'golflessons', 'magicians', 'evchargingstations', 'womenscloth', 'wholesale_stores', 'candlestores', 'communitycenters', 'buildingsupplies', 'grillservices', 'healthcoach', 'auctionhouses', 'personal_shopping', 'kitchenincubators', 'partysupplies', 'custommerchandise', 'usedbooks', 'oilchange', 'rvparks', 'gokarts', 'parasailing', 'comicbooks', 'beaches', 'fishing', 'virtualrealitycenters', 'massage', 'specialtyschools', 'skishops', 'huntingfishingsupplies', 'surfshop', 'headshops', 'pettingzoos', 'golfequipment', 'boatcharters', 'artmuseums', 'floraldesigners', 'stationery', 'lasertag', 'skatingrinks', 'ranches', 'bridal', 'auto_detailing', 'religiousitems', 'shopping', 'saunas', 'halotherapy', 'petstore', 'psychic_astrology', 'souvenirs', 'livestocksupply', 'musicians', 'bingo', 'rvrental', 'financialadvising', 'lakes', 'libraries', 'plumbing', 'hvac', 'yelpevents', 'baseballfields', 'pickyourown', 'musicalinstrumentsandteachers', 'sandblasting', 'othersalons', 'parking', 'videoandgames', 'hiking', 'wildlifehunting', 'deptstores', 'motorcyclinggear', 'petservices', 'movers', 'reststops', 'fueldocks', 'banks', 'realestatesvcs', 'outdoorgear', 'costumes', 'truckrepair', 'elementaryschools', 'highschools', 'postoffices', 'cosmetics', 'tanning', 'active', 'musicinstrumentservices', 'landscaping', 'motorcycledealers', 'skateshops', 'holidaydecorations', 'tires', 'skiresorts', 'sportgoods', 'wholesalers', 'mags', 'hotsprings', 'international', 'walkingtours', 'propane', 'ticketsales', 'grillingequipment', 'homeandgarden', 'jetskis', 'hostels', 'atvrentals', 'archery', 'sledding', 'tcm', 'vapeshops', 'commercialrealestate', 'healthtrainers' 

Non-restaurant food/drink items removed (too generic or seems like grocery stores): 'grocery', 'restaurants', 'catering', 'food_court', 'cookingschools', 'fooddeliveryservices', 'butcher', 'meats', 'beverage_stores', 'healthmarkets', 'cafeteria', 'hookah_bars', 'personalchefs', 'intlgrocery', 'importedfood', 'seafoodmarkets', 'farmersmarket', 'winetastingroom', 'cookingclasses', 'winetasteclasses', 'food', 'candy', 'chocolate', 'farms', 'winetours', 'oliveoil', 'honey', 'popcorn', 'cheesetastingclasses', 'beertours', 'gamemeat', 'foodtours', 'foodbanks', 'csa', 'brewingsupplies', 

In [33]:
# Create new list of only restaurant/bar related categories (must serve food/drink as primary activity)
restaurant_categories_list=['bars', 'newamerican', 'wine_bars', 'italian', 'tapasmallplates', 'pizza',\
                            'breakfast_brunch', 'mexican', 'mediterranean', 'sandwiches', 'pubs', 'breweries',\
                            'cajun', 'seafood', 'desserts', 'cafes', 'venezuelan', 'coffee', 'comfortfood',\
                            'tradamerican', 'cocktailbars', 'bagels', 'salad', 'burgers', 'korean', 'chicken_wings',\
                            'french', 'fishnchips', 'asianfusion', 'ramen', 'hotdogs', 'filipino', 'beerbar', 'irish',\
                            'mideastern', 'japanese', 'thai', 'chinese', 'vietnamese', 'bbq', 'gastropubs', 'noodles',\
                            'sushi', 'himalayan', 'indpak', 'chickenshop', 'australian', 'brazilian', 'japacurry',\
                            'brewpubs', 'hotdog', 'tex-mex', 'conveyorsushi', 'sportsbars', 'modern_european',\
                            'georgian', 'vegan', 'southern', 'turkish', 'kebab', 'steak', 'scandinavian',\
                            'colombian', 'juicebars', 'cupcakes', 'caribbean', 'creperies', 'falafel', 'greek',\
                            'tacos', 'uzbek', 'latin', 'bakeries', 'hawaiian', 'diners', 'popuprestaurants',\
                            'scottish', 'vegetarian', 'soup', 'kosher', 'dimsum', 'cantonese', 'beergardens',\
                            'szechuan', 'whiskeybars', 'spanish', 'portuguese', 'southafrican', 'hotpot',\
                            'puertorican', 'tapas', 'malaysian', 'singaporean', 'wraps', 'waffles', 'lebanese',\
                            'bubbletea', 'taiwanese', 'raw_food', 'halal', 'gluten_free', 'delis', 'cuban',\
                            'german', 'argentine', 'beer_and_wine', 'gaybars', 'pancakes', 'donuts', 'supperclubs',\
                            'foodtrucks', 'poke', 'peruvian', 'russian', 'british', 'indonesian', 'icecream',\
                            'pastashops', 'shavedice', 'gourmet', 'cideries', 'panasian', 'brasseries','empanadas',\
                            'irish_pubs', 'polish', 'divebars', 'pakistani', 'izakaya', 'hainan', 'ukrainian',\
                            'soulfood', 'cheesesteaks', 'newmexican', 'persian', 'tikibars', 'gelato', 'tuscan',\
                            'themedcafes', 'eatertainment', 'moroccan', 'foodstands', 'salvadoran', 'dominican',\
                            'drivethrubars', 'smokehouse', 'african', 'buffets', 'diyfood', 'streetvendors',\
                            'shanghainese', 'tea', 'cakeshop','teppanyaki', 'haitian', 'afghani', 'customcakes',\
                            'honduran', 'austrian', 'srilankan', 'burmese', 'wineries', 'hkcafe', 'mongolian',\
                            'fondue', 'acaibowls', 'ethiopian', 'meaderies', 'armenian', 'pianobars', 'cheese',\
                            'coffeeroasteries', 'basque', 'somali', 'sicilian', 'distilleries','laotian',\
                            'egyptian', 'pretzels', 'internetcafe', 'macarons', 'cambodian', 'belgian',\
                            'eritrean', 'champagne_bars', 'nicaraguan', 'poutineries', 'speakeasies', 'syrian',\
                            'arabian', 'polynesian', 'catalan', 'trinidadian', 'hungarian', 'calabrian', 'czech',\
                            'guamanian', 'sardinian', 'senegalese','bangladeshi', 'bulgarian', 'kombucha',\
                            'iberian', 'piadina', 'vermouthbars', 'shavedsnow','pubfood', 'bistros','modern_australian',\
                            'rotisserie_chicken', 'slovakian']

In [34]:
len(restaurant_categories_list)

211

### Count categories for each restaurant

In [35]:
# Make columns for all restaurant type categories
categories_expanded_df[restaurant_categories_list]=0

In [36]:
# Explore
categories_expanded_df.head()

Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,...,kombucha,iberian,piadina,vermouthbars,shavedsnow,pubfood,bistros,modern_australian,rotisserie_chicken,slovakian
0,60657.0,Chicago,IL,Cook County,507204.0,41.93643,-87.66141,354,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0,0,0,0,0,0,0,0,0,0
1,60657.0,Chicago,IL,Cook County,507204.0,41.94015,-87.65386,212,4.5,"[{'alias': 'wine_bars', 'title': 'Wine Bars'},...",...,0,0,0,0,0,0,0,0,0,0
2,60657.0,Chicago,IL,Cook County,507204.0,41.93742,-87.6483,60,5.0,"[{'alias': 'pizza', 'title': 'Pizza'}]",...,0,0,0,0,0,0,0,0,0,0
3,60657.0,Chicago,IL,Cook County,507204.0,41.942829,-87.649185,593,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0,0,0,0,0,0,0,0,0,0
4,60657.0,Chicago,IL,Cook County,507204.0,41.947133,-87.646892,35,5.0,"[{'alias': 'mexican', 'title': 'Mexican'}]",...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Explore
categories_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240589 entries, 0 to 240588
Columns: 239 entries, postal_code to slovakian
dtypes: float64(5), int64(219), object(5), uint8(10)
memory usage: 422.6+ MB


In [38]:
# Number of columns
categories_expanded_df.columns

Index(['postal_code', 'City', 'State', 'CountyName', '2021', 'latitude',
       'longitude', 'review_count', 'rating', 'categories',
       ...
       'kombucha', 'iberian', 'piadina', 'vermouthbars', 'shavedsnow',
       'pubfood', 'bistros', 'modern_australian', 'rotisserie_chicken',
       'slovakian'],
      dtype='object', length=239)

In [39]:
# Count categories for each restaurant
# Use iterrow to add 1 to columns where category exists:
for index,row in categories_expanded_df.iterrows():
    types_list = categories_expanded_df["alias_categories"][index].split(",")
    for category in restaurant_categories_list:
        if category in types_list:
            categories_expanded_df[category][index]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_expanded_df[category][index]=1


In [40]:
# Confirm with 'bars' - should be 13166
categories_expanded_df.bars.value_counts()

0    227423
1     13166
Name: bars, dtype: int64

In [41]:
# Confirm with 'pizza' - should be 26642
categories_expanded_df.pizza.value_counts()

0    213947
1     26642
Name: pizza, dtype: int64

In [43]:
# Confirm with 'mexican' - should be 25001
categories_expanded_df.mexican.value_counts()

0    215588
1     25001
Name: mexican, dtype: int64

In [44]:
# Export to csv
categories_expanded_df.to_csv('../Processed_Data/corrected_yelp_api_categories_expanded.csv', index=False)

### Expand stars for each restaurant category type

In [45]:
# Make new dataframe for expansion
stars_categories_expanded_df = categories_expanded_df.copy()

In [46]:
# Add an additional column for each restaurant category
for category in restaurant_categories_list:
    stars_categories_expanded_df[f'{category}_stars']=0.0

In [47]:
len(stars_categories_expanded_df.columns)

450

In [48]:
stars_categories_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240589 entries, 0 to 240588
Columns: 450 entries, postal_code to slovakian_stars
dtypes: float64(216), int64(219), object(5), uint8(10)
memory usage: 809.9+ MB


In [49]:
stars_categories_expanded_df.bars_stars.dtype

dtype('float64')

In [50]:
stars_categories_expanded_df.head()

Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,...,kombucha_stars,iberian_stars,piadina_stars,vermouthbars_stars,shavedsnow_stars,pubfood_stars,bistros_stars,modern_australian_stars,rotisserie_chicken_stars,slovakian_stars
0,60657.0,Chicago,IL,Cook County,507204.0,41.93643,-87.66141,354,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,60657.0,Chicago,IL,Cook County,507204.0,41.94015,-87.65386,212,4.5,"[{'alias': 'wine_bars', 'title': 'Wine Bars'},...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60657.0,Chicago,IL,Cook County,507204.0,41.93742,-87.6483,60,5.0,"[{'alias': 'pizza', 'title': 'Pizza'}]",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60657.0,Chicago,IL,Cook County,507204.0,41.942829,-87.649185,593,4.0,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60657.0,Chicago,IL,Cook County,507204.0,41.947133,-87.646892,35,5.0,"[{'alias': 'mexican', 'title': 'Mexican'}]",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Fill in stars for each restaurant type column if "1" in restaurant types
for index,row in stars_categories_expanded_df.iterrows():
    types_list = stars_categories_expanded_df["alias_categories"][index].split(",")
    for category in restaurant_categories_list:
        if category in types_list:
            stars_categories_expanded_df[f'{category}_stars'][index]=stars_categories_expanded_df['rating'][index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stars_categories_expanded_df[f'{category}_stars'][index]=stars_categories_expanded_df['rating'][index]


In [52]:
# Confirm with 1st entry
stars_categories_expanded_df.bars_stars[0]

4.0

In [53]:
# Confirm with 2nd entry
stars_categories_expanded_df.wine_bars_stars[1]

4.5

In [54]:
# Confirm with 3rd entry
stars_categories_expanded_df.pizza_stars[2]

5.0

In [55]:
# Remove original categories column
stars_categories_expanded_df.drop(columns=['categories'], axis=1, inplace=True)

In [56]:
stars_categories_expanded_df.columns

Index(['postal_code', 'City', 'State', 'CountyName', '2021', 'latitude',
       'longitude', 'review_count', 'rating', 'delivery',
       ...
       'kombucha_stars', 'iberian_stars', 'piadina_stars',
       'vermouthbars_stars', 'shavedsnow_stars', 'pubfood_stars',
       'bistros_stars', 'modern_australian_stars', 'rotisserie_chicken_stars',
       'slovakian_stars'],
      dtype='object', length=449)

In [57]:
columns_list = stars_categories_expanded_df.columns.to_list()
columns_list

['postal_code',
 'City',
 'State',
 'CountyName',
 '2021',
 'latitude',
 'longitude',
 'review_count',
 'rating',
 'delivery',
 'pickup',
 'restaurant_reservation',
 'price_value_1.0',
 'price_value_2.0',
 'price_value_3.0',
 'price_value_4.0',
 'stars_0.0',
 'stars_1.0',
 'stars_1.5',
 'stars_2.0',
 'stars_2.5',
 'stars_3.0',
 'stars_3.5',
 'stars_4.0',
 'stars_4.5',
 'stars_5.0',
 'alias_categories',
 'bars',
 'newamerican',
 'wine_bars',
 'italian',
 'tapasmallplates',
 'pizza',
 'breakfast_brunch',
 'mexican',
 'mediterranean',
 'sandwiches',
 'pubs',
 'breweries',
 'cajun',
 'seafood',
 'desserts',
 'cafes',
 'venezuelan',
 'coffee',
 'comfortfood',
 'tradamerican',
 'cocktailbars',
 'bagels',
 'salad',
 'burgers',
 'korean',
 'chicken_wings',
 'french',
 'fishnchips',
 'asianfusion',
 'ramen',
 'hotdogs',
 'filipino',
 'beerbar',
 'irish',
 'mideastern',
 'japanese',
 'thai',
 'chinese',
 'vietnamese',
 'bbq',
 'gastropubs',
 'noodles',
 'sushi',
 'himalayan',
 'indpak',
 'chicke

In [58]:
# Export to csv
stars_categories_expanded_df.to_csv('../Processed_Data/corrected_yelp_api_stars_categories_expanded.csv', index=False)

### Export to local SQL for grouping by zip code

In [59]:
# Export to local SQL

# Dependencies
import psycopg2
import sqlalchemy as sqla
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Export
from config import pSQL
# DB password in config.py file

db_path = f'postgresql://postgres:{pSQL}@127.0.0.1:5432/Yelp2' # last item is name of db in the server group
# Create database engine
engine = create_engine(db_path)

In [60]:
# Load last dataframe into local SQL
stars_categories_expanded_df.to_sql(name='Correct_Stars_Cat_Expanded',con=engine)

#### Additional Processing in SQL:
- grouped restaurants by zip code

**Tables generated**
1. summary table (rating averaged, all dummy columns counted) by zip code without expanded categories and category_stars columns
2. count of restaurant types by zip code
3. sum of stars for each category_star column by zip code

In [63]:
# Check number of unique zip codes
len(stars_categories_expanded_df.postal_code.unique())
# This result matches SQL after merging data on zip codes

13988

In [65]:
# Load in SQL tables

connection = engine.connect()

zip_summary_df = pd.read_sql("select * from \"corr_zip_summary\"", connection)

zip_categories_df = pd.read_sql("select * from \"corr_zip_categories_sum\"", connection)

zip_stars_df = pd.read_sql("select * from \"corr_zip_stars_sum\"", connection)

In [66]:
# Display zip_summary
zip_summary_df.head()

Unnamed: 0,postal_code,City,State,CountyName,2021,total_restaurants,total_reviews,avg_rating,total_delivery,total_pickup,...,total_0.0_stars,total_1.0_stars,total_1.5_stars,total_2.0_stars,total_2.5_stars,total_3.0_stars,total_3.5_stars,total_4.0_stars,total_4.5_stars,total_5.0_stars
0,1267.0,Williamstown,MA,Berkshire County,357029.0,23,1347.0,3.782609,1.0,2.0,...,0.0,1.0,0.0,0.0,0.0,2.0,5.0,11.0,3.0,1.0
1,2134.0,Boston,MA,Suffolk County,620830.0,1,2.0,4.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2886.0,Warwick,RI,Kent County,315398.0,1,28.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3079.0,Salem,NH,Rockingham County,458081.0,1,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3276.0,Northfield,NH,Merrimack County,283171.0,1,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Display zip_categories
zip_categories_df.head()

Unnamed: 0,postal_code,bars,newamerican,wine_bars,italian,tapasmallplates,pizza,breakfast_brunch,mexican,mediterranean,...,kombucha,iberian,piadina,vermouthbars,shavedsnow,pubfood,bistros,modern_australian,rotisserie_chicken,slovakian
0,1267.0,2.0,0.0,0.0,0.0,0.0,4.0,3.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3079.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3276.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Display zip_stars (sum of all stars for each category_star column)
zip_stars_df.head()

Unnamed: 0,postal_code,bars_stars,newamerican_stars,wine_bars_stars,italian_stars,tapasmallplates_stars,pizza_stars,breakfast_brunch_stars,mexican_stars,mediterranean_stars,...,kombucha_stars,iberian_stars,piadina_stars,vermouthbars_stars,shavedsnow_stars,pubfood_stars,bistros_stars,modern_australian_stars,rotisserie_chicken_stars,slovakian_stars
0,1267.0,7.0,0.0,0.0,0.0,0.0,16.0,11.0,11.5,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3079.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3276.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating final dataframes for ML notebooks

In [69]:
# Reset indexes to postal code
zip_summary_df.set_index(["postal_code"], inplace=True)
zip_categories_df.set_index(["postal_code"], inplace=True)
zip_stars_df.set_index(["postal_code"], inplace=True)

In [74]:
zip_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 13988 entries, 1267.0 to 99833.0
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   City                    13988 non-null  object 
 1   State                   13988 non-null  object 
 2   CountyName              13988 non-null  object 
 3   2021                    13988 non-null  float64
 4   total_restaurants       13988 non-null  int64  
 5   total_reviews           13988 non-null  float64
 6   avg_rating              13988 non-null  float64
 7   total_delivery          13988 non-null  float64
 8   total_pickup            13988 non-null  float64
 9   total_rest_reservation  13988 non-null  float64
 10  total_price_1           13988 non-null  float64
 11  total_price_2           13988 non-null  float64
 12  total_price_3           13988 non-null  float64
 13  total_price_4           13988 non-null  float64
 14  total_0.0_stars         13988

In [75]:
zip_categories_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 13988 entries, 1267.0 to 99833.0
Columns: 211 entries, bars to slovakian
dtypes: float64(211)
memory usage: 22.6 MB


In [76]:
zip_stars_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 13988 entries, 1267.0 to 99833.0
Columns: 211 entries, bars_stars to slovakian_stars
dtypes: float64(211)
memory usage: 22.6 MB


In [77]:
# Create dataframe with averages of stars for each category_star column
zip_avg_stars_df = zip_stars_df/zip_categories_df.values[:,:]

In [78]:
zip_avg_stars_df.head()

Unnamed: 0_level_0,bars_stars,newamerican_stars,wine_bars_stars,italian_stars,tapasmallplates_stars,pizza_stars,breakfast_brunch_stars,mexican_stars,mediterranean_stars,sandwiches_stars,...,kombucha_stars,iberian_stars,piadina_stars,vermouthbars_stars,shavedsnow_stars,pubfood_stars,bistros_stars,modern_australian_stars,rotisserie_chicken_stars,slovakian_stars
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1267.0,3.5,,,,,4.0,3.666667,3.833333,4.0,1.0,...,,,,,,,,,,
2134.0,,,,,,,,,,,...,,,,,,,,,,
2886.0,,,,,,,,,,,...,,,,,,,,,,
3079.0,,,,,,,,,,,...,,,,,,,,,,
3276.0,,2.0,,,,,,,,,...,,,,,,,,,,


In [79]:
zip_avg_stars_df.bars_stars

postal_code
1267.0     3.500000
2134.0          NaN
2886.0          NaN
3079.0          NaN
3276.0          NaN
             ...   
99654.0    3.833333
99669.0    3.500000
99672.0         NaN
99694.0    5.000000
99833.0         NaN
Name: bars_stars, Length: 13988, dtype: float64

In [82]:
# Export categories and avg stars

zip_categories_df.to_csv('../Processed_Data/Final_zip_categories_sum.csv')

zip_avg_stars_df.to_csv('../Processed_Data/Final_zip_cat_stars_avg.csv')

In [83]:
# COUNT total number of different TYPES of restaurants (diversity) for each zip code
# Make a new series
new_data = []

for index,row in zip_categories_df.iterrows():
    counter = 0
    for i in range(len(zip_categories_df.columns)):
        if row[i] != 0:
            counter +=1
    new_data.append(counter)

In [84]:
new_data

[23,
 0,
 1,
 3,
 1,
 21,
 9,
 19,
 11,
 5,
 7,
 8,
 8,
 9,
 2,
 8,
 17,
 10,
 14,
 11,
 2,
 25,
 12,
 11,
 10,
 8,
 2,
 4,
 4,
 2,
 7,
 2,
 15,
 1,
 1,
 11,
 5,
 4,
 12,
 6,
 14,
 1,
 4,
 1,
 7,
 3,
 2,
 0,
 6,
 9,
 7,
 9,
 1,
 8,
 5,
 13,
 20,
 2,
 2,
 1,
 6,
 4,
 7,
 4,
 10,
 7,
 8,
 15,
 5,
 7,
 2,
 0,
 12,
 3,
 1,
 6,
 17,
 2,
 1,
 4,
 11,
 2,
 17,
 2,
 26,
 3,
 3,
 5,
 13,
 2,
 18,
 21,
 8,
 2,
 1,
 4,
 7,
 1,
 33,
 8,
 9,
 66,
 2,
 2,
 34,
 2,
 0,
 3,
 19,
 42,
 1,
 5,
 18,
 11,
 40,
 24,
 0,
 23,
 10,
 9,
 11,
 23,
 3,
 3,
 6,
 1,
 22,
 5,
 2,
 1,
 4,
 3,
 15,
 2,
 3,
 24,
 8,
 3,
 8,
 23,
 19,
 4,
 16,
 11,
 45,
 2,
 7,
 4,
 5,
 3,
 15,
 5,
 19,
 12,
 1,
 2,
 3,
 4,
 9,
 3,
 2,
 2,
 9,
 3,
 7,
 11,
 12,
 6,
 8,
 19,
 3,
 10,
 8,
 2,
 1,
 9,
 8,
 2,
 64,
 12,
 2,
 25,
 14,
 5,
 6,
 4,
 1,
 3,
 3,
 1,
 6,
 15,
 5,
 5,
 2,
 0,
 12,
 6,
 0,
 1,
 10,
 18,
 26,
 3,
 22,
 8,
 25,
 3,
 76,
 96,
 87,
 53,
 43,
 44,
 56,
 78,
 68,
 70,
 70,
 94,
 79,
 81,
 68,
 69,
 85,
 22,
 71,
 80,
 

In [86]:
# Check out why Massachusetts restaurant came up with 0 categories
MA_rest = api_expand_df[api_expand_df['postal_code']==2134.0]

In [87]:
MA_rest

Unnamed: 0,postal_code,City,State,CountyName,2021,latitude,longitude,review_count,rating,categories,...,stars_0.0,stars_1.0,stars_1.5,stars_2.0,stars_2.5,stars_3.0,stars_3.5,stars_4.0,stars_4.5,stars_5.0
113711,2134.0,Boston,MA,Suffolk County,620830.0,-33.88631,151.09624,2,4.5,"[{'alias': 'restaurants', 'title': 'Restaurant...",...,0,0,0,0,0,0,0,0,1,0


In [89]:
MA_rest.categories.to_list()

["[{'alias': 'restaurants', 'title': 'Restaurants'}]"]

In [90]:
# Append restaurant type count to zip_summary table
zip_summary_df["num_rest_types"] = new_data

In [92]:
zip_summary_df.head()

Unnamed: 0_level_0,City,State,CountyName,2021,total_restaurants,total_reviews,avg_rating,total_delivery,total_pickup,total_rest_reservation,...,total_1.0_stars,total_1.5_stars,total_2.0_stars,total_2.5_stars,total_3.0_stars,total_3.5_stars,total_4.0_stars,total_4.5_stars,total_5.0_stars,num_rest_types
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1267.0,Williamstown,MA,Berkshire County,357029.0,23,1347.0,3.782609,1.0,2.0,0.0,...,1.0,0.0,0.0,0.0,2.0,5.0,11.0,3.0,1.0,23
2134.0,Boston,MA,Suffolk County,620830.0,1,2.0,4.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2886.0,Warwick,RI,Kent County,315398.0,1,28.0,2.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3079.0,Salem,NH,Rockingham County,458081.0,1,1.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
3276.0,Northfield,NH,Merrimack County,283171.0,1,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [93]:
zip_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 13988 entries, 1267.0 to 99833.0
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   City                    13988 non-null  object 
 1   State                   13988 non-null  object 
 2   CountyName              13988 non-null  object 
 3   2021                    13988 non-null  float64
 4   total_restaurants       13988 non-null  int64  
 5   total_reviews           13988 non-null  float64
 6   avg_rating              13988 non-null  float64
 7   total_delivery          13988 non-null  float64
 8   total_pickup            13988 non-null  float64
 9   total_rest_reservation  13988 non-null  float64
 10  total_price_1           13988 non-null  float64
 11  total_price_2           13988 non-null  float64
 12  total_price_3           13988 non-null  float64
 13  total_price_4           13988 non-null  float64
 14  total_0.0_stars         13988

In [94]:
# Export to csv
zip_summary_df.to_csv('../Processed_Data/Final_yelp_zip_summary.csv')