This notebook takes the merged Yelp and Census Tract data frame and counts the restaurants in each price range (including those whose price is missing) for each Census tract. 
Using GEOID (the census tract identifier), I merged in the household income distribution for each census tract (in percentages), then calculated the actual number of households in each income range.


In [1]:
# Setting up modules
import geopandas as gpd
from geopandas import GeoDataFrame
import numpy as np
import pandas as pd
from shapely.geometry import Point
import matplotlib.pylab as plt

# Input locations, expressed relative to the notebook's working directory
data_path = "./data"
Yelp_BK_path = f"{data_path}/Yelp/BK/"
Yelp_MN_path = f"{data_path}/Yelp/MN/"

# Machine-specific checkout location; none of the visible cells read it
andrew_path = '/Users/andrewnorris/restaurant-scene-ads/'

In [2]:
# Load the BK and MN Yelp shapefiles (one per borough folder) as GeoDataFrames
BK_Yelp, MN_Yelp = (
    gpd.read_file(folder + filename)
    for folder, filename in (
        (Yelp_BK_path, "BK_Yelp_CensusTract_NTA.shp"),
        (Yelp_MN_path, "MN_Yelp_CensusTract_NTA.shp"),
    )
)

# Preview the first MN rows
MN_Yelp.head()

Unnamed: 0,id,alias,name,is_closed,review_cou,rating,price,categories,latitude,longitude,...,NTAName,NTA_CODE,0-25k,25-50k,50-75k,75-100k,100-125k,125-150k,> 150k,geometry
0,ksksxd8J2SIs97ccPFV75A,brown-sugar-restaurant-new-york,Brown Sugar Restaurant,0,204,3.5,$$,"cuban,seafood,steak",40.869926,-73.915466,...,Marble Hill-Inwood,MN01,0.32,0.21,0.15,0.14,0.07,0.04,0.07,POINT (-73.91546600000001 40.869926)
1,IHGm6huN_Z48KdorBjztSQ,guacamole-taqueria-new-york,Guacamole Taqueria,0,122,3.5,$$,mexican,40.86952,-73.91674,...,Marble Hill-Inwood,MN01,0.32,0.21,0.15,0.14,0.07,0.04,0.07,POINT (-73.91674 40.86952)
2,rUBFgZU3QTk7IOgpccE8Og,indian-road-cafe-new-york,Indian Road Cafe,0,580,4.0,$$,"coffee,newamerican,bars",40.872915,-73.918441,...,Marble Hill-Inwood,MN01,0.32,0.21,0.15,0.14,0.07,0.04,0.07,POINT (-73.91844082 40.87291516)
3,AflnoQBr01QmQIq5hVH-iA,la-essencia-restaurant-new-york,La Essencia Restaurant,0,13,3.5,$$,"mexican,dominican",40.87087,-73.91505,...,Marble Hill-Inwood,MN01,0.32,0.21,0.15,0.14,0.07,0.04,0.07,POINT (-73.91504999999999 40.87087)
4,8o-B_1q4XB48CmdaXdm2KQ,raices-new-york,Raices,0,127,3.5,$$,"bars,latin",40.86633,-73.92016,...,Marble Hill-Inwood,MN01,0.32,0.21,0.15,0.14,0.07,0.04,0.07,POINT (-73.92016 40.86633)


In [26]:
# Re-wrap both GeoDataFrames as plain pandas DataFrames
MN_Yelp_df, BK_Yelp_df = pd.DataFrame(MN_Yelp), pd.DataFrame(BK_Yelp)

# Uncomment to inspect the column names
#BK_Yelp_df.columns

In [5]:
# BK is keyed on 'NTACode' and MN on 'NTA_CODE' because the joins that were called
# in the NYC Census Tracts notebook were different.
# The dollar-sign price label is treated as a category: count the restaurants that
# fall under each (NTA, price) pair and keep the result as a one-column frame.
BK_price_counts = BK_Yelp_df.groupby(['NTACode', 'price'])['price'].count().to_frame()
MN_price_counts = MN_Yelp_df.groupby(["NTA_CODE", "price"])["price"].count().to_frame()

# Text preview of the first ten (NTA, price) rows for each borough
print(BK_price_counts.head(10), MN_price_counts.head(10), sep="\n")

                 price
NTACode price         
BK09    $           29
        $$          46
        $$$          6
        MISSING     16
BK17    $           35
        $$          54
        $$$         10
        $$$$         2
        MISSING     21
BK19    $           22
                  price
NTA_CODE price         
MN01     $           35
         $$          41
         MISSING     25
MN03     $           57
         $$          45
         $$$          1
         $$$$         1
         MISSING     36
MN04     $           50
         $$          31


In [6]:
# Pivot the 'price' index level into columns (one per price label), filling
# (NTA, price) pairs that never occur with 0, then turn the NTA code back into
# an ordinary column.
# NOTE: both names are rebound here, so this cell is meant to run once after the
# groupby cell; re-running it alone finds no 'price' index level to unstack.
BK_price_counts, MN_price_counts = (
    counts.unstack('price', fill_value=0).reset_index()
    for counts in (BK_price_counts, MN_price_counts)
)
BK_price_counts.head(10)

Unnamed: 0_level_0,NTACode,price,price,price,price,price
price,Unnamed: 1_level_1,$,$$,$$$,$$$$,MISSING
0,BK09,29,46,6,0,16
1,BK17,35,54,10,2,21
2,BK19,22,17,4,1,14
3,BK21,24,12,0,0,14
4,BK23,6,4,2,0,4
5,BK25,29,41,10,1,24
6,BK26,17,3,0,0,12
7,BK27,16,22,1,0,14
8,BK28,79,49,1,0,50
9,BK29,42,29,1,1,31


In [8]:
# Flatten the two-level column index left by unstack() into plain names.
# Columns are renamed by their price label instead of by position: a positional
# assignment of six names fails with a length mismatch as soon as one borough has
# no restaurants in some price tier, and would attach names to the wrong columns
# if the set of labels ever changed.
PRICE_COLUMN_NAMES = {
    '$': 'price_1',
    '$$': 'price_2',
    '$$$': 'price_3',
    '$$$$': 'price_4',
    'MISSING': 'MISSING',
}
FLAT_PRICE_COLUMNS = ['NTACode', 'price_1', 'price_2', 'price_3', 'price_4', 'MISSING']


def flatten_price_columns(counts):
    """Return a copy of `counts` with the columns NTACode, price_1..price_4, MISSING.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame whose first column holds the NTA code and whose remaining columns
        are price tallies, labelled either by ('price', <label>) tuples (as left
        by unstack) or by already-flattened names.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the columns in FLAT_PRICE_COLUMNS; a price tier
        that is absent from `counts` is added as an all-zero column.

    Raises
    ------
    ValueError
        If a price column carries a label that is not a known price tier.
    """
    # Take the innermost level of tuple labels; plain labels pass through, which
    # keeps the cell safe to re-run on an already-flattened frame.
    labels = [col[-1] if isinstance(col, tuple) else col for col in counts.columns[1:]]
    names = [PRICE_COLUMN_NAMES.get(label, label) for label in labels]

    unexpected = [name for name in names if name not in FLAT_PRICE_COLUMNS[1:]]
    if unexpected:
        raise ValueError("Unexpected price labels: {}".format(unexpected))

    flat = counts.copy()
    # The identifier column is called 'NTACode' for both boroughs from here on
    # (the MN frame arrives with 'NTA_CODE').
    flat.columns = ['NTACode'] + names
    return flat.reindex(columns=FLAT_PRICE_COLUMNS, fill_value=0)


BK_price_counts = flatten_price_columns(BK_price_counts)
MN_price_counts = flatten_price_columns(MN_price_counts)
MN_price_counts.head(10)

Unnamed: 0,NTACode,price_1,price_2,price_3,price_4,MISSING
0,MN01,35,41,0,0,25
1,MN03,57,45,1,1,36
2,MN04,50,31,0,0,27
3,MN06,15,14,0,0,15
4,MN09,61,74,2,1,23
5,MN11,35,64,5,1,22
6,MN12,76,176,14,2,64
7,MN13,158,328,63,25,183
8,MN14,32,68,24,10,27
9,MN15,112,263,37,9,90


In [9]:
# Build new frames in which each absolute count is replaced by its share of the
# NTA's row total. Despite the `_pct` suffix the values are fractions (each row
# sums to 1), not values out of 100.
def to_row_shares(counts):
    """Return a copy of `counts` with every column after the first divided by its row total.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame whose first column is an identifier and whose remaining columns
        are numeric tallies.

    Returns
    -------
    pandas.DataFrame
        New frame with the identifier column unchanged and the tallies replaced
        by floating-point shares. `counts` itself is not modified.
    """
    identifier = counts.iloc[:, :1]
    tallies = counts.iloc[:, 1:]
    shares = tallies.div(tallies.sum(axis=1), axis=0)
    # Assemble a fresh frame instead of writing floats into the integer columns
    # through .iloc, which recent pandas versions deprecate (incompatible dtype).
    return pd.concat([identifier, shares], axis=1)


BK_price_counts_pct = to_row_shares(BK_price_counts)
MN_price_counts_pct = to_row_shares(MN_price_counts)

# Checking that nothing has messed up
#print(BK_price_counts_pct['NTACode'].nunique())
#print(BK_price_counts_pct.head())

In [28]:
# Inspect the column names of the BK frame.
# NOTE(review): this cell carries execution count 28 while the rename cell that
# follows it carries 27, and the saved output lists the pct_* names - so it was
# last run after the rename. On a fresh top-to-bottom run it would show the
# pre-rename names instead.
BK_Yelp_df.columns

Index(['id', 'alias', 'name', 'is_closed', 'review_cou', 'rating', 'price',
       'categories', 'latitude', 'longitude', 'address', 'city', 'zipcode',
       'state', 'country', 'index_righ', 'NTACode', 'NTAName', 'Households',
       'pct_0_25k', 'pct_25k_50k', 'pct_50k_75k', 'pct_75k_100k',
       'pct_100k_125k', 'pct_125k_150k', 'pct_> 150k', 'geometry'],
      dtype='object')

In [27]:
# Give the income-share columns a uniform `pct_` prefix in both boroughs.
# The MN shapefile labels the middle brackets '25-50k', '50-75k', ... (see the
# MN_Yelp.head() output), whereas the mapping used to list only the '25k-50k',
# '50k-75k', ... spelling. rename() silently ignores keys that are not present,
# so five of the seven MN income columns were left un-renamed. Both spellings
# are mapped here; the 'NNk-NNk' keys are kept for the BK frame (presumably its
# original spelling - TODO confirm against the BK shapefile).
INCOME_SHARE_RENAMES = {
    '0-25k': 'pct_0_25k',
    '25k-50k': 'pct_25k_50k',
    '25-50k': 'pct_25k_50k',
    '50k-75k': 'pct_50k_75k',
    '50-75k': 'pct_50k_75k',
    '75k-100k': 'pct_75k_100k',
    '75-100k': 'pct_75k_100k',
    '100k-125k': 'pct_100k_125k',
    '100-125k': 'pct_100k_125k',
    '125k-150k': 'pct_125k_150k',
    '125-150k': 'pct_125k_150k',
    '> 150k': 'pct_> 150k',
}
BK_Yelp_df = BK_Yelp_df.rename(columns=INCOME_SHARE_RENAMES)
MN_Yelp_df = MN_Yelp_df.rename(columns=INCOME_SHARE_RENAMES)

In [30]:
# Cannot do this for MN because we do not have the total number of households 
# So I don't think we will need this chunk
#BK_Yelp_df['num_0_25k'] = BK_Yelp_df['Households']*BK_Yelp_df['pct_0_25k']
#BK_Yelp_df['num_25k_50k'] = BK_Yelp_df['Households']*BK_Yelp_df['pct_25k_50k']
#BK_Yelp_df['num_50k_75k'] = BK_Yelp_df['Households']*BK_Yelp_df['pct_50k_75k']
#BK_Yelp_df['num_75k_100k'] = BK_Yelp_df['Households']*BK_Yelp_df['pct_75k_100k']
#BK_Yelp_df['num_100k_125k'] = BK_Yelp_df['Households']*BK_Yelp_df['pct_100k_125k']
#BK_Yelp_df['num_> 150k'] = BK_Yelp_df['Households']*BK_Yelp_df['pct_> 150k']
#BK_Yelp_df.head()

Unnamed: 0,id,alias,name,is_closed,review_cou,rating,price,categories,latitude,longitude,...,pct_100k_125k,pct_125k_150k,pct_> 150k,geometry,num_0_25k,num_25k_50k,num_50k_75k,num_75k_100k,num_100k_125k,num_> 150k
0,6gzQLjzJk25ePm_JS7ZAug,esme-brooklyn-2,Esme,0,328,4.5,$$,newamerican|cocktailbars,40.733203,-73.954967,...,0.105529,0.086914,0.182642,POINT (-73.95496677 40.73320339),3248.0,2220.0,2464.0,2206.0,1712.0,2963.0
1,Swjm9no7DRqhThLlf0EHng,sama-street-brooklyn-2,Sama Street,0,58,4.5,$$,cocktailbars|panasian|tapasmallplates,40.73287,-73.95448,...,0.105529,0.086914,0.182642,POINT (-73.95448 40.73287),3248.0,2220.0,2464.0,2206.0,1712.0,2963.0
2,utM-5navObsVA5sCRHobzA,madre-brooklyn-2,Madre,0,38,5.0,MISSING,newamerican,40.73311,-73.95798,...,0.105529,0.086914,0.182642,POINT (-73.95798000000001 40.73311),3248.0,2220.0,2464.0,2206.0,1712.0,2963.0
3,L9SuMN3UsGipopWOe3pr9w,chiko-brooklyn-2,Chiko,0,36,5.0,MISSING,japanese|sushi,40.7319,-73.95422,...,0.105529,0.086914,0.182642,POINT (-73.95421999999998 40.7319),3248.0,2220.0,2464.0,2206.0,1712.0,2963.0
4,vyKBwzRdNX4yiJDIFv37iw,oxomoco-brooklyn-2,Oxomoco,0,247,4.0,$$$,mexican,40.72991,-73.95548,...,0.105529,0.086914,0.182642,POINT (-73.95548000000002 40.7299099),3248.0,2220.0,2464.0,2206.0,1712.0,2963.0


In [21]:
# bkntainc = bk[['NTACode','NTAName', 'Households', 'pct_0-25k',
#        'pct_25k-50k', 'pct_50k-75k', 'pct_75k-100k', 'pct_100k-125k',
#        'pct_125k-150k', 'pct_> 150k', 'latitude','longitude','geometry', 'num_0-25k', 'num_25k-50k',
#        'num_50k-75k', 'num_75k-100k', 'num_100k-125k', 'num_125k-150k',
#        'num_> 150k']].copy()
# bkntainc.head()

Unnamed: 0,NTACode,NTAName,Households,pct_0-25k,pct_25k-50k,pct_50k-75k,pct_75k-100k,pct_100k-125k,pct_125k-150k,pct_> 150k,latitude,longitude,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.733203,-73.954967,POINT (-73.95496677 40.73320339),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
1,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.73287,-73.95448,POINT (-73.95448 40.73287),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
2,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.73311,-73.95798,POINT (-73.95798000000001 40.73311),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
3,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.7319,-73.95422,POINT (-73.95421999999998 40.7319),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
4,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.72991,-73.95548,POINT (-73.95548000000002 40.7299099),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0


In [22]:
# bkntainc = bkntainc.drop_duplicates('NTACode')
# bkntainc.head()

Unnamed: 0,NTACode,NTAName,Households,pct_0-25k,pct_25k-50k,pct_50k-75k,pct_75k-100k,pct_100k-125k,pct_125k-150k,pct_> 150k,latitude,longitude,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.733203,-73.954967,POINT (-73.95496677 40.73320339),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
183,BK90,East Williamsburg,16075,0.284044,0.165723,0.12367,0.116579,0.087403,0.06507,0.157512,40.72103,-73.94074,POINT (-73.94074000000001 40.72103),4566.0,2664.0,1988.0,1874.0,1405.0,1046.0,2532.0
348,BK77,Bushwick North,19367,0.290029,0.207415,0.169515,0.106108,0.084164,0.053441,0.089327,40.707587,-73.933271,POINT (-73.93327099999998 40.707587),5617.0,4017.0,3283.0,2055.0,1630.0,1035.0,1730.0
602,BK78,Bushwick South,26616,0.349451,0.207582,0.152991,0.101743,0.068643,0.044222,0.075368,40.709139,-73.937227,POINT (-73.93722679999998 40.7091391),9301.0,5525.0,4072.0,2708.0,1827.0,1177.0,2006.0
738,BK73,North Side-South Side,23862,0.192566,0.167337,0.113654,0.105859,0.088718,0.070321,0.261546,40.721189,-73.957,POINT (-73.95700045 40.72118902),4595.0,3993.0,2712.0,2526.0,2117.0,1678.0,6241.0


In [23]:
# This is problematic because this is now a point shapefile. Hence, it was corrupted somewhere down the line, because
# NTA shapefiles should be polygons, not points
#bk_price_inc = bkcounts.merge(bkntainc, how='inner', on='NTACode')
#bk_price_inc.head()

Unnamed: 0,NTACode,price_1,price_2,price_3,price_4,MISSING,NTAName,Households,pct_0-25k,pct_25k-50k,...,latitude,longitude,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,BK09,29,46,6,0,16,Brooklyn Heights-Cobble Hill,11115,0.11507,0.108052,...,40.699268,-73.992311,POINT (-73.99231090000001 40.69926820000001),1279.0,1201.0,777.0,1231.0,1042.0,840.0,4745.0
1,BK17,35,54,10,2,21,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,26150,0.253805,0.2026,...,40.57709,-73.952661,POINT (-73.95266059999999 40.57709038),6637.0,5298.0,3965.0,2832.0,2384.0,1692.0,3342.0
2,BK19,22,17,4,1,14,Brighton Beach,14557,0.395823,0.185272,...,40.57672,-73.96427,POINT (-73.96427 40.57672),5762.0,2697.0,2311.0,990.0,681.0,521.0,1595.0
3,BK21,24,12,0,0,14,Seagate-Coney Island,11236,0.478907,0.203364,...,40.578805,-73.983811,POINT (-73.983811 40.578805),5381.0,2285.0,1447.0,711.0,642.0,240.0,530.0
4,BK23,6,4,2,0,4,West Brighton,8401,0.377217,0.21307,...,40.579194,-73.980654,POINT (-73.9806538 40.5791945),3169.0,1790.0,1250.0,962.0,428.0,266.0,536.0


In [24]:
#bk_price_inc.shape

(51, 25)

In [25]:
#bk_price_inc.to_csv(data_path + '/BK_YelpPriceFreq_NTAIncDist.csv')


In [26]:
# # Creating another csv that has the yelp price rating broken down by percentage
# bk_pct_price_inc = bk_counts_pct.merge(bkntainc, how='inner', on='NTACode')
# bk_pct_price_inc.to_csv(data_path + '/BK_YelpPriceFreqPct_NTAIncDist.csv')
# bk_pct_price_inc.shape

(51, 25)

Actually, what we want is, for each NTA, to find out how many restaurants of each price range there are and how heavily reviewed those restaurants are, while preserving the NTA's geometry type: polygon.  
Hence, we need to merge the price, review, and life expectancy counts with the NTA shapefile. 