This notebook takes the merged Yelp and Census Tract data frame and counts the restaurants in each Yelp price range (including those whose price is missing) for each Neighborhood Tabulation Area (NTA, identified by `NTACode`).
Using GEOID (the census tract identifier), I merged household income distribution data for each census tract (in percentages), and then calculated the actual number of households in each income range.


In [18]:
# Setting up modules
import geopandas as gpd
from geopandas import GeoDataFrame
import numpy as np
import pandas as pd
from shapely.geometry import Point
import matplotlib.pylab as plt

# Locations of the input/output data used by the cells below.
# `andrew_path` is an absolute, machine-specific path; it is not referenced by
# the cells visible in this part of the notebook.
andrew_path = '/Users/andrewnorris/restaurant-scene-ads/'
data_path = "./data"
# Folder holding the Brooklyn Yelp files, relative to `data_path`.
Yelp_BK_path = "".join([data_path, "/Yelp/BK/"])


In [19]:
# Read the Brooklyn shapefile that already joins Yelp restaurants with
# census-tract / NTA attributes (one row per restaurant in the preview below).
bk_shapefile = Yelp_BK_path + "BK_Yelp_CensusTract_NTA.shp"
bk = gpd.read_file(bk_shapefile)
bk.head()

Unnamed: 0,id,alias,name,is_closed,review_cou,rating,price,categories,latitude,longitude,...,NTAName,Households,0-25k,25k-50k,50k-75k,75k-100k,100k-125k,125k-150k,> 150k,geometry
0,6gzQLjzJk25ePm_JS7ZAug,esme-brooklyn-2,Esme,0,328,4.5,$$,newamerican|cocktailbars,40.733203,-73.954967,...,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,POINT (-73.95496677 40.73320339)
1,Swjm9no7DRqhThLlf0EHng,sama-street-brooklyn-2,Sama Street,0,58,4.5,$$,cocktailbars|panasian|tapasmallplates,40.73287,-73.95448,...,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,POINT (-73.95448 40.73287)
2,utM-5navObsVA5sCRHobzA,madre-brooklyn-2,Madre,0,38,5.0,MISSING,newamerican,40.73311,-73.95798,...,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,POINT (-73.95798000000001 40.73311)
3,L9SuMN3UsGipopWOe3pr9w,chiko-brooklyn-2,Chiko,0,36,5.0,MISSING,japanese|sushi,40.7319,-73.95422,...,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,POINT (-73.95421999999998 40.7319)
4,vyKBwzRdNX4yiJDIFv37iw,oxomoco-brooklyn-2,Oxomoco,0,247,4.0,$$$,mexican,40.72991,-73.95548,...,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,POINT (-73.95548000000002 40.7299099)


In [20]:
# Re-wrap the frame returned by gpd.read_file as a plain pandas DataFrame.
# The columns (including 'geometry') are kept; `bk` is rebound in place, so
# this cell changes the type of the object the previous cell displayed.
bk = pd.DataFrame(bk)

In [21]:
# Inspect the available columns before aggregating.
bk.columns

Index(['id', 'alias', 'name', 'is_closed', 'review_cou', 'rating', 'price',
       'categories', 'latitude', 'longitude', 'address', 'city', 'zipcode',
       'state', 'country', 'index_righ', 'NTACode', 'NTAName', 'Households',
       '0-25k', '25k-50k', '50k-75k', '75k-100k', '100k-125k', '125k-150k',
       '> 150k', 'geometry'],
      dtype='object')

In [22]:
# Count restaurants for every (NTACode, price) pair. The result is a
# one-column frame named 'price', indexed by the two grouping keys.
# Pairs that never occur are simply absent here.
price_by_nta = bk.groupby(['NTACode', 'price'])['price']
bkcounts = price_by_nta.count().to_frame()
bkcounts.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,price
NTACode,price,Unnamed: 2_level_1
BK09,$,29
BK09,$$,46
BK09,$$$,6
BK09,MISSING,16
BK17,$,35
BK17,$$,54
BK17,$$$,10
BK17,$$$$,2
BK17,MISSING,21
BK19,$,22


In [23]:
# Pivot the 'price' index level into columns (one column per price tier),
# filling combinations that never occurred with 0, then turn NTACode back
# into an ordinary column. The column header is two-level after this step.
price_wide = bkcounts.unstack(level='price', fill_value=0)
bkcounts = price_wide.reset_index()
bkcounts.head(10)

Unnamed: 0_level_0,NTACode,price,price,price,price,price
price,Unnamed: 1_level_1,$,$$,$$$,$$$$,MISSING
0,BK09,29,46,6,0,16
1,BK17,35,54,10,2,21
2,BK19,22,17,4,1,14
3,BK21,24,12,0,0,14
4,BK23,6,4,2,0,4
5,BK25,29,41,10,1,24
6,BK26,17,3,0,0,12
7,BK27,16,22,1,0,14
8,BK28,79,49,1,0,50
9,BK29,42,29,1,1,31


In [24]:
# Flatten the two-level column header left by unstack()/reset_index() into
# single-level names: 'NTACode', 'price_$', 'price_$$', ..., 'MISSING'.
#
# The names are derived from the header itself rather than assigned from a
# fixed positional list, so a price tier that is absent from the data (or a
# different tier order) can neither raise a length-mismatch error nor put the
# wrong label on a column. The isinstance guard keeps the cell safe to re-run
# once the header has already been flattened.
if isinstance(bkcounts.columns, pd.MultiIndex):
    flat_names = []
    for group, tier in bkcounts.columns:
        if not tier:
            # ('NTACode', '') -> 'NTACode'
            flat_names.append(group)
        elif tier == 'MISSING':
            # Keep the bare 'MISSING' label used by the rest of the notebook.
            flat_names.append(tier)
        else:
            # ('price', '$$') -> 'price_$$'
            flat_names.append('_'.join((group, tier)))
    bkcounts.columns = flat_names
bkcounts.head(10)

Unnamed: 0,NTACode,price_$,price_$$,price_$$$,price_$$$$,MISSING
0,BK09,29,46,6,0,16
1,BK17,35,54,10,2,21
2,BK19,22,17,4,1,14
3,BK21,24,12,0,0,14
4,BK23,6,4,2,0,4
5,BK25,29,41,10,1,24
6,BK26,17,3,0,0,12
7,BK27,16,22,1,0,14
8,BK28,79,49,1,0,50
9,BK29,42,29,1,1,31


In [25]:
# Creating a new df that replaces absolute counts with each price tier's share
# of the NTA's restaurants (fractions in [0, 1]; every row sums to 1).
#
# The float shares are assigned as whole columns rather than written through
# `.iloc[...] = ...`: the count columns are integers, and setting float values
# into them via iloc relies on an implicit upcast that newer pandas versions
# deprecate. Whole-column assignment gives the same values without it.
price_cols = list(bkcounts.columns[1:])  # every column after 'NTACode'
row_totals = bkcounts[price_cols].sum(axis=1)
price_shares = bkcounts[price_cols].div(row_totals, axis=0)
bk_counts_pct = bkcounts.copy()
bk_counts_pct[price_cols] = price_shares


In [26]:
# Number of distinct NTAs present in the count table.
bkcounts['NTACode'].nunique()

51

In [27]:
# Preview the per-NTA price-tier shares.
bk_counts_pct.head()

Unnamed: 0,NTACode,price_$,price_$$,price_$$$,price_$$$$,MISSING
0,BK09,0.298969,0.474227,0.061856,0.0,0.164948
1,BK17,0.286885,0.442623,0.081967,0.016393,0.172131
2,BK19,0.37931,0.293103,0.068966,0.017241,0.241379
3,BK21,0.48,0.24,0.0,0.0,0.28
4,BK23,0.375,0.25,0.125,0.0,0.25


In [28]:
# Re-check the restaurant-level columns before renaming the income bands.
bk.columns

Index(['id', 'alias', 'name', 'is_closed', 'review_cou', 'rating', 'price',
       'categories', 'latitude', 'longitude', 'address', 'city', 'zipcode',
       'state', 'country', 'index_righ', 'NTACode', 'NTAName', 'Households',
       '0-25k', '25k-50k', '50k-75k', '75k-100k', '100k-125k', '125k-150k',
       '> 150k', 'geometry'],
      dtype='object')

In [29]:
# Prefix the household-income share columns with 'pct_' so they can be told
# apart from the 'num_' household counts derived from them later.
income_pct_labels = {
    "0-25k": "pct_0-25k",
    "25k-50k": "pct_25k-50k",
    '50k-75k': 'pct_50k-75k',
    '75k-100k': 'pct_75k-100k',
    '100k-125k': 'pct_100k-125k',
    '125k-150k': 'pct_125k-150k',
    '> 150k': 'pct_> 150k',
}
bk = bk.rename(columns=income_pct_labels)

In [30]:
# For every income band, turn the share of households ('pct_<band>') into a
# household count ('num_<band>') by multiplying by the 'Households' total.
# Bands are processed in the same order as before, so the new columns are
# appended in the same order.
for band in ('0-25k', '25k-50k', '50k-75k', '75k-100k',
             '100k-125k', '125k-150k', '> 150k'):
    bk['num_' + band] = bk['Households'] * bk['pct_' + band]
bk.head()

Unnamed: 0,id,alias,name,is_closed,review_cou,rating,price,categories,latitude,longitude,...,pct_125k-150k,pct_> 150k,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,6gzQLjzJk25ePm_JS7ZAug,esme-brooklyn-2,Esme,0,328,4.5,$$,newamerican|cocktailbars,40.733203,-73.954967,...,0.086914,0.182642,POINT (-73.95496677 40.73320339),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
1,Swjm9no7DRqhThLlf0EHng,sama-street-brooklyn-2,Sama Street,0,58,4.5,$$,cocktailbars|panasian|tapasmallplates,40.73287,-73.95448,...,0.086914,0.182642,POINT (-73.95448 40.73287),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
2,utM-5navObsVA5sCRHobzA,madre-brooklyn-2,Madre,0,38,5.0,MISSING,newamerican,40.73311,-73.95798,...,0.086914,0.182642,POINT (-73.95798000000001 40.73311),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
3,L9SuMN3UsGipopWOe3pr9w,chiko-brooklyn-2,Chiko,0,36,5.0,MISSING,japanese|sushi,40.7319,-73.95422,...,0.086914,0.182642,POINT (-73.95421999999998 40.7319),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
4,vyKBwzRdNX4yiJDIFv37iw,oxomoco-brooklyn-2,Oxomoco,0,247,4.0,$$$,mexican,40.72991,-73.95548,...,0.086914,0.182642,POINT (-73.95548000000002 40.7299099),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0


In [31]:
# Keep only the NTA-level income attributes (plus the row's location columns)
# in an independent copy; still one row per restaurant at this point.
nta_id_cols = ['NTACode', 'NTAName', 'Households']
pct_cols = ['pct_0-25k', 'pct_25k-50k', 'pct_50k-75k', 'pct_75k-100k',
            'pct_100k-125k', 'pct_125k-150k', 'pct_> 150k']
location_cols = ['latitude', 'longitude', 'geometry']
num_cols = ['num_0-25k', 'num_25k-50k', 'num_50k-75k', 'num_75k-100k',
            'num_100k-125k', 'num_125k-150k', 'num_> 150k']
bkntainc = bk[nta_id_cols + pct_cols + location_cols + num_cols].copy()
bkntainc.head()

Unnamed: 0,NTACode,NTAName,Households,pct_0-25k,pct_25k-50k,pct_50k-75k,pct_75k-100k,pct_100k-125k,pct_125k-150k,pct_> 150k,latitude,longitude,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.733203,-73.954967,POINT (-73.95496677 40.73320339),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
1,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.73287,-73.95448,POINT (-73.95448 40.73287),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
2,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.73311,-73.95798,POINT (-73.95798000000001 40.73311),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
3,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.7319,-73.95422,POINT (-73.95421999999998 40.7319),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
4,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.72991,-73.95548,POINT (-73.95548000000002 40.7299099),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0


In [32]:
# Collapse to one row per NTA by keeping the first row seen for each NTACode.
# Note: the latitude / longitude / geometry kept this way are those of that
# first restaurant row, not a location computed for the NTA itself.
bkntainc = bkntainc.drop_duplicates(subset=['NTACode'], keep='first')
bkntainc.head()

Unnamed: 0,NTACode,NTAName,Households,pct_0-25k,pct_25k-50k,pct_50k-75k,pct_75k-100k,pct_100k-125k,pct_125k-150k,pct_> 150k,latitude,longitude,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,BK76,Greenpoint,16223,0.20021,0.136843,0.151883,0.13598,0.105529,0.086914,0.182642,40.733203,-73.954967,POINT (-73.95496677 40.73320339),3248.0,2220.0,2464.0,2206.0,1712.0,1410.0,2963.0
183,BK90,East Williamsburg,16075,0.284044,0.165723,0.12367,0.116579,0.087403,0.06507,0.157512,40.72103,-73.94074,POINT (-73.94074000000001 40.72103),4566.0,2664.0,1988.0,1874.0,1405.0,1046.0,2532.0
348,BK77,Bushwick North,19367,0.290029,0.207415,0.169515,0.106108,0.084164,0.053441,0.089327,40.707587,-73.933271,POINT (-73.93327099999998 40.707587),5617.0,4017.0,3283.0,2055.0,1630.0,1035.0,1730.0
602,BK78,Bushwick South,26616,0.349451,0.207582,0.152991,0.101743,0.068643,0.044222,0.075368,40.709139,-73.937227,POINT (-73.93722679999998 40.7091391),9301.0,5525.0,4072.0,2708.0,1827.0,1177.0,2006.0
738,BK73,North Side-South Side,23862,0.192566,0.167337,0.113654,0.105859,0.088718,0.070321,0.261546,40.721189,-73.957,POINT (-73.95700045 40.72118902),4595.0,3993.0,2712.0,2526.0,2117.0,1678.0,6241.0


In [33]:
# Attach the per-NTA income distribution to the per-NTA price-tier counts.
# Inner join on NTACode: only NTAs present in both tables are kept.
bk_price_inc = pd.merge(bkcounts, bkntainc, on='NTACode', how='inner')
bk_price_inc.head()

Unnamed: 0,NTACode,price_$,price_$$,price_$$$,price_$$$$,MISSING,NTAName,Households,pct_0-25k,pct_25k-50k,...,latitude,longitude,geometry,num_0-25k,num_25k-50k,num_50k-75k,num_75k-100k,num_100k-125k,num_125k-150k,num_> 150k
0,BK09,29,46,6,0,16,Brooklyn Heights-Cobble Hill,11115,0.11507,0.108052,...,40.699268,-73.992311,POINT (-73.99231090000001 40.69926820000001),1279.0,1201.0,777.0,1231.0,1042.0,840.0,4745.0
1,BK17,35,54,10,2,21,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,26150,0.253805,0.2026,...,40.57709,-73.952661,POINT (-73.95266059999999 40.57709038),6637.0,5298.0,3965.0,2832.0,2384.0,1692.0,3342.0
2,BK19,22,17,4,1,14,Brighton Beach,14557,0.395823,0.185272,...,40.57672,-73.96427,POINT (-73.96427 40.57672),5762.0,2697.0,2311.0,990.0,681.0,521.0,1595.0
3,BK21,24,12,0,0,14,Seagate-Coney Island,11236,0.478907,0.203364,...,40.578805,-73.983811,POINT (-73.983811 40.578805),5381.0,2285.0,1447.0,711.0,642.0,240.0,530.0
4,BK23,6,4,2,0,4,West Brighton,8401,0.377217,0.21307,...,40.579194,-73.980654,POINT (-73.9806538 40.5791945),3169.0,1790.0,1250.0,962.0,428.0,266.0,536.0


In [34]:
# Sanity check: (rows, columns) of the merged table.
bk_price_inc.shape

(51, 25)

In [36]:
# Write the count-based table to disk. The DataFrame index is written too
# (to_csv default), so the file starts with an unnamed index column.
price_freq_csv = data_path + '/BK_YelpPriceFreq_NTAIncDist.csv'
bk_price_inc.to_csv(price_freq_csv)


In [37]:
# Same export as the count table, but with the Yelp price tiers expressed as
# each tier's share of the NTA's restaurants instead of absolute counts.
bk_pct_price_inc = pd.merge(bk_counts_pct, bkntainc, on='NTACode', how='inner')
price_pct_csv = data_path + '/BK_YelpPriceFreqPct_NTAIncDist.csv'
bk_pct_price_inc.to_csv(price_pct_csv)
bk_pct_price_inc.shape

(51, 25)