In [21]:
import pandas as pd
import geopandas as gpd

# Current Function in download.py

In [3]:
# Run the merge step on `acs5` and keep the combined frame.
# NOTE(review): neither `merge_data_sources` nor `acs5` is defined in the cells
# visible here -- presumably both come from download.py; confirm they are
# imported/loaded before this cell so the notebook survives Restart & Run All.
final_df = merge_data_sources(acs5)

initial acs5 shape: 3123
inital number of colums: 107
--------------------------------------------------------------
df shape after merging with tracts_place: 6566
df number of colums: 114
--------------------------------------------------------------
df shape after mergin with transit score: 2097
df number of colums: 126
--------------------------------------------------------------
df shape after mergin with jobs data: 2094
df number of colums: 129
--------------------------------------------------------------
df shape after grouping: 1716
df number of colums: 115
df cols omitted becauseo of grouping: {'index', 'tsplace_GEO_ID', 'tract_name', 'job_tract_label', 'ws_link', 'place_name', 'place_name_and_type', 'transit_summary', 'place_GEO_ID', 'job_tract_GEO_ID', 'transit_description', 'tract_GEO_ID', 'city_from_ts', 'geometry'}
df num columns with zero population: 0


# Approach 1: AllTransit Data

### The steps are as follows:
    ***(Load ACS data)
    ***(Load jobs data)
    1. Load the tract shapefiles
    ***(Merge ACS, jobs, and tracts)
    2. Load the place shapefiles
    3. Add AllTransit data to the places data (0-10)
    4. Spatially join the place-level AllTransit data with the tracts
    5. Group by tract ID to get the average AllTransit score

In [51]:
# Loading tracts
tracts = gpd.read_file('shape_tracts/tl_2018_17_tract.shp')
tracts = tracts[['GEOID', 'NAMELSAD', 'ALAND', 'geometry']] \
                        .rename(columns={'GEOID': 'tract_GEO_ID', 'NAMELSAD': 'tract_name',
                       'ALAND': 'tract_area'})
# Compute the centroids once and reuse them for the point, lng and lat columns.
# NOTE(review): the centroids are taken in whatever CRS the shapefile uses; if
# that is a geographic (degree-based) CRS they are only approximate -- confirm
# that this is acceptable for the API queries made later in the notebook.
tract_centroids = tracts.geometry.centroid
tracts['centroid'] = tract_centroids
tracts['centroid_lng'] = tract_centroids.x
tracts['centroid_lat'] = tract_centroids.y
print('tracts shape: {}'.format(tracts.shape[0]))
print("--------------------------------------------------------------")

# Loading places shapefile
places = gpd.read_file('shape_places/tl_2018_17_place.shp')
places = places[['GEOID', 'NAME', 'NAMELSAD', 'geometry']] \
                        .rename(columns={'GEOID': 'place_GEO_ID', 'NAME': 'place_name',
                       'NAMELSAD': 'place_name_and_type'})
print('places shape: {}'.format(places.shape[0]))
print("--------------------------------------------------------------")

# Loading alltransit data
# NOTE(review): this CSV is read through geopandas rather than pandas; the
# explicit float casts below suggest the columns arrive as strings -- confirm
# before switching to pd.read_csv, which would infer dtypes differently.
alltransit = gpd.read_file('alltransit_places_17.csv')
# The 'place' values carry literal double quotes; strip them so the ids can be
# matched against places['place_GEO_ID'] in the merge below.
alltransit['place_GEO_ID'] = alltransit['place'].str.strip('"')
alltransit = alltransit[['name', 'blkgrps', 'households', 'alltransit_performance_score', 'place_GEO_ID']]
alltransit['blkgrps'] = alltransit['blkgrps'].astype('float')
alltransit['households'] = alltransit['households'].astype('float')
alltransit['alltransit_performance_score'] = alltransit['alltransit_performance_score'].astype('float')
places_alltransit = places.merge(alltransit, on='place_GEO_ID')
print('places_alltransit shape: {}'.format(places_alltransit.shape[0]))
print("--------------------------------------------------------------")

# Spatial join of places + alltransit with tracts.
# sjoin's default spatial predicate is "intersects", so the keyword is left off:
# it was spelled `op=` in older geopandas releases and `predicate=` in newer
# ones, and relying on the default works with both.
tracts_alltransit = gpd.sjoin(tracts, places_alltransit, how="inner")
print('tracts_alltransit shape: {}'.format(tracts_alltransit.shape[0]))
print("--------------------------------------------------------------")

# Grouping by tract to average the transit score.
# numeric_only=True makes the dropping of text/geometry columns explicit; newer
# pandas raises instead of silently discarding them.
df = tracts_alltransit.groupby('tract_GEO_ID').mean(numeric_only=True).reset_index()
print('df shape after grouping: {}'.format(df.shape[0]))
print('df number of colums: {}'.format(len(df.columns)))

tracts shape: 3123
--------------------------------------------------------------
places shape: 1369
--------------------------------------------------------------
places_alltransit shape: 1369
--------------------------------------------------------------
tracts_alltransit shape: 6566
--------------------------------------------------------------
df shape after grouping: 3121
df number of colums: 8


In [58]:
# Finding census tracts that were dropped.
# Left-merge every tract against the spatial-join result; rows the indicator
# marks as 'left_only' are tracts that have no row in tracts_alltransit.
kept_cols = [
    'tract_GEO_ID', 'geometry_x', 'centroid_x', 'centroid_lng_x', 'centroid_lat_x',
    'place_GEO_ID', 'place_name', 'place_name_and_type',
    'name', 'blkgrps', 'households', 'alltransit_performance_score',
    '_merge',
]
df_all = tracts.merge(
    tracts_alltransit,
    on='tract_GEO_ID',
    how='left',
    indicator=True,
)[kept_cols]
print(df_all.columns)
is_unmatched = df_all['_merge'] == 'left_only'
df_all[is_unmatched]

Index(['tract_GEO_ID', 'geometry_x', 'centroid_x', 'centroid_lng_x',
       'centroid_lat_x', 'place_GEO_ID', 'place_name', 'place_name_and_type',
       'name', 'blkgrps', 'households', 'alltransit_performance_score',
       '_merge'],
      dtype='object')


Unnamed: 0,tract_GEO_ID,geometry_x,centroid_x,centroid_lng_x,centroid_lat_x,place_GEO_ID,place_name,place_name_and_type,name,blkgrps,households,alltransit_performance_score,_merge
5122,17087980000,"POLYGON ((-88.78942 37.40702, -88.78934 37.407...",POINT (-88.76824 37.39958),-88.768243,37.39958,,,,,,,,left_only
5949,17097990000,"POLYGON ((-87.82118 42.29946, -87.82027 42.304...",POINT (-87.43182 42.32669),-87.431817,42.326686,,,,,,,,left_only


# Approach 2: Querying Transit API with Centroid of Census Tracts

In [61]:
# Pair every tract with each place whose geometry it intersects.
# "intersects" is sjoin's default predicate; the keyword is omitted because it
# was spelled `op=` in older geopandas releases and `predicate=` in newer ones,
# and relying on the default works with both.
tracts_places = gpd.sjoin(tracts, places, how="inner")
print(tracts_places.shape)
tracts_places.head(10)

(6566, 11)


Unnamed: 0,tract_GEO_ID,tract_name,tract_area,geometry,centroid,centroid_lng,centroid_lat,index_right,place_GEO_ID,place_name,place_name_and_type
0,17091011700,Census Tract 117,2370100,"POLYGON ((-87.88768 41.13594, -87.88764 41.136...",POINT (-87.87355 41.12949),-87.873553,41.12949,15,1707744,Bradley,Bradley village
1,17091011800,Census Tract 118,1790218,"POLYGON ((-87.89410 41.14388, -87.89400 41.143...",POINT (-87.87646 41.13977),-87.876462,41.139775,15,1707744,Bradley,Bradley village
526,17091012000,Census Tract 120,3375651,"POLYGON ((-87.89489 41.16987, -87.89386 41.169...",POINT (-87.87876 41.16053),-87.878765,41.160534,15,1707744,Bradley,Bradley village
602,17091011600,Census Tract 116,1814361,"POLYGON ((-87.86505 41.12441, -87.86502 41.124...",POINT (-87.85632 41.12787),-87.856322,41.127865,15,1707744,Bradley,Bradley village
1701,17091010500,Census Tract 105,16817460,"POLYGON ((-88.01201 41.21281, -88.01107 41.212...",POINT (-87.93018 41.17105),-87.930179,41.171049,15,1707744,Bradley,Bradley village
1702,17091011900,Census Tract 119,721000,"POLYGON ((-87.87613 41.14375, -87.87595 41.144...",POINT (-87.86804 41.14539),-87.868037,41.145393,15,1707744,Bradley,Bradley village
1961,17091010602,Census Tract 106.02,10215760,"POLYGON ((-87.87687 41.20664, -87.87654 41.206...",POINT (-87.86344 41.18516),-87.863442,41.185161,15,1707744,Bradley,Bradley village
1962,17091010702,Census Tract 107.02,25273534,"POLYGON ((-87.86641 41.15873, -87.86638 41.159...",POINT (-87.81688 41.14882),-87.816878,41.148823,15,1707744,Bradley,Bradley village
2195,17091010202,Census Tract 102.02,60837120,"POLYGON ((-87.86743 41.21718, -87.86736 41.217...",POINT (-87.81773 41.24481),-87.817734,41.244808,15,1707744,Bradley,Bradley village
2196,17091010701,Census Tract 107.01,27978606,"POLYGON ((-87.85205 41.19078, -87.85184 41.191...",POINT (-87.81211 41.18274),-87.81211,41.182744,15,1707744,Bradley,Bradley village


In [68]:
# Keep only the tract/place pairs whose place is named Chicago.
is_chicago = tracts_places['place_name'] == 'Chicago'
chicago_tracts = tracts_places[is_chicago]

# Code for Querying TransitScore API

In [71]:
from pandas.io.json import json_normalize
import requests 

def get_data(lat, lng, city, state='IL', key=None, timeout=10):
    '''
    Gets transit score data for one location.

    Inputs:
        lat, lng (str or float): coordinates of the location
        city (str): city name sent with the query
        state (str): state abbreviation sent with the query (default 'IL')
        key (str): API key; when omitted it is read from the
            WALKSCORE_API_KEY environment variable
        timeout (float): seconds to wait for the server before giving up
    Returns:
        (rv, over_quota) tuple:
            rv (parsed json or None): the decoded response body, or None when
                the status code is 400 or the body is not valid JSON
            over_quota (bool): True when the service answered 'Over quota.'
    Raises:
        ValueError: if no API key is passed and the environment variable is
            not set
    '''
    import os

    # NOTE(review): API keys used to be hard-coded in this function. They are
    # still in version history and should be revoked and reissued.
    if key is None:
        key = os.environ.get('WALKSCORE_API_KEY')
    if not key:
        raise ValueError(
            'No API key: pass key=... or set the WALKSCORE_API_KEY environment variable')

    rv = None
    over_quota = False

    # Let requests build the query string so that city names containing
    # spaces, '&', etc. are encoded correctly.
    url = 'https://transit.walkscore.com/transit/score/'
    params = {'lat': lat, 'lon': lng, 'city': city, 'state': state, 'wsapikey': key}

    r = requests.get(url, params=params, timeout=timeout)

    # Check the quota message before touching the body: it is plain text, so
    # decoding it as JSON would raise before the flag could be set.
    if r.text.strip() == 'Over quota.':
        over_quota = True
        print('error: over quota')
    elif not r.status_code == 400:
        try:
            rv = r.json()
        except ValueError:
            # Non-JSON body (e.g. an error page): treat as "no data".
            rv = None
    print(r)

    return rv, over_quota

In [72]:
def create_datalist(df):
    '''
    Queries get_data once per row of df and collects the responses.

    Inputs:
        df (pandas Dataframe): rows must expose centroid_lat, centroid_lng,
            place_name and tract_GEO_ID
    Returns:
        (tuple) of
            list of responses, each tagged with the row's tract_GEO_ID,
                centroid_lat and centroid_lng (rows with no response are
                left out)
            number of rows for which get_data reported being over quota
    '''
    responses = []
    quota_hits = 0

    for tract in df.itertuples():
        lat, lng, city = tract.centroid_lat, tract.centroid_lng, tract.place_name
        print(lat, lng, city)

        payload, hit_quota = get_data(lat, lng, city)

        if payload is not None:
            # Tag the response with the identifiers of the tract it belongs to.
            payload['tract_GEO_ID'] = tract.tract_GEO_ID
            payload['centroid_lat'] = lat
            payload['centroid_lng'] = lng
            responses.append(payload)

        if hit_quota:
            quota_hits += 1

    return responses, quota_hits

In [74]:
# Query the API once per row of chicago_tracts.
# NOTE: create_datalist returns a (responses, over_quota_count) tuple, so
# `datalist` is that tuple -- the list of responses is datalist[0].
datalist = create_datalist(chicago_tracts)

41.80274526059321 -87.69404527250397 Chicago
<Response [200]>
41.76219787119815 -87.5903116152784 Chicago
<Response [200]>
41.76441328813643 -87.57137298777623 Chicago
<Response [200]>
41.76075000780601 -87.57129826090427 Chicago
<Response [200]>
41.76431388102515 -87.58126994948839 Chicago
<Response [200]>
41.760640961857824 -87.58118514144209 Chicago
<Response [200]>
41.8161780010653 -87.67510211963132 Chicago
<Response [200]>
41.77603986887358 -87.6495479476521 Chicago
<Response [200]>
41.76876652787459 -87.64935954801622 Chicago
<Response [200]>
41.790563991479864 -87.65235243876248 Chicago
<Response [200]>
41.776430537437136 -87.637630384517 Chicago
<Response [200]>
42.026154357080706 -87.66706269255916 Chicago
<Response [200]>
41.72761654885647 -87.6747356945031 Chicago
<Response [200]>
41.7130418517068 -87.66605461067454 Chicago
<Response [200]>
41.71392006172718 -87.67714391600353 Chicago
<Response [200]>
41.70999186176503 -87.68666864312283 Chicago
<Response [200]>
41.70283530

<Response [200]>
41.91236930864402 -87.6750677020363 Chicago
<Response [200]>
41.911968330020684 -87.6836267273753 Chicago
<Response [200]>
41.950673215087036 -87.66653643331286 Chicago
<Response [200]>
41.95272000049414 -87.66050075074342 Chicago
<Response [200]>
41.91207424533214 -87.69948016304514 Chicago
<Response [200]>
41.91213415027694 -87.68971114874036 Chicago
<Response [200]>
41.91208778079127 -87.69459863700418 Chicago
<Response [200]>
41.904716186832445 -87.70188668146226 Chicago
<Response [200]>
41.90056013235442 -87.69419701935905 Chicago
<Response [200]>
41.856333064308075 -87.65956370300141 Chicago
<Response [200]>
41.8559903405074 -87.6649061181708 Chicago
<Response [200]>
41.85593372130763 -87.66857374611115 Chicago
<Response [200]>
42.0013156187228 -87.69363728155903 Chicago
<Response [200]>
41.85577165360453 -87.67467409535564 Chicago
<Response [200]>
41.71310287053611 -87.54989037857096 Chicago
<Response [200]>
41.71426543294736 -87.56004956750647 Chicago
<Response

<Response [200]>
41.89247817424054 -87.66473878651617 Chicago
<Response [200]>
41.89255798954084 -87.65984792427027 Chicago
<Response [200]>
41.92215307591395 -87.6294059796152 Chicago
<Response [200]>
41.97295366781568 -87.77261142840995 Chicago
<Response [200]>
41.949473096301936 -87.77175783371868 Chicago
<Response [200]>
41.94959156460801 -87.76195802891986 Chicago
<Response [200]>
41.949708612798055 -87.75211816585293 Chicago
<Response [200]>
41.94229034323704 -87.76171751796598 Chicago
<Response [200]>
41.942168985833895 -87.77149534842334 Chicago
<Response [200]>
41.908374024740276 -87.6709298551353 Chicago
<Response [200]>
41.90664032016157 -87.68958010522516 Chicago
<Response [200]>
41.90670971179178 -87.66532774045673 Chicago
<Response [200]>
41.906771661230216 -87.68102499823829 Chicago
<Response [200]>
41.90603118704052 -87.67531376105792 Chicago
<Response [200]>
41.89265360988218 -87.65251724729745 Chicago
<Response [200]>
41.91235084585062 -87.75107298799752 Chicago
<Resp

<Response [200]>
41.87104221392834 -87.63140269557016 Chicago
<Response [200]>
41.74820975059761 -87.61007386117005 Chicago
<Response [200]>
41.65021982681619 -87.59979212902076 Chicago
<Response [200]>
41.72756028822689 -87.62950436589648 Chicago
<Response [200]>
41.731877179046236 -87.54785286272997 Chicago
<Response [200]>
41.871366868861806 -87.6886812249277 Chicago
<Response [200]>
41.870268395305494 -87.68491363569856 Chicago
<Response [200]>
41.793104224908916 -87.6131830257535 Chicago
<Response [200]>
41.79104146933948 -87.62317914622952 Chicago
<Response [200]>
41.78286330067333 -87.6205580062742 Chicago
<Response [200]>
41.80150187741048 -87.57934329745312 Chicago
<Response [200]>
41.797830877089815 -87.60374228499697 Chicago
<Response [200]>
41.797968756217415 -87.59894215267497 Chicago
<Response [200]>
41.74614698996334 -87.53572874177262 Chicago
<Response [200]>
41.74194387568329 -87.5790894038454 Chicago
<Response [200]>
41.741387211912766 -87.54701006752616 Chicago
<Resp

<Response [200]>
41.7957700827239 -87.79165139091673 Chicago
<Response [200]>
41.78850207783334 -87.79150325865757 Chicago
<Response [200]>
42.01594074001859 -87.66653510545581 Chicago
<Response [200]>
42.00544005381296 -87.65716651278969 Chicago
<Response [200]>
42.006277592259664 -87.66768304345422 Chicago
<Response [200]>
42.01572106603363 -87.68711693200412 Chicago
<Response [200]>
41.878596769033 -87.68890087411113 Chicago
<Response [200]>
41.78322409382214 -87.59869701520591 Chicago
<Response [200]>
42.00308735349545 -87.8726345733529 Chicago
<Response [200]>
41.995658310509974 -87.79431458049942 Chicago
<Response [200]>
42.01569025362034 -87.6995191177754 Chicago
<Response [200]>
41.9808498711194 -87.73606600979367 Chicago
<Response [200]>
42.00848802720408 -87.68591186055112 Chicago
<Response [200]>
41.90585893524394 -87.63090096054172 Chicago
<Response [200]>
41.91244167304762 -87.67018161625876 Chicago
<Response [200]>
41.79764778873271 -87.66711377792961 Chicago
<Response [2

<Response [200]>
41.90632297290312 -87.72153968934853 Chicago
<Response [200]>
41.90583962225873 -87.71495301586033 Chicago
<Response [200]>
41.90669182901313 -87.71035050934005 Chicago
<Response [200]>
41.75386030519256 -87.80480616998189 Chicago
<Response [200]>
41.76289811973161 -87.77452187904584 Chicago
<Response [200]>
41.67594737359242 -87.75381895921176 Chicago
<Response [200]>
41.681416170735226 -87.70276951425448 Chicago
<Response [200]>
41.81829916339181 -87.77301151048168 Chicago
<Response [200]>
41.786578280836764 -87.67640438563645 Chicago
<Response [200]>
41.810219713897254 -87.68664360022211 Chicago
<Response [200]>
41.81380605175953 -87.62376797891577 Chicago
<Response [200]>
42.00198220055021 -87.77500735427209 Chicago
<Response [200]>
41.906154934308226 -87.73399855925338 Chicago
<Response [200]>
41.8995065454205 -87.67960033612403 Chicago
<Response [200]>
41.89958936752496 -87.67471876526038 Chicago
<Response [200]>
41.95815388295837 -87.65301789121115 Chicago
<Resp

<Response [200]>
41.74862709756805 -87.6014511858359 Chicago
<Response [200]>
41.842480529246636 -87.71962538619 Chicago
<Response [200]>
41.88102698390461 -87.63274486673286 Chicago
<Response [200]>
41.87907767420261 -87.65704585639726 Chicago
<Response [200]>
41.87866686389833 -87.6716887669734 Chicago
<Response [200]>
41.87169458300137 -87.65412373420163 Chicago
<Response [200]>
41.870415693242435 -87.6750793725433 Chicago
<Response [200]>
41.96354820996094 -87.71357531282779 Chicago
<Response [200]>
41.964652096331335 -87.72101658194876 Chicago
<Response [200]>
41.967037479024626 -87.71265244888546 Chicago
<Response [200]>
41.95558410190805 -87.71308765177974 Chicago
<Response [200]>
41.956571852936875 -87.70017370851394 Chicago
<Response [200]>
41.94249177488856 -87.74715538394558 Chicago
<Response [200]>
41.942382663325816 -87.75435269944397 Chicago
<Response [200]>
41.93485963725667 -87.77123177067446 Chicago
<Response [200]>
41.753530166214794 -87.56018712379178 Chicago
<Respon

In [80]:
# Element 0 of the tuple returned by create_datalist: the list of responses.
datalist[0]

[{'transit_score': 64,
  'help_link': 'https://www.redfin.com/how-walk-score-works',
  'summary': '11 nearby routes: 10 bus, 1 rail, 0 other',
  'logo_url': 'https://cdn.walk.sc/images/transit-score-logo.png',
  'ws_link': 'https://www.walkscore.com/score/loc/lat=41.8027/lng=-87.6940/?utm_source=gmail.com&utm_medium=ts_api&utm_campaign=ts_api',
  'description': 'Good Transit',
  'tract_GEO_ID': '17031630200',
  'centroid_lat': 41.80274526059321,
  'centroid_lng': -87.69404527250397},
 {'transit_score': 64,
  'help_link': 'https://www.redfin.com/how-walk-score-works',
  'summary': '10 nearby routes: 9 bus, 1 rail, 0 other',
  'logo_url': 'https://cdn.walk.sc/images/transit-score-logo.png',
  'ws_link': 'https://www.walkscore.com/score/loc/lat=41.7622/lng=-87.5903/?utm_source=gmail.com&utm_medium=ts_api&utm_campaign=ts_api',
  'description': 'Good Transit',
  'tract_GEO_ID': '17031430400',
  'centroid_lat': 41.76219787119815,
  'centroid_lng': -87.5903116152784},
 {'transit_score': 67,
 

In [86]:
# Flatten the list of responses (element 0 of the create_datalist result)
# into a table with one row per response.
transit_records = datalist[0]
chi_df = pd.DataFrame(json_normalize(transit_records))
chi_df.head(10)

Unnamed: 0,centroid_lat,centroid_lng,description,help_link,logo_url,summary,tract_GEO_ID,transit_score,ws_link
0,41.802745,-87.694045,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"11 nearby routes: 10 bus, 1 rail, 0 other",17031630200,64,https://www.walkscore.com/score/loc/lat=41.802...
1,41.762198,-87.590312,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"10 nearby routes: 9 bus, 1 rail, 0 other",17031430400,64,https://www.walkscore.com/score/loc/lat=41.762...
2,41.764413,-87.571373,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"10 nearby routes: 9 bus, 1 rail, 0 other",17031430600,67,https://www.walkscore.com/score/loc/lat=41.764...
3,41.76075,-87.571298,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"10 nearby routes: 9 bus, 1 rail, 0 other",17031430800,68,https://www.walkscore.com/score/loc/lat=41.760...
4,41.764314,-87.58127,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"12 nearby routes: 11 bus, 1 rail, 0 other",17031430500,67,https://www.walkscore.com/score/loc/lat=41.764...
5,41.760641,-87.581185,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"12 nearby routes: 11 bus, 1 rail, 0 other",17031430900,69,https://www.walkscore.com/score/loc/lat=41.760...
6,41.816178,-87.675102,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"10 nearby routes: 9 bus, 1 rail, 0 other",17031610300,58,https://www.walkscore.com/score/loc/lat=41.816...
7,41.77604,-87.649548,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"9 nearby routes: 7 bus, 2 rail, 0 other",17031681000,65,https://www.walkscore.com/score/loc/lat=41.776...
8,41.768767,-87.64936,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"9 nearby routes: 8 bus, 1 rail, 0 other",17031681100,62,https://www.walkscore.com/score/loc/lat=41.768...
9,41.790564,-87.652352,Good Transit,https://www.redfin.com/how-walk-score-works,https://cdn.walk.sc/images/transit-score-logo.png,"7 nearby routes: 7 bus, 0 rail, 0 other",17031680500,59,https://www.walkscore.com/score/loc/lat=41.790...


In [87]:
# Save the Chicago transit-score table as a pickle file (relative path).
chi_df.to_pickle('chicago_transitscores.pkl')