In [1]:
import pandas as pd
import geopandas as gpd

In [8]:
def merge_data_sources(acs5):
    '''
    Links acs5 data with transit score and job data. Calculates population 
    density and job density.
    Inputs:
        acs5 (pandas DataFrame)
    Outputs:
        (full pandas DataFrame with transit score data)
    '''
    #Extracting census tract ID
    acs5['tract_GEO_ID'] = acs5['GEO_ID'].apply(lambda x: x[9:])

    #Loading tracts
    tracts = gpd.read_file('shape_tracts/tl_2018_17_tract.shp')
    tracts = tracts[['GEOID', 'NAMELSAD', 'ALAND', 'geometry']] \
                        .rename(columns={'GEOID': 'tract_GEO_ID', 'NAMELSAD': 'tract_name',
                       'ALAND': 'tract_area'})

    #Loading places
    places = gpd.read_file('shape_places/tl_2018_17_place.shp')
    places = places[['GEOID', 'NAME', 'NAMELSAD', 'geometry']] \
                        .rename(columns={'GEOID': 'place_GEO_ID', 'NAME': 'place_name',
                       'NAMELSAD': 'place_name_and_type'})

    #Merging tracts and places
    tracts_places = gpd.sjoin(tracts, places, how="inner", op="intersects")

    #Merging acs data with traces/places
    df = pd.merge(acs5, tracts_places, left_on='tract_GEO_ID', right_on='tract_GEO_ID')

    #Importing transit score csv and merging
    ts = pd.read_csv('transit_score.csv').rename(columns={'nearby_routes': 'num_nearby_routes', \
         'bus': 'num_bus_routes', 'rail': 'num_rail_routes', 'other': 'num_other_routes', \
         'city': 'city_from_ts', 'description': 'transit_description', 'summary': 'transit_summary', \
         'Lat': 'lat', 'Lon': 'lon'})
    ts['tsplace_GEO_ID'] = ts['GEO_ID'].apply(lambda x: x[9:])
    ts = ts.drop(columns=['censusgeo', 'Place_Type', 'state', 'GEO_ID'])
    df = pd.merge(df, ts, how='inner', left_on='place_GEO_ID', right_on='tsplace_GEO_ID')

    #Importing jobs by tract and merging
    jobs = pd.read_csv('il_jobs_by_tract_2017.csv')
    jobs = jobs[['id', 'label', 'c000']] \
            .rename(columns={'id': 'job_tract_GEO_ID', 'label': 'job_tract_label',
                             'c000': 'num_jobs'})
    jobs['job_tract_GEO_ID'] = jobs['job_tract_GEO_ID'].astype(str)
    df = pd.merge(jobs, df, how='inner', left_on='job_tract_GEO_ID', right_on='tract_GEO_ID')

    #Averaging transit score for census tracts
    df = df.groupby('GEO_ID').mean().reset_index()

    #Calculating population density and job density
    df['job_density'] = df['num_jobs'] / ((df['tract_area'])/1000000)
    df['pop_density'] = df['race_total'] / ((df['tract_area'])/1000000)

    #Taking care of null values using pipline methods
    cols_with_null = explore_df_summary_stats(df)
    final_df, replacement = impute(df, cols_with_null)

    return final_df

In [10]:
acs5 = pd.read_pickle('raw_acs5.pkl')
from download import merge_data_sources

ModuleNotFoundError: No module named 'download'

In [7]:
for col in .columns:
    print(col)

GEO_ID
num_jobs
car_avail_none
car_avail_total
commut_took_public_trans
commut_total
commute_time_10_to_14_min
commute_time_15_to_19_min
commute_time_20_to_24_min
commute_time_25_to_29_min
commute_time_30_to_34_min
commute_time_35_to_39_min
commute_time_40_to_44_min
commute_time_45_to_59_min
commute_time_5_to_9_min
commute_time_60_to_89_min
commute_time_90_plus_min
commute_time_less_5_min
commute_time_total
disabl_none_18_to_64
disabl_none_65_plus
disabl_none_under_18
disabl_status_total
educ_attnmt_HS_GED
educ_attnmt_HS_reg
educ_attnmt_bachelors
educ_attnmt_grad_doctorate
educ_attnmt_grad_masters
educ_attnmt_grad_prof_degree
educ_attnmt_some_college_associates
educ_attnmt_some_college_less_1_yr
educ_attnmt_some_college_more_1_yr
educ_attnmt_total
emp_men_16_to_19_in_lbr_force
emp_men_20_to_21_in_lbr_force
emp_men_22_to_24_in_lbr_force
emp_men_25_to_29_in_lbr_force
emp_men_60_to_61_in_lbr_force
emp_men_62_to_64_in_lbr_force
emp_men_65_to_69_in_lbr_force
emp_men_70_to_74_in_lbr_force
em