# Census Data Aggregation

In [39]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os

# go into census folder, get file names
os.chdir("/Users/ashwin/Comp/census")
# os.listdir() returns names in arbitrary order; sort them so every per-year list is in
# ascending table-id order (B01001, B15002, ...), which is the order the variable names
# are later paired with the files by position
census_files = sorted(os.listdir())

# keep only the annotated csv tables, then split file names into 2011, 2012 and 2016 lists
census_files = [file for file in census_files if file.endswith("ann.csv")]
census_files_2011 = [file for file in census_files if file.startswith("ACS_11_5YR")]
census_files_2012 = [file for file in census_files if file.startswith("ACS_12_5YR")]
census_files_2016 = [file for file in census_files if file.startswith("ACS_16_5YR")]

In [2]:
census_files_2011

['ACS_11_5YR_B01001_with_ann.csv',
 'ACS_11_5YR_B15002_with_ann.csv',
 'ACS_11_5YR_B17017_with_ann.csv',
 'ACS_11_5YR_B19313_with_ann.csv',
 'ACS_11_5YR_B23001_with_ann.csv',
 'ACS_11_5YR_B25014_with_ann.csv']

In [3]:
census_files_2012

['ACS_12_5YR_B01001_with_ann.csv',
 'ACS_12_5YR_B15002_with_ann.csv',
 'ACS_12_5YR_B17017_with_ann.csv',
 'ACS_12_5YR_B19313_with_ann.csv',
 'ACS_12_5YR_B23001_with_ann.csv',
 'ACS_12_5YR_B25014_with_ann.csv']

In [4]:
census_files_2016

['ACS_16_5YR_B01001_with_ann.csv',
 'ACS_16_5YR_B15002_with_ann.csv',
 'ACS_16_5YR_B17017_with_ann.csv',
 'ACS_16_5YR_B19313_with_ann.csv',
 'ACS_16_5YR_B23001_with_ann.csv',
 'ACS_16_5YR_B25014_with_ann.csv']

In [5]:
# keys for eventual dictionary, matching order of above filenames
hardship_vars = ["age", "education", "poverty", "income", "unemployment", "crowded_housing"]

def get_frames(var_names, file_names, census_dir = "/Users/ashwin/Comp/census"):
    ''' simple helper function for creating dictionary mapping variable names to their dataframe

        var_names:  keys of the returned dictionary, paired with file_names by position
        file_names: csv files inside census_dir, one per variable name
        census_dir: folder holding the csv files (defaults to the original hard-coded location)

        returns a dict {variable name: dataframe}; raises ValueError when the two lists differ
        in length, since zip would otherwise silently drop the unmatched entries.

        side effect (kept from the original): the working directory ends up at the parent of
        census_dir, now also when reading one of the files fails '''
    var_names = list(var_names)
    file_names = list(file_names)
    if len(var_names) != len(file_names):
        raise ValueError("expected one file per variable, got {} variable names and {} files"
                         .format(len(var_names), len(file_names)))

    os.chdir(census_dir)
    try:
        # set index column to census tract id column, use annotations at row 1 for header
        frames = [pd.read_csv(file, index_col = 1, header = 1) for file in file_names]
    finally:
        os.chdir("..")

    return dict(zip(var_names, frames))

In [8]:
# for our comparison: one {variable name: dataframe} dictionary per survey year
acs_2011 = get_frames(hardship_vars, census_files_2011)
acs_2016 = get_frames(hardship_vars, census_files_2016)

# to compare with city's existing entry
acs_2012 = get_frames(hardship_vars, census_files_2012)

### Aggregate Variables

Next, for each table we list the columns we need, so that each list can be passed directly as a column selection on the corresponding dataframe.

#### Dependents

In [10]:
acs_2011["age"].head()

Unnamed: 0_level_0,Id,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Male:,Margin of Error; Male:,Estimate; Male: - Under 5 years,Margin of Error; Male: - Under 5 years,Estimate; Male: - 5 to 9 years,Margin of Error; Male: - 5 to 9 years,...,Estimate; Female: - 67 to 69 years,Margin of Error; Female: - 67 to 69 years,Estimate; Female: - 70 to 74 years,Margin of Error; Female: - 70 to 74 years,Estimate; Female: - 75 to 79 years,Margin of Error; Female: - 75 to 79 years,Estimate; Female: - 80 to 84 years,Margin of Error; Female: - 80 to 84 years,Estimate; Female: - 85 years and over,Margin of Error; Female: - 85 years and over
Id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1400000US17031010100,"Census Tract 101, Cook County, Illinois",5203,688,2190,390,269,117,135,92,...,90,59,42,51,11,18,12,18,13,14
17031010201,1400000US17031010201,"Census Tract 102.01, Cook County, Illinois",6770,750,3471,507,429,128,143,115,...,24,36,65,60,87,56,0,89,15,24
17031010202,1400000US17031010202,"Census Tract 102.02, Cook County, Illinois",2708,562,1391,280,161,104,26,33,...,18,28,16,16,17,22,18,29,53,84
17031010300,1400000US17031010300,"Census Tract 103, Cook County, Illinois",6282,837,3203,505,210,125,112,130,...,85,52,82,55,75,74,115,125,236,143
17031010400,1400000US17031010400,"Census Tract 104, Cook County, Illinois",4737,686,2106,477,124,72,46,39,...,85,76,26,29,22,27,32,52,22,25


In [36]:
# dependents are people under 18 or aged 65 and over; collect the male and female
# estimate columns for those age brackets, plus the matching margin of error columns
dependent_ages = ["Under 5", "5 to 9", "10 to 14", "15 to 17", "65 and 66", "67 to 69",
                  "70 to 74", "75 to 79", "80 to 84", "85"]

# every bracket but the last is closed; the open-ended last one reads "85 years and over"
male_dependents = list(map("Estimate; Male: - {} years".format, dependent_ages[:-1]))
male_dependents.append("Estimate; Male: - {} years".format(dependent_ages[-1]) + " and over")

female_dependents = [column.replace("Male", "Female") for column in male_dependents]

dependents = [*male_dependents, *female_dependents]
dependents_error = [column.replace("Estimate", "Margin of Error") for column in dependents]

#### Education

In [11]:
acs_2011["education"].head()

Unnamed: 0_level_0,Id,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Male:,Margin of Error; Male:,Estimate; Male: - No schooling completed,Margin of Error; Male: - No schooling completed,Estimate; Male: - Nursery to 4th grade,Margin of Error; Male: - Nursery to 4th grade,...,Estimate; Female: - Associate's degree,Margin of Error; Female: - Associate's degree,Estimate; Female: - Bachelor's degree,Margin of Error; Female: - Bachelor's degree,Estimate; Female: - Master's degree,Margin of Error; Female: - Master's degree,Estimate; Female: - Professional school degree,Margin of Error; Female: - Professional school degree,Estimate; Female: - Doctorate degree,Margin of Error; Female: - Doctorate degree
Id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1400000US17031010100,"Census Tract 101, Cook County, Illinois",3182,387,1509,292,0,89,0,89,...,104,76,375,117,91,60,16,27,43,43
17031010201,1400000US17031010201,"Census Tract 102.01, Cook County, Illinois",4348,416,2314,340,32,50,23,44,...,161,111,399,136,163,83,13,29,13,21
17031010202,1400000US17031010202,"Census Tract 102.02, Cook County, Illinois",1999,376,1069,218,36,79,0,89,...,58,39,105,60,54,56,31,48,22,37
17031010300,1400000US17031010300,"Census Tract 103, Cook County, Illinois",4725,526,2389,315,27,42,3,6,...,47,43,384,138,286,112,13,20,15,22
17031010400,1400000US17031010400,"Census Tract 104, Cook County, Illinois",2675,293,1186,179,0,89,0,89,...,23,25,409,117,322,108,14,22,56,50


In [12]:
# educational attainment: the table's male and female columns for every level of
# schooling below a high school diploma
below_highschool = ["No schooling completed", "Nursery to 4th grade", "5th and 6th grade", "7th and 8th grade",
                    "9th grade", "10th grade", "11th grade", "12th grade, no diploma"]

male_education = list(map("Male: - {}".format, below_highschool))
female_education = [label.replace("Male", "Female") for label in male_education]

# prefix both sexes with the estimate marker, then derive the margin of error names
education = list(map("Estimate; {}".format, male_education + female_education))
education_error = [label.replace("Estimate", "Margin of Error") for label in education]

#### Poverty

In [13]:
acs_2011["poverty"].head()

Unnamed: 0_level_0,Id,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Income in the past 12 months below poverty level:,Margin of Error; Income in the past 12 months below poverty level:,Estimate; Income in the past 12 months below poverty level: - Family households:,Margin of Error; Income in the past 12 months below poverty level: - Family households:,Estimate; Income in the past 12 months below poverty level: - Family households: - Married-couple family:,Margin of Error; Income in the past 12 months below poverty level: - Family households: - Married-couple family:,...,Estimate; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder:,Margin of Error; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder:,Estimate; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder under 25 years,Margin of Error; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder under 25 years,Estimate; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder 25 to 44 years,Margin of Error; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder 25 to 44 years,Estimate; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder 45 to 64 years,Margin of Error; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder 45 to 64 years,Estimate; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder 65 years and over,Margin of Error; Income in the past 12 months at or above poverty level: - Nonfamily households: - Female householder: - Householder 65 years and over
Id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1400000US17031010100,"Census Tract 101, Cook County, Illinois",2176,188,616,193,270,130,15,24,...,423,149,69,66,193,84,121,71,40,40
17031010201,1400000US17031010201,"Census Tract 102.01, Cook County, Illinois",2584,165,512,168,291,118,121,81,...,288,109,11,16,137,82,97,60,43,42
17031010202,1400000US17031010202,"Census Tract 102.02, Cook County, Illinois",1136,105,269,106,88,55,21,33,...,157,87,0,89,22,26,101,73,34,39
17031010300,1400000US17031010300,"Census Tract 103, Cook County, Illinois",2734,206,548,155,204,129,82,77,...,447,133,10,18,189,111,206,95,42,38
17031010400,1400000US17031010400,"Census Tract 104, Cook County, Illinois",1977,144,434,140,94,66,55,60,...,533,134,68,66,158,67,243,91,64,59


In [14]:
# poverty is defined as number of households with income in the past 12 months below the poverty level
# (single-column lists, so they are selected with df.loc[:, ...] like the multi-column variables)
poverty = ["Estimate; Income in the past 12 months below poverty level:"]
poverty_error = ["Margin of Error; Income in the past 12 months below poverty level:"]

#### Crowded Housing

In [15]:
acs_2011["crowded_housing"].head()

Unnamed: 0_level_0,Id,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Owner occupied:,Margin of Error; Owner occupied:,Estimate; Owner occupied: - 0.50 or less occupants per room,Margin of Error; Owner occupied: - 0.50 or less occupants per room,Estimate; Owner occupied: - 0.51 to 1.00 occupants per room,Margin of Error; Owner occupied: - 0.51 to 1.00 occupants per room,...,Estimate; Renter occupied: - 0.50 or less occupants per room,Margin of Error; Renter occupied: - 0.50 or less occupants per room,Estimate; Renter occupied: - 0.51 to 1.00 occupants per room,Margin of Error; Renter occupied: - 0.51 to 1.00 occupants per room,Estimate; Renter occupied: - 1.01 to 1.50 occupants per room,Margin of Error; Renter occupied: - 1.01 to 1.50 occupants per room,Estimate; Renter occupied: - 1.51 to 2.00 occupants per room,Margin of Error; Renter occupied: - 1.51 to 2.00 occupants per room,Estimate; Renter occupied: - 2.01 or more occupants per room,Margin of Error; Renter occupied: - 2.01 or more occupants per room
Id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1400000US17031010100,"Census Tract 101, Cook County, Illinois",2176,188,436,110,306,102,120,64,...,844,170,757,217,124,107,15,25,0,89
17031010201,1400000US17031010201,"Census Tract 102.01, Cook County, Illinois",2584,165,902,181,663,185,226,109,...,841,198,612,160,171,86,58,60,0,89
17031010202,1400000US17031010202,"Census Tract 102.02, Cook County, Illinois",1136,105,319,128,242,116,77,55,...,535,140,184,86,61,64,37,42,0,89
17031010300,1400000US17031010300,"Census Tract 103, Cook County, Illinois",2734,206,1007,212,794,199,196,115,...,1099,223,478,186,95,92,49,62,6,12
17031010400,1400000US17031010400,"Census Tract 104, Cook County, Illinois",1977,144,682,142,560,148,122,72,...,858,159,323,135,45,57,55,45,14,24


In [16]:
# crowded housing: units with more than 1 occupant per room, for both owner occupied
# and renter occupied units
crowded_numbers = ["{} occupants per room".format(interval)
                   for interval in ("1.01 to 1.50", "1.51 to 2.00", "2.01 or more")]

owned_crowded = list(map("Owner occupied: - {}".format, crowded_numbers))
rented_crowded = list(map("Renter occupied: - {}".format, crowded_numbers))

# estimate columns for both tenures, then the matching margin of error columns
crowded = list(map("Estimate; {}".format, owned_crowded + rented_crowded))
crowded_error = [column.replace("Estimate", "Margin of Error") for column in crowded]

#### Unemployment 

In [17]:
acs_2011["unemployment"].head()

Unnamed: 0_level_0,Id,Geography,Estimate; Total:,Margin of Error; Total:,Estimate; Male:,Margin of Error; Male:,Estimate; Male: - 16 to 19 years:,Margin of Error; Male: - 16 to 19 years:,Estimate; Male: - 16 to 19 years: - In labor force:,Margin of Error; Male: - 16 to 19 years: - In labor force:,...,Estimate; Female: - 75 years and over:,Margin of Error; Female: - 75 years and over:,Estimate; Female: - 75 years and over: - In labor force:,Margin of Error; Female: - 75 years and over: - In labor force:,Estimate; Female: - 75 years and over: - In labor force: - Employed,Margin of Error; Female: - 75 years and over: - In labor force: - Employed,Estimate; Female: - 75 years and over: - In labor force: - Unemployed,Margin of Error; Female: - 75 years and over: - In labor force: - Unemployed,Estimate; Female: - 75 years and over: - Not in labor force,Margin of Error; Female: - 75 years and over: - Not in labor force
Id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,1400000US17031010100,"Census Tract 101, Cook County, Illinois",3757,526,1615,347,0,89,0,89,...,36,31,0,89,0,89,0,89,36,31
17031010201,1400000US17031010201,"Census Tract 102.01, Cook County, Illinois",5288,553,2703,401,66,60,0,89,...,102,61,17,25,17,25,0,89,85,57
17031010202,1400000US17031010202,"Census Tract 102.02, Cook County, Illinois",2309,515,1185,248,47,51,20,31,...,88,95,0,89,0,89,0,89,88,95
17031010300,1400000US17031010300,"Census Tract 103, Cook County, Illinois",5255,583,2636,351,99,100,51,77,...,426,206,0,89,0,89,0,89,426,206
17031010400,1400000US17031010400,"Census Tract 104, Cook County, Illinois",4332,640,1881,460,343,235,126,80,...,76,64,0,89,0,89,0,89,76,64


In [18]:
# unemployed: people aged 16 and over who are in the labor force but unemployed
age_brackets = ["16 to 19", "20 and 21", "22 to 24", "25 to 29", "30 to 34", "35 to 44", "45 to 54", "55 to 59",
                "60 and 61", "62 to 64", "65 to 69", "70 to 74", "75"]
labor_template = "{} years: - In labor force: - Civilian: - Unemployed"

# brackets below 65 keep the "Civilian" level of the column name
unemployed_years = [labor_template.format(interval) for interval in age_brackets[:-3]]
# the last three brackets have no armed forces split, so their names skip that level
unemployed_years += [labor_template.format(interval).replace("Civilian: - ", "")
                     for interval in age_brackets[-3:]]
# the final bracket is open-ended
unemployed_years[-1] = unemployed_years[-1].replace("years", "years and over")

male_unemployed = list(map("Male: - {}".format, unemployed_years))
female_unemployed = [column.replace("Male", "Female") for column in male_unemployed]

unemployed = list(map("Estimate; {}".format, male_unemployed + female_unemployed))
unemployed_error = [column.replace("Estimate", "Margin of Error") for column in unemployed]

#### Income

In [19]:
acs_2011["income"].head()

Unnamed: 0_level_0,Id,Geography,Estimate; Aggregate income in the past 12 months (in 2011 inflation-adjusted dollars),Margin of Error; Aggregate income in the past 12 months (in 2011 inflation-adjusted dollars)
Id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17031010100,1400000US17031010100,"Census Tract 101, Cook County, Illinois",106848200,16092060
17031010201,1400000US17031010201,"Census Tract 102.01, Cook County, Illinois",147879300,19686733
17031010202,1400000US17031010202,"Census Tract 102.02, Cook County, Illinois",62301300,13576441
17031010300,1400000US17031010300,"Census Tract 103, Cook County, Illinois",166689600,24506533
17031010400,1400000US17031010400,"Census Tract 104, Cook County, Illinois",126446300,17593240


We handle this differently: we first need the aggregate income of each community area, which we then divide by the community area's total population (taken from the sex-by-age table).

In [20]:
# estimate and margin of error columns of the 2011 aggregate income table; the column names
# embed the survey year, so aggregate_income builds the year-specific names itself
income = ["Estimate; Aggregate income in the past 12 months (in 2011 inflation-adjusted dollars)", 
         "Margin of Error; Aggregate income in the past 12 months (in 2011 inflation-adjusted dollars)"]

We now define a function to aggregate the columns defined by each list, values and errors, and also aggregate the census tracts to their corresponding community areas. 

We use the city's census tract boundaries for mapping to community area. 

In [27]:
# relative path, resolved against the current working directory
# (get_frames leaves it at the parent of the census folder)
census_tracts = gpd.read_file("Boundaries - Census Tracts - 2010.geojson")

In [28]:
census_tracts.head()

Unnamed: 0,statefp10,name10,commarea_n,namelsad10,commarea,geoid10,notes,tractce10,countyfp10,geometry
0,17,8424,44,Census Tract 8424,44,17031842400,,842400,31,(POLYGON ((-87.62404799998049 41.7302169999839...
1,17,8403,59,Census Tract 8403,59,17031840300,,840300,31,(POLYGON ((-87.6860799999848 41.82295600001154...
2,17,8411,34,Census Tract 8411,34,17031841100,,841100,31,(POLYGON ((-87.62934700001182 41.8527970000265...
3,17,8412,31,Census Tract 8412,31,17031841200,,841200,31,(POLYGON ((-87.68813499997718 41.8556909999909...
4,17,8382,28,Census Tract 8382,28,17031838200,,838200,31,(POLYGON ((-87.66781999997529 41.8741839999791...


In [29]:
# dictionary mapping tract id (geoid10) to community area (commarea);
# map_to_community looks tract ids up here after casting them with str()
tract_to_comm = census_tracts.set_index("geoid10")["commarea"].to_dict()

In [30]:
def map_to_community(x):
    ''' looks up the community area of tract id x (compared as a string) in tract_to_comm,
        falling back to '-1' for tracts that are not in the mapping (i.e. outside chicago)'''
    return tract_to_comm.get(str(x), '-1')

We follow the Census Bureau's guidelines [here](https://www2.census.gov/programs-surveys/acs/tech_docs/statistical_testing/2015StatisticalTesting5year.pdf) for estimating the standard error of aggregated estimates. In particular, for sums we have,

$$\text{SE}(A + B + \cdots) = \sqrt{\text{SE}(A)^2 + \text{SE}(B)^2 + \cdots}$$

where, 

$$\text{SE} (A) = \frac{\text{margin of error}}{1.645} $$

and, for calculating the standard error of some percentage $P = \frac{A}{B}$, where $A$ is a subset of $B$, we have, 

$$ \text{SE} (P) = \text{SE} \left (\frac{A}{B} \right ) = \frac{1}{B} \sqrt{\text{SE}(A)^2 - P^2 \cdot \text{SE}(B)^2}$$

In [31]:
def aggregate_vars_to_comm(df, values, errors, field_name, total = "Estimate; Total:", total_error = "Margin of Error; Total:"):
    ''' given a dataframe, a list of value columns and a list of error columns, 
        combines values and errors appropriately by column, aggregates tracts to community areas

        df:          tract-level table, indexed by tract id
        values:      estimate columns that are summed into the numerator A
        errors:      margin of error columns matching values
        field_name:  label appended to the two output column names
        total:       column holding the denominator B
        total_error: margin of error column of the denominator

        returns a dataframe indexed by community area with the estimated percentage A / B and
        its margin of error at 90% confidence, following the ACS approximation
        SE(P) = (1 / B) * sqrt(SE(A)^2 - P^2 * SE(B)^2) '''
    # census factor between a 90% margin of error and a standard error
    moe_factor = 1.645

    values_per_tract = df.loc[:, values].sum(axis = 1)
    # standard error of a sum: root of the summed squared standard errors
    error_per_tract = df.loc[:, errors].apply(lambda x: np.linalg.norm(x / moe_factor), axis = 1)

    total_per_tract = df.loc[:, total]
    total_error_per_tract = df.loc[:, total_error] / moe_factor

    df_per_tract = pd.concat([total_per_tract, values_per_tract, total_error_per_tract, error_per_tract], axis = 1)
    df_per_tract.columns = ["Total", "Value", "Total Error", "Value Error"]

    # column of community areas to group by for aggregation
    df_per_tract["Community Area"] = df_per_tract.index.map(map_to_community)
    df_community = df_per_tract.groupby("Community Area")

    value_sum = df_community["Value"].sum()
    total_sum = df_community["Total"].sum()
    percentage = value_sum / total_sum

    # first aggregate to community areas, then use suggested ACS approximation for estimating percentage error
    value_error_community = df_community["Value Error"].apply(lambda x: np.linalg.norm(x))
    total_error_community = df_community["Total Error"].apply(lambda x: np.linalg.norm(x))

    value_variance = np.square(value_error_community)
    ratio_term = np.square(percentage) * np.square(total_error_community)

    # whenever the square root would be negative, switch sign (add the second term instead)
    radicand = value_variance - ratio_term
    radicand = radicand.where(radicand >= 0, value_variance + ratio_term)

    # the formula scales by 1 / B, the aggregated total; the previous code divided by the
    # aggregated value A, which inflated the error of every percentage below 100%
    percentage_error = (1 / total_sum) * np.sqrt(radicand)

    # multiplying again by census factor to convert from standard error to margin of error (90% confidence)
    aggregated = pd.concat([percentage, percentage_error * moe_factor], axis = 1)
    aggregated.columns = ["Estimated Percentage", "Estimated Margin of Error for Percentage"]
    aggregated.columns = ["{} of {}".format(col, field_name) for col in aggregated.columns]

    return aggregated

And then the separate function for per capita income, which is nearly identical. Since income is not a subset of population, we can't use the same estimation formula as for percentages; instead we use the formula for ratios, in which the second term under the square root is added rather than subtracted:

$$ \text{SE} \left ( \frac{A}{B} \right )  = \frac{1}{B} \sqrt{ \text{SE}(A)^2 + \left ( \frac{A}{B} \right )^2 
\cdot \text{SE}(B)^2 } $$



In [34]:
def aggregate_income(acs, year):
    ''' aggregates tract-level aggregate income and population to community areas and
        returns the per capita income of every community area with its margin of error

        acs:  dictionary of dataframes for one survey year; uses the "income" table and the
              total population columns of the "age" table
        year: survey year, needed because the income column names embed it

        returns a dataframe indexed by community area with the estimated income per capita and
        its margin of error at 90% confidence, from the ratio approximation
        SE(A / B) = (1 / B) * sqrt(SE(A)^2 + (A / B)^2 * SE(B)^2) '''
    # census factor between a 90% margin of error and a standard error
    moe_factor = 1.645

    # income per census tract, margin of error
    income = ['Estimate; Aggregate income in the past 12 months (in {} inflation-adjusted dollars)'.format(year),
             'Margin of Error; Aggregate income in the past 12 months (in {} inflation-adjusted dollars)'.format(year)]

    income_per_tract = acs["income"].loc[:, income[0]]
    income_error = acs["income"].loc[:, income[1]]

    # population per census tract
    population = acs["age"].loc[:, "Estimate; Total:"]
    population_error = acs["age"].loc[:, "Margin of Error; Total:"]

    df_per_tract = pd.concat([income_per_tract, income_error, population, population_error], axis = 1)
    df_per_tract.columns = ["Income", "Income Error", "Population", "Population Error"]

    # there are tracts with 0 populations, have to set their values accordingly: every column
    # of such a tract becomes 0 (presumably their income fields are not numeric - TODO confirm)
    zero_population = df_per_tract["Population"] == 0
    df_per_tract.loc[zero_population, ["Population", "Population Error"]] = 0
    for column in ["Income", "Income Error"]:
        # go through object dtype so that 0 can be written next to text values, then cast;
        # np.int64 replaces the np.int alias that current numpy no longer provides
        cleaned = df_per_tract[column].astype(object)
        cleaned[zero_population] = 0
        df_per_tract[column] = cleaned.astype(np.int64)

    df_per_tract["Community Area"] = df_per_tract.index.map(map_to_community)
    df_community = df_per_tract.groupby("Community Area")

    population_sum = df_community["Population"].sum()
    income_per_capita = df_community["Income"].sum() / population_sum

    # standard errors of the community sums: root of the summed squared tract standard errors
    income_error = df_community["Income Error"].apply(lambda x: np.linalg.norm(x / moe_factor))
    population_error = df_community["Population Error"].apply(lambda x: np.linalg.norm(x / moe_factor))

    per_capita_error = (1 / population_sum) * np.sqrt(np.square(income_error) +
                                                      np.square(income_per_capita) * np.square(population_error))

    # convert the standard error back to a margin of error (90% confidence) so the value
    # matches its column label and the other aggregated variables
    aggregated = pd.concat([income_per_capita, per_capita_error * moe_factor], axis = 1)
    aggregated.columns = ["Estimated Income Per Capita", "Margin of Error for Income Per Capita"]

    return aggregated

In [37]:
# per variable: [value columns, error columns, label used in the output column names];
# each list is unpacked into aggregate_vars_to_comm, and income is left out because it is
# aggregated separately by aggregate_income
# NOTE: the name hardship_index is rebound to a dataframe further down in the notebook, so this
# cell has to be run again before the aggregation loop is re-run
hardship_index = {"age": [dependents, dependents_error, "Dependents (Under the Age of 18 or Over 64)"], 
                  "education": [education, education_error, "People Over 25 with Below High School Education"],
                  "poverty": [poverty, poverty_error, "Households with Income Below Poverty Line in Last 12 Months"],
                  "crowded_housing": [crowded, crowded_error, "Housing Units with More Than One Person Per Room"],
                  "unemployment": [unemployed, unemployed_error, "People Over the Age of 16 That Are Unemployed"]}

In [40]:
# build one table per survey year: an aggregated frame for every hardship variable,
# followed by per capita income, joined side by side on community area
acs_by_year = {2011: acs_2011, 2012: acs_2012, 2016: acs_2016}
hardship_by_year = {}

for year, acs in acs_by_year.items():
    pieces = [aggregate_vars_to_comm(acs[key], *hardship_index[key]) for key in hardship_index]
    pieces.append(aggregate_income(acs, year))
    hardship_by_year[year] = pd.concat(pieces, axis = 1)

hardship_2011 = hardship_by_year[2011]
hardship_2012 = hardship_by_year[2012]
hardship_2016 = hardship_by_year[2016]

The actual index is really just an equal average of each of the values, with each variable first standardized so as to be comparable and between 0 and 100. Here's a description from the appendix of "Divided They Fall: Hardship in America's Cities and Suburbs":

 $$ X = \left (\frac{Y- Y_{\text{min}}}{Y_{\text{max}} - Y_{\text{min}}} \right ) \cdot 100$$
where: 

$X$ = standardized value of component variable (for example, unemployment rate) for each city to be computed.

$Y$ = unstandardized value of component variable for each city. 

$Y_{\text{min}}$ = the minimum value for $Y$ across all cities.

$Y_{\text{max}}$ = the maximum value for $Y$ across all cities.

The ($Y_{\text{max}} - Y_{\text{min}}$ ) part of the formula was reversed to ($Y_{\text{min}} - Y_{\text{max}}$) for the calculation of Income Level so that the resulting ratio would be interpreted consistently with the other ratios — a higher value indicating higher hardship. The formula standardizes each of the component variables so that they are all given equal weight in the composite Intercity Hardship Index. The Index represents the average of the standardized ratios of all six component variables. The Intercity Hardship Index ranges from 0 to 100 with a higher number indicating greater hardship. 

In [42]:
def standardize(x):
    ''' min-max rescaling of x: 100 * (x - min) / (max - min) '''
    lowest = np.min(x)
    spread = np.max(x) - lowest
    return 100 * ((x - lowest) / spread)

# boolean mask over the table columns: True for estimates, False for margin of error columns
estimated_columns = ~hardship_2012.columns.str.contains("Margin")

In [43]:
def get_index(df, standardize = standardize, estimated_columns = estimated_columns):
    ''' computes the hardship index of every community area in df

        df:                table indexed by community area, as built for each survey year
        standardize:       function applied to each estimate column to rescale it
        estimated_columns: boolean mask selecting the estimate columns of df

        returns a series indexed by community area (as int, sorted) holding the mean of the
        standardized estimate columns '''
    # the '-1' row collects tracts outside chicago; it may be missing when every tract of the
    # input maps to a community area, so its absence is not treated as an error
    estimates = df.loc[:, estimated_columns].drop("-1", errors = "ignore")
    standardized = estimates.apply(standardize)

    # adjust estimated income to make number increase between 0 and 100 as income decreases (match hardship)
    standardized["Estimated Income Per Capita"] = 100 - standardized["Estimated Income Per Capita"]

    standardized.index = standardized.index.astype(int)
    return standardized.mean(axis = 1).sort_index()

In [45]:
# one column of hardship indices per survey year (this rebinds hardship_index, which held
# the variable specification dictionary until now)
hardship_index = pd.DataFrame()

for year, yearly_table in ((2011, hardship_2011), (2012, hardship_2012), (2016, hardship_2016)):
    hardship_index[year] = get_index(yearly_table)

#### Save Values for External Use

In [50]:
# census data from individual years, followed by the hardship indices for every year
exports = [(hardship_2011, "socio_2011.csv"),
           (hardship_2012, "socio_2012.csv"),
           (hardship_2016, "socio_2016.csv"),
           (hardship_index, "hardship_indices.csv")]

for table, filename in exports:
    table.to_csv(filename)