In [1]:
# dependencies
import pandas as pd

# National & State Summary - ETL

### Data to populate 5 charts comparing sociodemographic factors between US Census Tracts that are food insecure or not based on the LAhalfand10 flag: 
1. % low income households
2. % non-productive age groups (minors & seniors) by population
3. % race/ethnicity distribution by population
4. % households HUNV
5. % SNAP housing units

[documentation](https://www.ers.usda.gov/data-products/food-access-research-atlas/documentation/)


In [3]:
# Read the food access data from the S3 bucket into a DataFrame
df = pd.read_csv("https://gtbootcamp20230221.s3.amazonaws.com/FoodAccessResearchAtlasData2019.csv", dtype={'CensusTract': str})
df.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,...,221.0,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,214.0,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,...,439.0,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,...,904.0,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,1126.0,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0


In [4]:
# create state FIP column for mapping
df['StateFIPS'] = df['CensusTract'].str.slice(0,2)
df.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP,StateFIPS
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,...,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0,1
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0,1
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,...,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0,1
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,...,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0,1
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0,1


In [36]:
# show names of columns
print(df.columns.tolist())

['CensusTract', 'State', 'County', 'Urban', 'Pop2010', 'OHU2010', 'GroupQuartersFlag', 'NUMGQTRS', 'PCTGQTRS', 'LILATracts_1And10', 'LILATracts_halfAnd10', 'LILATracts_1And20', 'LILATracts_Vehicle', 'HUNVFlag', 'LowIncomeTracts', 'PovertyRate', 'MedianFamilyIncome', 'LA1and10', 'LAhalfand10', 'LA1and20', 'LATracts_half', 'LATracts1', 'LATracts10', 'LATracts20', 'LATractsVehicle_20', 'LAPOP1_10', 'LAPOP05_10', 'LAPOP1_20', 'LALOWI1_10', 'LALOWI05_10', 'LALOWI1_20', 'lapophalf', 'lapophalfshare', 'lalowihalf', 'lalowihalfshare', 'lakidshalf', 'lakidshalfshare', 'laseniorshalf', 'laseniorshalfshare', 'lawhitehalf', 'lawhitehalfshare', 'lablackhalf', 'lablackhalfshare', 'laasianhalf', 'laasianhalfshare', 'lanhopihalf', 'lanhopihalfshare', 'laaianhalf', 'laaianhalfshare', 'laomultirhalf', 'laomultirhalfshare', 'lahisphalf', 'lahisphalfshare', 'lahunvhalf', 'lahunvhalfshare', 'lasnaphalf', 'lasnaphalfshare', 'lapop1', 'lapop1share', 'lalowi1', 'lalowi1share', 'lakids1', 'lakids1share', 'lase

In [56]:
# summarize total number of population or households in selected features by STATE
############ note to team: I did the summary for the first two charts only. 
############ Need to sum the other raw data fields that being with Tract...
state_summary_df = df.groupby(['State','LAhalfand10']).agg({
    'Pop2010': 'sum',
    'OHU2010': 'sum',
    'TractLOWI': 'sum',
    'TractKids': 'sum',
    'TractSeniors': 'sum'
})
# reset index; transform State and LAhalfand10 from index fields to columns
state_summary_df.reset_index(inplace=True)

# change column name
state_summary_df.rename(columns={'State':'Jurisdiction'}, inplace=True)

# display dataframe
state_summary_df

Unnamed: 0,Jurisdiction,LAhalfand10,Pop2010,OHU2010,TractLOWI,TractKids,TractSeniors
0,Alabama,0,2011849,781527,750829.0,477968.0,285743.0
1,Alabama,1,2767887,1102264,1042096.0,654491.0,372049.0
2,Alaska,0,186563,67331,42372.0,46209.0,13900.0
3,Alaska,1,523668,190727,134581.0,138068.0,40634.0
4,Arizona,0,857778,321996,346581.0,209192.0,117376.0
...,...,...,...,...,...,...,...
97,West Virginia,1,921047,386789,345703.0,189943.0,150293.0
98,Wisconsin,0,1953902,765486,532677.0,460000.0,267095.0
99,Wisconsin,1,3733084,1514282,1059719.0,879492.0,510219.0
100,Wyoming,0,137770,52261,39444.0,36212.0,15520.0


In [50]:
# summarize total number of population or households in selected features for US
############ note to team: I did the summary for the first two charts only. 
############ Need to sum the other raw data fields that being with Tract...
us_summary_df = df.groupby(['LAhalfand10']).agg({
    'Pop2010': 'sum',
    'OHU2010': 'sum',
    'TractLOWI': 'sum',
    'TractKids': 'sum',
    'TractSeniors': 'sum'
})
# reset index; transform LAhalfand10 from index field to column
us_summary_df.reset_index(inplace=True)

# add Jurisdiction column
us_summary_df['Jurisdiction'] = 'United States'

# display data
us_summary_df


Unnamed: 0,LAhalfand10,Pop2010,OHU2010,TractLOWI,TractKids,TractSeniors,Jurisdiction
0,0,91132485,34733952,31407468.0,21215438.0,12063938.0,United States
1,1,217613053,81982340,69046369.0,52957586.0,28202843.0,United States


In [55]:
# concatenate state and US total data
summary_df = pd.concat([state_summary_df,us_summary_df])
summary_df.reset_index(inplace=True, drop=True)
summary_df.tail()

Unnamed: 0,Jurisdiction,LAhalfand10,Pop2010,OHU2010,TractLOWI,TractKids,TractSeniors
99,Wisconsin,1,3733084,1514282,1059719.0,879492.0,510219.0
100,Wyoming,0,137770,52261,39444.0,36212.0,15520.0
101,Wyoming,1,425856,174618,117110.0,99190.0,54570.0
102,United States,0,91132485,34733952,31407468.0,21215438.0,12063938.0
103,United States,1,217613053,81982340,69046369.0,52957586.0,28202843.0


## Calculate percentages from totals and add to summary_df
1. % low income households 
    - TractLOWI/OHU2010
2. % non-productive age groups (minors & seniors) by population
    - TractKids/Pop2010
    - TractSeniors/Pop2010
3. % race/ethnicity distribution by population
    - TractWhite/Pop2010
    - TractBlack/Pop2010
    - TractAsian/Pop2010
    - TractNHOPI/Pop2010
    - TractAIAN/Pop2010
    - TractOMultir/Pop2010
    - TractHispanic/Pop2010
4. % households HUNV
    - TractHUNV/OHU2010
5. % SNAP housing units
    - TractSNAP/OHU2010

In [58]:
# Low income household
summary_df['LOWI_per'] = summary_df['TractLOWI']/summary_df['OHU2010']
summary_df = summary_df.sort_values('LOWI_per', ascending=False)

summary_df

########### not sure why some of the percentages are above 100% in the LOWI_per column.
########### is it possible the denom needs to be the population and not the household?
########### suggest someone read the data dictionary/documentation to figure this out.

Unnamed: 0,Jurisdiction,LAhalfand10,Pop2010,OHU2010,TractLOWI,TractKids,TractSeniors,LOWI_per
56,Nevada,0,513082,187861,217782.0,130994.0,60235.0,1.159272
49,Mississippi,1,1951759,735055,836101.0,498648.0,246498.0,1.137467
8,California,0,10178334,3531358,3955197.0,2412064.0,1110581.0,1.120022
86,Texas,0,5011809,1788621,1950025.0,1332143.0,583488.0,1.090239
87,Texas,1,20133752,7134312,7692093.0,5533681.0,2018398.0,1.078183
...,...,...,...,...,...,...,...,...
43,Massachusetts,1,4974468,1921702,1064471.0,1104275.0,716655.0,0.553921
13,Connecticut,1,2770589,1064989,589524.0,626041.0,406291.0,0.553549
61,New Jersey,1,6419693,2361138,1237098.0,1501606.0,921596.0,0.523941
59,New Hampshire,1,718473,284043,144732.0,156100.0,97091.0,0.509543
