In [1]:
# dependencies
import pandas as pd

# Extract, Transform, Load (ETL)

In [4]:
# Load the Food Access Research Atlas 2019 CSV from the S3 bucket into a DataFrame.
# A local copy ("FoodAccessResearchAtlasData2019.csv") can be read instead by swapping the path.
# CensusTract is forced to str so pandas does not parse the identifier as a number.
food_access_csv_url = "https://gtbootcamp20230221.s3.amazonaws.com/FoodAccessResearchAtlasData2019.csv"
food_access_raw_df = pd.read_csv(
    food_access_csv_url,
    dtype={"CensusTract": str},
)
food_access_raw_df.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,...,221.0,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,214.0,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,...,439.0,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,...,904.0,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,1126.0,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0


In [5]:
# Show every column name of the raw DataFrame as a plain Python list
food_access_raw_df.columns.tolist()

['CensusTract',
 'State',
 'County',
 'Urban',
 'Pop2010',
 'OHU2010',
 'GroupQuartersFlag',
 'NUMGQTRS',
 'PCTGQTRS',
 'LILATracts_1And10',
 'LILATracts_halfAnd10',
 'LILATracts_1And20',
 'LILATracts_Vehicle',
 'HUNVFlag',
 'LowIncomeTracts',
 'PovertyRate',
 'MedianFamilyIncome',
 'LA1and10',
 'LAhalfand10',
 'LA1and20',
 'LATracts_half',
 'LATracts1',
 'LATracts10',
 'LATracts20',
 'LATractsVehicle_20',
 'LAPOP1_10',
 'LAPOP05_10',
 'LAPOP1_20',
 'LALOWI1_10',
 'LALOWI05_10',
 'LALOWI1_20',
 'lapophalf',
 'lapophalfshare',
 'lalowihalf',
 'lalowihalfshare',
 'lakidshalf',
 'lakidshalfshare',
 'laseniorshalf',
 'laseniorshalfshare',
 'lawhitehalf',
 'lawhitehalfshare',
 'lablackhalf',
 'lablackhalfshare',
 'laasianhalf',
 'laasianhalfshare',
 'lanhopihalf',
 'lanhopihalfshare',
 'laaianhalf',
 'laaianhalfshare',
 'laomultirhalf',
 'laomultirhalfshare',
 'lahisphalf',
 'lahisphalfshare',
 'lahunvhalf',
 'lahunvhalfshare',
 'lasnaphalf',
 'lasnaphalfshare',
 'lapop1',
 'lapop1share',
 

In [7]:
# Derive "StateFIPS" from the first two characters of the "CensusTract" string
# NOTE(review): assumes CensusTract keeps its leading zero (e.g. "01..." for Alabama) - confirm
census_tract_ids = food_access_raw_df["CensusTract"]
food_access_raw_df["StateFIPS"] = census_tract_ids.str.slice(stop=2)
food_access_raw_df.head()


Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP,StateFIPS
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,...,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0,1
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0,1
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,...,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0,1
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,...,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0,1
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0,1


In [11]:
# Calculate percentage population of each group
# Each "<name>_PCT" column is the matching count column divided by a tract-level
# denominator, multiplied by 100 and rounded to 2 decimals.
population_count_cols = ["TractLOWI", "TractKids", "TractSeniors", "TractWhite",
                         "TractBlack", "TractAsian", "TractNHOPI", "TractAIAN",
                         "TractOMultir", "TractHispanic"]
housing_count_cols = ["TractHUNV", "TractSNAP"]

# Mask zero denominators with NaN first: pandas does not raise on division by zero,
# it returns inf (or NaN for 0/0), and inf values are NOT removed by dropna().
# With the mask, every zero-denominator row gets NaN percentages instead.
pop2010_nonzero = food_access_raw_df["Pop2010"].where(food_access_raw_df["Pop2010"] != 0)
ohu2010_nonzero = food_access_raw_df["OHU2010"].where(food_access_raw_df["OHU2010"] != 0)

# Groups measured against Pop2010 (row-wise division via axis=0)
food_access_raw_df[[f"{col}_PCT" for col in population_count_cols]] = (
    food_access_raw_df[population_count_cols]
    .div(pop2010_nonzero, axis=0)
    .mul(100)
    .round(2)
)

# Calculate percentage TractHUNV and TractSNAP group (measured against OHU2010)
food_access_raw_df[[f"{col}_PCT" for col in housing_count_cols]] = (
    food_access_raw_df[housing_count_cols]
    .div(ohu2010_nonzero, axis=0)
    .mul(100)
    .round(2)
)

# Display Dataframe
food_access_raw_df.head()

Unnamed: 0,CensusTract,StateFIPS,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,...,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT,TractHUNV_PCT,TractSNAP_PCT
0,1001020100,1,Alabama,Autauga County,1,1912,693,0,0.0,0.0,...,11.56,84.83,11.35,0.73,0.0,0.73,2.35,2.3,0.87,14.72
1,1001020200,1,Alabama,Autauga County,1,2170,743,0,181.0,8.34,...,9.86,40.92,56.08,0.23,0.0,0.23,2.53,3.46,11.98,21.0
2,1001020300,1,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,...,13.02,76.37,19.18,0.5,0.15,0.33,3.47,2.58,7.88,13.69
3,1001020400,1,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,...,20.61,93.16,4.4,0.41,0.09,0.25,1.69,1.94,1.22,5.69
4,1001020500,1,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,...,10.46,80.49,13.35,2.75,0.08,0.45,2.88,3.3,5.63,8.3


# 1st DataFrame (71,782 rows)

In [13]:
# Build the first DataFrame as a column subset of the raw data:
# identifier/context columns, then the group counts, then their percentage columns
first_food_access_columns = [
    "CensusTract", "StateFIPS", "State", "County", "Urban", "Pop2010", "OHU2010",
    "PovertyRate", "MedianFamilyIncome", "LAhalfand10",
    "TractLOWI", "TractKids", "TractSeniors", "TractWhite", "TractBlack", "TractAsian",
    "TractNHOPI", "TractAIAN", "TractOMultir", "TractHispanic", "TractHUNV", "TractSNAP",
    "TractLOWI_PCT", "TractKids_PCT", "TractSeniors_PCT", "TractWhite_PCT",
    "TractBlack_PCT", "TractAsian_PCT", "TractNHOPI_PCT", "TractAIAN_PCT",
    "TractOMultir_PCT", "TractHispanic_PCT", "TractHUNV_PCT", "TractSNAP_PCT",
]
first_food_access_df = food_access_raw_df[first_food_access_columns]
first_food_access_df.tail()

Unnamed: 0,CensusTract,StateFIPS,State,County,Urban,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,LAhalfand10,...,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT,TractHUNV_PCT,TractSNAP_PCT
72526,56043000200,56,Wyoming,Washakie County,0,3326,1317,9.7,67254.0,1,...,17.83,93.39,0.18,0.45,0.0,0.81,5.17,9.29,4.63,4.86
72527,56043000301,56,Wyoming,Washakie County,1,2665,1154,11.6,64152.0,1,...,14.97,89.19,0.19,0.86,0.0,1.5,8.26,16.74,7.63,3.55
72528,56043000302,56,Wyoming,Washakie County,1,2542,1021,16.3,69605.0,1,...,20.3,90.95,0.43,0.39,0.04,1.02,7.16,16.01,2.25,6.27
72529,56045951100,56,Wyoming,Weston County,0,3314,1322,17.5,74500.0,1,...,15.06,95.93,0.45,0.3,0.03,1.42,1.87,2.75,3.56,2.57
72530,56045951300,56,Wyoming,Weston County,1,3894,1699,17.3,76838.0,1,...,16.69,95.17,0.15,0.26,0.05,1.13,3.24,3.21,2.0,6.47


In [14]:
# Remove every row that has a NaN in any column, then renumber the index from 0
# (drop=True discards the old index instead of keeping it as a column)
first_food_access_df = first_food_access_df.dropna().reset_index(drop=True)

first_food_access_df.tail()

Unnamed: 0,CensusTract,StateFIPS,State,County,Urban,Pop2010,OHU2010,PovertyRate,MedianFamilyIncome,LAhalfand10,...,TractSeniors_PCT,TractWhite_PCT,TractBlack_PCT,TractAsian_PCT,TractNHOPI_PCT,TractAIAN_PCT,TractOMultir_PCT,TractHispanic_PCT,TractHUNV_PCT,TractSNAP_PCT
71777,56043000200,56,Wyoming,Washakie County,0,3326,1317,9.7,67254.0,1,...,17.83,93.39,0.18,0.45,0.0,0.81,5.17,9.29,4.63,4.86
71778,56043000301,56,Wyoming,Washakie County,1,2665,1154,11.6,64152.0,1,...,14.97,89.19,0.19,0.86,0.0,1.5,8.26,16.74,7.63,3.55
71779,56043000302,56,Wyoming,Washakie County,1,2542,1021,16.3,69605.0,1,...,20.3,90.95,0.43,0.39,0.04,1.02,7.16,16.01,2.25,6.27
71780,56045951100,56,Wyoming,Weston County,0,3314,1322,17.5,74500.0,1,...,15.06,95.93,0.45,0.3,0.03,1.42,1.87,2.75,3.56,2.57
71781,56045951300,56,Wyoming,Weston County,1,3894,1699,17.3,76838.0,1,...,16.69,95.17,0.15,0.26,0.05,1.13,3.24,3.21,2.0,6.47


In [15]:
# Get dataframe information: prints the index range, each column's non-null count
# and dtype, which makes it easy to check that no missing values remain
first_food_access_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71782 entries, 0 to 71781
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CensusTract         71782 non-null  object 
 1   StateFIPS           71782 non-null  object 
 2   State               71782 non-null  object 
 3   County              71782 non-null  object 
 4   Urban               71782 non-null  int64  
 5   Pop2010             71782 non-null  int64  
 6   OHU2010             71782 non-null  int64  
 7   PovertyRate         71782 non-null  float64
 8   MedianFamilyIncome  71782 non-null  float64
 9   LAhalfand10         71782 non-null  int64  
 10  TractLOWI           71782 non-null  float64
 11  TractKids           71782 non-null  float64
 12  TractSeniors        71782 non-null  float64
 13  TractWhite          71782 non-null  float64
 14  TractBlack          71782 non-null  float64
 15  TractAsian          71782 non-null  float64
 16  Trac

In [None]:
# save to cloud 