In [1]:
import pandas as pd
import requests



In [7]:
# Define USCB API key & variables of interest
apikey_USCB = 'b83c9e8cbf71b387db3681c39e0d7ef719f6344f'
variables_USCB = ['B19013_001E','B19301_001E','B23025_005E','B23025_003E','B19083_001E','B01003_001E','B01002_001E','B05002_013E','B25077_001E']

In [23]:
def acs_extractor(api_key, year, variables, state_code=None):
    '''
    Extract state-level acs data
    '''
    # Define API Base URL for ACS 1-Year Estimates (from https://api.census.gov/data.html)
    base_url = 'https://api.census.gov/data/{}/acs/acs1'.format(year)
    variables_str = ','.join(variables)
    
    # Define query params based on input
    if state_code is not None:
        params = {
            'get': variables_str,
            'for': f'state:{state_code}',
            'key': api_key
        }
    else:
        params = {
            'get': variables_str,
            'for': f'state:*',
            'key': api_key
        }

    # Make API request
    response = requests.get(base_url, params=params)
    # Store data into df if extraction is succesful
    if response.status_code == 200:
        data = response.json()
        # print(type(data)) #<class 'list'>; return data shows nested list
        df = pd.DataFrame(data[1:], columns=data[0])
    else:
        print(f"Error for year {year}: {response.status_code}")
        df = None

    return df

In [80]:
# Pull ACS data for 2017-2023
years = [2017,2018,2019, 2020, 2021,2022,2023]
acs_data = []

for year in years:
    df = acs_extractor(apikey_USCB,year,variables=variables_USCB,state_code=None)
    if df is not None:
        df['Year'] = year
        acs_data.append(df)
acs_data = pd.concat(acs_data)
# acs_data.to_pickle('assets/acs_data.pkl')
acs_data.to_csv('assets/acs_data.csv', index=False)

# acs_data = pd.read_pickle('assets/acs_data.pkl')
acs_data = pd.read_csv('assets/acs_data.csv')
acs_data.head()

Error for year 2020: 404


Unnamed: 0,B19013_001E,B19301_001E,B23025_005E,B23025_003E,B19083_001E,B01003_001E,B01002_001E,B05002_013E,B25077_001E,state,Year
0,43529,23121,92744,1319719,0.479,2984100,37.5,65718,120200,28,2017
1,53578,29438,141163,3061464,0.4618,6113532,38.5,257102,156700,29,2017
2,53386,29428,18844,538121,0.4539,1050493,40.0,23109,231300,30,2017
3,59970,30915,34967,1043919,0.4391,1920076,36.5,143331,155800,31,2017
4,58003,30166,90032,1514888,0.4606,2998039,38.0,596019,258200,32,2017


In [81]:
acs_data.columns

Index(['B19013_001E', 'B19301_001E', 'B23025_005E', 'B23025_003E',
       'B19083_001E', 'B01003_001E', 'B01002_001E', 'B05002_013E',
       'B25077_001E', 'state', 'Year'],
      dtype='object')

In [85]:
# Next, match state code and also the column names 
# - Find union of variables for all the want years

# Initialize an empty set to store the union of variables across years
union_of_variables = set()

for year in years:
    # Step 1: Fetch the variable descriptions (what each variable code means)
    url_variables = f"https://api.census.gov/data/{year}/acs/acs1/variables.json"
    response = requests.get(url_variables)
    if response.status_code == 200:
        # Parse the JSON response
        acs_variables = response.json()
        
        # Extract the variable codes (the dictionary keys)
        variable_codes = set(acs_variables['variables'].keys())
        
        # Take the union of the variables across years
        union_of_variables = union_of_variables.union(variable_codes)
        
        print(f"Processed variables for {year}")
    else:
        print(f"Failed to fetch variables for {year}. Status code: {response.status_code}")

# Convert the union of variables to a list and create a DataFrame
union_of_variables_df = pd.DataFrame(list(union_of_variables), columns=['Variable_Code'])

# Optionally, save this union of variables to a CSV file
union_of_variables_df.to_csv('union_of_variables_2017_2023.csv', index=False)

# Display the first few rows of the DataFrame
print(union_of_variables_df.head())

# Convert the 'variables' part of the JSON response into a DataFrame
acs_variables = pd.DataFrame.from_dict(acs_variables['variables'], orient='index')

# Optionally, you can save this DataFrame to a CSV for reference
acs_variables.rename(columns={'Unnamed: 0':'Code'}, inplace=True)
acs_variables.to_csv('assets/acs_variables.csv', index=True)

# Display the variable descriptions DataFrame
# This will contain the variable code, label, and other metadata
acs_variables = pd.read_csv('assets/acs_variables.csv', index_col=0)
acs_variables.head(10)

Processed variables for 2017
Processed variables for 2018
Processed variables for 2019
Failed to fetch variables for 2020. Status code: 404
Processed variables for 2021
Processed variables for 2022
Processed variables for 2023
  Variable_Code
0   B07204_017E
1   B04005_059E
2   C08134_026E
3  B17020A_001E
4  B24010C_009E


Unnamed: 0,label,concept,predicateType,group,limit,predicateOnly,hasGeoCollectionSupport,attributes,required
for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,True,,,
in,Census API FIPS 'in' clause,Census API Geography Specification,fips-in,,0,True,,,
ucgid,Uniform Census Geography Identifier clause,Census API Geography Specification,ucgid,,0,True,True,,
B24022_060E,Estimate!!Total:!!Female:!!Service occupations...,Sex by Occupation and Median Earnings in the P...,int,B24022,0,,,"B24022_060EA,B24022_060M,B24022_060MA",
B19001B_014E,"Estimate!!Total:!!$100,000 to $124,999",Household Income in the Past 12 Months (in 202...,int,B19001B,0,,,"B19001B_014EA,B19001B_014M,B19001B_014MA",
B07007PR_019E,Estimate!!Total:!!Moved from different municip...,Geographical Mobility in the Past Year by Citi...,int,B07007PR,0,,,"B07007PR_019EA,B07007PR_019M,B07007PR_019MA",
B19101A_004E,"Estimate!!Total:!!$15,000 to $19,999",Family Income in the Past 12 Months (in 2023 I...,int,B19101A,0,,,"B19101A_004EA,B19101A_004M,B19101A_004MA",
B24022_061E,Estimate!!Total:!!Female:!!Service occupations...,Sex by Occupation and Median Earnings in the P...,int,B24022,0,,,"B24022_061EA,B24022_061M,B24022_061MA",
B19001B_013E,"Estimate!!Total:!!$75,000 to $99,999",Household Income in the Past 12 Months (in 202...,int,B19001B,0,,,"B19001B_013EA,B19001B_013M,B19001B_013MA",
B07007PR_018E,Estimate!!Total:!!Moved from different municip...,Geographical Mobility in the Past Year by Citi...,int,B07007PR,0,,,"B07007PR_018EA,B07007PR_018M,B07007PR_018MA",


In [88]:
# filter only meta data for variables of interest
vars_present = [var for var in list(acs_data.columns) if (var not in ['state','Year']) & (var in acs_variables.index)]
vars_missing = [var for  var in list(acs_data.columns) if (var not in ['state','Year']) & (var not in acs_variables.index)]
vars_missing #['B23025_005E', 'B23025_003E', 'B19083_001E']
acs_variables = acs_variables.loc[vars_present]
code_mapper = acs_variables.iloc[:,1].to_dict()
code_mapper

{'B19013_001E': 'Median Household Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars)',
 'B19301_001E': 'Per Capita Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars)',
 'B23025_005E': 'Employment Status for the Population 16 Years and Over',
 'B23025_003E': 'Employment Status for the Population 16 Years and Over',
 'B19083_001E': 'Gini Index of Income Inequality',
 'B01003_001E': 'Total Population',
 'B01002_001E': 'Median Age by Sex',
 'B05002_013E': 'Place of Birth by Nativity and Citizenship Status',
 'B25077_001E': 'Median Value (Dollars)'}

In [90]:
acs_data.drop(columns=vars_missing, inplace=True)
acs_data.rename(columns=code_mapper, inplace=True)

In [91]:
acs_data


Unnamed: 0,Median Household Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars),Per Capita Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars),Employment Status for the Population 16 Years and Over,Employment Status for the Population 16 Years and Over.1,Gini Index of Income Inequality,Total Population,Median Age by Sex,Place of Birth by Nativity and Citizenship Status,Median Value (Dollars),state,Year
0,43529,23121,92744,1319719,0.4790,2984100,37.5,65718,120200,28,2017
1,53578,29438,141163,3061464,0.4618,6113532,38.5,257102,156700,29,2017
2,53386,29428,18844,538121,0.4539,1050493,40.0,23109,231300,30,2017
3,59970,30915,34967,1043919,0.4391,1920076,36.5,143331,155800,31,2017
4,58003,30166,90032,1514888,0.4606,2998039,38.0,596019,258200,32,2017
...,...,...,...,...,...,...,...,...,...,...,...
307,94605,52011,182384,4069534,0.4703,7812880,38.6,1213933,576000,53,2023
308,55948,32766,35853,793158,0.4679,1770071,42.8,32326,163700,54,2023
309,74631,41785,86959,3143490,0.4479,5910955,40.5,307899,272500,55,2023
310,72415,39966,9522,294568,0.4455,584057,39.3,21278,298700,56,2023


In [94]:
# map state code to state name
state_code_to_name = {
    '01': 'Alabama', '02': 'Alaska', '04': 'Arizona', '05': 'Arkansas', '06': 'California',
    '08': 'Colorado', '09': 'Connecticut', '10': 'Delaware', '11': 'District of Columbia',
    '12': 'Florida', '13': 'Georgia', '15': 'Hawaii', '16': 'Idaho', '17': 'Illinois',
    '18': 'Indiana', '19': 'Iowa', '20': 'Kansas', '21': 'Kentucky', '22': 'Louisiana',
    '23': 'Maine', '24': 'Maryland', '25': 'Massachusetts', '26': 'Michigan', '27': 'Minnesota',
    '28': 'Mississippi', '29': 'Missouri', '30': 'Montana', '31': 'Nebraska', '32': 'Nevada',
    '33': 'New Hampshire', '34': 'New Jersey', '35': 'New Mexico', '36': 'New York', '37': 'North Carolina',
    '38': 'North Dakota', '39': 'Ohio', '40': 'Oklahoma', '41': 'Oregon', '42': 'Pennsylvania',
    '44': 'Rhode Island', '45': 'South Carolina', '46': 'South Dakota', '47': 'Tennessee', 
    '48': 'Texas', '49': 'Utah', '50': 'Vermont', '51': 'Virginia', '53': 'Washington', 
    '54': 'West Virginia', '55': 'Wisconsin', '56': 'Wyoming'
}
acs_data.rename(columns={'state':'state_code'}, inplace=True)
acs_data['state_code'] = acs_data['state_code'].astype('str')
acs_data['state_name']=acs_data['state_code'].map(state_code_to_name)
acs_data

Unnamed: 0,Median Household Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars),Per Capita Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars),Employment Status for the Population 16 Years and Over,Employment Status for the Population 16 Years and Over.1,Gini Index of Income Inequality,Total Population,Median Age by Sex,Place of Birth by Nativity and Citizenship Status,Median Value (Dollars),state_code,Year,state_name
0,43529,23121,92744,1319719,0.4790,2984100,37.5,65718,120200,28,2017,Mississippi
1,53578,29438,141163,3061464,0.4618,6113532,38.5,257102,156700,29,2017,Missouri
2,53386,29428,18844,538121,0.4539,1050493,40.0,23109,231300,30,2017,Montana
3,59970,30915,34967,1043919,0.4391,1920076,36.5,143331,155800,31,2017,Nebraska
4,58003,30166,90032,1514888,0.4606,2998039,38.0,596019,258200,32,2017,Nevada
...,...,...,...,...,...,...,...,...,...,...,...,...
307,94605,52011,182384,4069534,0.4703,7812880,38.6,1213933,576000,53,2023,Washington
308,55948,32766,35853,793158,0.4679,1770071,42.8,32326,163700,54,2023,West Virginia
309,74631,41785,86959,3143490,0.4479,5910955,40.5,307899,272500,55,2023,Wisconsin
310,72415,39966,9522,294568,0.4455,584057,39.3,21278,298700,56,2023,Wyoming


In [95]:
acs_data.to_csv('assets/acs_data_final.csv', index=False)
acs_data = pd.read_csv('assets/acs_data_final.csv')

In [96]:
acs_data

Unnamed: 0,Median Household Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars),Per Capita Income in the Past 12 Months (in 2023 Inflation-Adjusted Dollars),Employment Status for the Population 16 Years and Over,Employment Status for the Population 16 Years and Over.1,Gini Index of Income Inequality,Total Population,Median Age by Sex,Place of Birth by Nativity and Citizenship Status,Median Value (Dollars),state_code,Year,state_name
0,43529,23121,92744,1319719,0.4790,2984100,37.5,65718,120200,28,2017,Mississippi
1,53578,29438,141163,3061464,0.4618,6113532,38.5,257102,156700,29,2017,Missouri
2,53386,29428,18844,538121,0.4539,1050493,40.0,23109,231300,30,2017,Montana
3,59970,30915,34967,1043919,0.4391,1920076,36.5,143331,155800,31,2017,Nebraska
4,58003,30166,90032,1514888,0.4606,2998039,38.0,596019,258200,32,2017,Nevada
...,...,...,...,...,...,...,...,...,...,...,...,...
307,94605,52011,182384,4069534,0.4703,7812880,38.6,1213933,576000,53,2023,Washington
308,55948,32766,35853,793158,0.4679,1770071,42.8,32326,163700,54,2023,West Virginia
309,74631,41785,86959,3143490,0.4479,5910955,40.5,307899,272500,55,2023,Wisconsin
310,72415,39966,9522,294568,0.4455,584057,39.3,21278,298700,56,2023,Wyoming
