In [9]:
import numpy as np
import pandas as pd
import re
import folktables
from folktables import ACSDataSource, ACSIncome, ACSEmployment, ACSHealthInsurance, ACSPublicCoverage

# US Census 2018

In [2]:
# Data source for the 2018 1-Year ACS person-level survey.
src = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)

# Definition table for that survey; the cells below use it to derive readable
# column labels and category values.
defs = src.get_definitions()

In [3]:
# Build `cols`: a mapping from ACS variable code to a readable column label,
# used by the cells below to rename the exported frames.
#
# `defs` is addressed positionally: rows are grouped on column 1 (one row per
# variable code, keeping the first entry of each group); after reset_index the
# code sits at position 0 and the label text is taken from position 4.
label_table = defs.groupby(1).first().reset_index().iloc[:, [0, 4]]
label_table.columns = ['code', 'label']

# Cut each label at the first run of spaces that is followed by "(" or ",".
# The pattern is a raw string: in a plain literal `\(` and `\,` are invalid
# escape sequences, which Python reports with a warning. The compiled regex is
# character-for-character the same as before.
qualifier = re.compile(r'[ ]+[\(\,].*')
label_table['label'] = label_table['label'].apply(lambda text: qualifier.sub('', text))

# Hand-picked short names take precedence over the labels derived from `defs`.
cols = dict(zip(label_table['code'], label_table['label'])) | {
    'AGEP': 'Age',
    'COW': 'Workclass',
    'SCHL': 'Education',
    'MAR': 'Marital Status',
    'OCCP': 'Occupation',
    'WKHP': 'Hours per Week',
    'SEX': 'Gender',
    'RAC1P': 'Race',
    'PINCP': 'Income',
}

## Income

In [5]:
# Build the ACSIncome table for every entry in folktables.state_list, prepend a
# "State" column, rename columns through `cols`, and write the result to parquet.
# Relies on `src`, `defs` and `cols` from the cells above.

# The category mapping is built from the task's feature list and the definition
# table only, so it is computed once rather than on every loop iteration.
cats = folktables.generate_categories(features=ACSIncome.features, definition_df=defs)

dfs = []
for state in folktables.state_list:
    data = src.get_data(states=[state])
    X_state, y_state, _ = ACSIncome.df_to_pandas(data, categories=cats)
    # Features and target side by side, tagged with the state they were loaded for.
    df_state = pd.concat([X_state, y_state], axis=1)
    df_state.insert(0, "State", state)
    dfs.append(df_state)

# Stack all per-state frames and discard the per-state row indices.
df = pd.concat(dfs, axis=0).reset_index(drop=True)
df = df.rename(columns=cols)
df.to_parquet('acs-income-2018.parquet', index=False)
df

Unnamed: 0,State,Age,Workclass,Education,Marital Status,Occupation,Place of birth,Relationship,Hours per Week,Gender,Race,Income
0,AL,18,Employee of a private for-profit company or bu...,"Some college, but less than 1 year",Never married or under 15 years old,SAL-Cashiers,Georgia/GA,Noninstitutionalized group quarters population,21.0,Female,Black or African American alone,False
1,AL,53,Federal government employee,GED or alternative credential,Never married or under 15 years old,HLS-Orderlies and Psychiatric Aides,Indiana/IN,Institutionalized group quarters population,40.0,Male,White alone,False
2,AL,41,Employee of a private for-profit company or bu...,Regular high school diploma,Never married or under 15 years old,RPR-Industrial And Refractory Machinery Mechanics,Alabama/AL,Noninstitutionalized group quarters population,40.0,Male,White alone,False
3,AL,18,Self-employed in own not incorporated business...,"Some college, but less than 1 year",Never married or under 15 years old,ENT-Coaches and Scouts,Alabama/AL,Noninstitutionalized group quarters population,2.0,Female,White alone,False
4,AL,21,Federal government employee,"1 or more years of college credit, no degree",Never married or under 15 years old,PRT-Police Officers,Florida/FL,Noninstitutionalized group quarters population,50.0,Male,White alone,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1664495,PR,39,Self-employed in own not incorporated business...,Regular high school diploma,Never married or under 15 years old,CON-Construction Laborers,Puerto Rico,Reference person,20.0,Male,White alone,False
1664496,PR,38,Self-employed in own not incorporated business...,Grade 11,Never married or under 15 years old,CLN-Landscaping And Groundskeeping Workers,Puerto Rico,Reference person,32.0,Male,Some Other Race alone,False
1664497,PR,37,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Divorced,PRD-Other Assemblers And Fabricators,Illinois/IL,Unmarried partner,40.0,Female,Two or More Races,False
1664498,PR,47,Employee of a private for-profit company or bu...,Regular high school diploma,Married,"PRD-Miscellaneous Production Workers, Includin...",Puerto Rico,Husband/wife,40.0,Male,Some Other Race alone,False


## Employment

In [7]:
# Build the ACSEmployment table for every entry in folktables.state_list, prepend
# a "State" column, rename columns through `cols`, and write the result to parquet.
# Reuses `src` and `defs` from the setup cell (same survey year, horizon and
# survey) instead of constructing them again, and `cols` from the label cell.

# The category mapping is built from the task's feature list and the definition
# table only, so it is computed once rather than on every loop iteration.
cats = folktables.generate_categories(features=ACSEmployment.features, definition_df=defs)

dfs = []
for state in folktables.state_list:
    data = src.get_data(states=[state])
    X_state, y_state, _ = ACSEmployment.df_to_pandas(data, categories=cats)
    # Features and target side by side, tagged with the state they were loaded for.
    df_state = pd.concat([X_state, y_state], axis=1)
    df_state.insert(0, "State", state)
    dfs.append(df_state)

# Stack all per-state frames and discard the per-state row indices.
df = pd.concat(dfs, axis=0).reset_index(drop=True)
df = df.rename(columns=cols)
df.to_parquet('acs-employment-2018.parquet', index=False)
df

Unnamed: 0,State,Age,Education,Marital Status,Relationship,Disability recode,Employment status of parents,Citizenship status,Mobility status,Military service,Ancestry recode,Nativity,Hearing difficulty,Vision difficulty,Cognitive difficulty,Gender,Race,Employment status recode
0,AL,19,"Some college, but less than 1 year",Never married or under 15 years old,Noninstitutionalized group quarters population,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"No, different house in US or Puerto Rico",Never served in the military,Single,Native,No,No,No,Female,White alone,False
1,AL,18,"Some college, but less than 1 year",Never married or under 15 years old,Noninstitutionalized group quarters population,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"No, different house in US or Puerto Rico",Never served in the military,Single,Native,No,No,No,Female,Black or African American alone,False
2,AL,53,GED or alternative credential,Never married or under 15 years old,Institutionalized group quarters population,With a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"Yes, same house (nonmovers)",Never served in the military,Multiple,Native,No,No,Yes,Male,White alone,False
3,AL,28,"1 or more years of college credit, no degree",Never married or under 15 years old,Institutionalized group quarters population,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"Yes, same house (nonmovers)","On active duty in the past, but not now",Single,Native,No,No,No,Male,White alone,False
4,AL,25,Grade 9,Never married or under 15 years old,Institutionalized group quarters population,With a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"No, different house in US or Puerto Rico",Never served in the military,Single,Native,No,No,Yes,Female,White alone,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3236102,PR,47,Regular high school diploma,Married,Husband/wife,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,Single,Native,No,No,No,Male,Some Other Race alone,True
3236103,PR,18,"1 or more years of college credit, no degree",Never married or under 15 years old,Biological son or daughter,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,Single,Native,No,No,No,Female,Some Other Race alone,False
3236104,PR,57,Associate's degree,Married,Reference person,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,Single,Native,No,No,No,Female,White alone,False
3236105,PR,66,Grade 9,Married,Husband/wife,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,Single,Native,No,No,No,Male,White alone,False


## Public Coverage

In [11]:
# Build the ACSPublicCoverage table for every entry in folktables.state_list,
# prepend a "State" column, rename columns through `cols`, and write the result
# to parquet.
# Reuses `src` and `defs` from the setup cell (same survey year, horizon and
# survey) instead of constructing them again, and `cols` from the label cell.

# The category mapping is built from the task's feature list and the definition
# table only, so it is computed once rather than on every loop iteration.
cats = folktables.generate_categories(features=ACSPublicCoverage.features, definition_df=defs)

dfs = []
for state in folktables.state_list:
    data = src.get_data(states=[state])
    X_state, y_state, _ = ACSPublicCoverage.df_to_pandas(data, categories=cats)
    # Features and target side by side, tagged with the state they were loaded for.
    df_state = pd.concat([X_state, y_state], axis=1)
    # Capitalised "State" so this export uses the same column name as the
    # income and employment exports (this cell previously wrote "state").
    df_state.insert(0, "State", state)
    dfs.append(df_state)

# Stack all per-state frames and discard the per-state row indices.
df = pd.concat(dfs, axis=0).reset_index(drop=True)
df = df.rename(columns=cols)
df.to_parquet('acs-publiccoverage-2018.parquet', index=False)
df

Unnamed: 0,state,Age,Education,Marital Status,Gender,Disability recode,Employment status of parents,Citizenship status,Mobility status,Military service,...,Nativity,Hearing difficulty,Vision difficulty,Cognitive difficulty,Income,Employment status recode,State Code based on 2010 Census definitions,Gave birth to child within the past 12 months,Race,Public health coverage recode
0,AL,19,"Some college, but less than 1 year",Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"No, different house in US or Puerto Rico",Never served in the military,...,Native,No,No,No,-1500.0,Not in labor force,Alabama/AL,No,White alone,False
1,AL,18,"Some college, but less than 1 year",Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"No, different house in US or Puerto Rico",Never served in the military,...,Native,No,No,No,1600.0,Not in labor force,Alabama/AL,No,Black or African American alone,False
2,AL,53,GED or alternative credential,Never married or under 15 years old,Male,With a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"Yes, same house (nonmovers)",Never served in the military,...,Native,No,No,Yes,10000.0,Not in labor force,Alabama/AL,N/A (less than 15 years/greater than 50 years/...,White alone,False
3,AL,28,"1 or more years of college credit, no degree",Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"Yes, same house (nonmovers)","On active duty in the past, but not now",...,Native,No,No,No,0.0,Not in labor force,Alabama/AL,N/A (less than 15 years/greater than 50 years/...,White alone,False
4,AL,25,Grade 9,Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"No, different house in US or Puerto Rico",Never served in the military,...,Native,No,No,Yes,0.0,Not in labor force,Alabama/AL,No,White alone,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138284,PR,47,Bachelor's degree,Married,Female,Without a disability,"N/A (not own child of householder, and not chi...",Born in the U.S.,"Yes, same house (nonmovers)",Never served in the military,...,Native,No,No,No,16400.0,Not in labor force,Puerto Rico/PR,No,Some Other Race alone,False
1138285,PR,47,Regular high school diploma,Married,Male,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,...,Native,No,No,No,18700.0,"Civilian employed, at work",Puerto Rico/PR,N/A (less than 15 years/greater than 50 years/...,Some Other Race alone,False
1138286,PR,18,"1 or more years of college credit, no degree",Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,...,Native,No,No,No,0.0,Not in labor force,Puerto Rico/PR,No,Some Other Race alone,False
1138287,PR,57,Associate's degree,Married,Female,Without a disability,"N/A (not own child of householder, and not chi...","Born in Puerto Rico, Guam, the U.S. Virgin Isl...","Yes, same house (nonmovers)",Never served in the military,...,Native,No,No,No,11100.0,Not in labor force,Puerto Rico/PR,N/A (less than 15 years/greater than 50 years/...,White alone,True
