In [20]:
import pandas as pd
import numpy as np
from folktables import ACSDataSource, ACSEmployment, ACSIncomePovertyRatio, ACSMobility, ACSIncome, ACSHealthInsurance, ACSPublicCoverage, ACSTravelTime, generate_categories
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import os


In [2]:
# Relative location of the CSV file that the following cell reads with pandas.
path = "./data/2018/1-Year/psam_p04.csv"

In [3]:
# Load the CSV located at `path` into a DataFrame.
# NOTE(review): `df` is rebound by the dataset-building cell further down and
# nothing visible reads this frame in between -- confirm this load is still needed.
df = pd.read_csv(filepath_or_buffer=path)

In [48]:
# Build and save the reference dataset for one folktables ACS task:
#   1. load the 2018 1-Year person survey for the western states,
#   2. keep only the rows whose group column is in `groups_to_keep`,
#   3. standard-scale the non-categorical features,
#   4. one-hot encode the categorical features,
#   5. integer-code the group and label frames,
#   6. write the combined frame to '<task_name>/dataset_ref.csv'.
data_source = ACSDataSource(
    survey_year='2018', horizon='1-Year', survey='person')

west_states = ["CA", "OR", "WA", "NV", "AZ"]
# `east_states` is defined here but not used anywhere in this cell.
east_states = ['ME', 'NH', 'MA', 'RI', 'CT', 'NY',
                'NJ', 'DE', 'MD', 'VA', 'NC', 'SC', 'GA', 'FL']
acs_data = data_source.get_data(states=west_states, download=True)

# Three parallel lists: output folder name, per-task value (unpacked as `seed`
# below) and folktables task object. Entry i of each list belongs together.
# NOTE(review): 'acs_west_poverty' / ACSIncomePovertyRatio / 128 appear at both
# index 0 and index 6 -- confirm the duplicate is intentional.
acss = ['acs_west_poverty', 'acs_west_mobility', 'acs_west_income', 'acs_west_insurance',
        'acs_west_public', 'acs_west_travel', 'acs_west_poverty', 'acs_west_employment']
param = [128, 4, 8, 16, 32, 64, 128, 256]
folks = [ACSIncomePovertyRatio, ACSMobility, ACSIncome, ACSHealthInsurance,
            ACSPublicCoverage, ACSTravelTime, ACSIncomePovertyRatio, ACSEmployment]

# Single switch for which task to build; indexes all three lists above.
TASK_INDEX = 0
acs_task, task_name, seed = folks[TASK_INDEX], acss[TASK_INDEX], param[TASK_INDEX]
# `seed` and `target_var` are assigned for later use; this cell does not read them.
group_var = acs_task.group
target_var = acs_task.target

# Restrict the raw survey rows to the selected values of the group column.
# presumably these are codes of the task's group attribute -- TODO confirm meaning.
groups_to_keep = [1, 2]
acs_data = acs_data.loc[acs_data[group_var].isin(groups_to_keep)]
dataX, dataY, dataA = acs_task.df_to_pandas(acs_data)

# Categorical features: the keys of the mapping returned by generate_categories.
definition_df = data_source.get_definitions(download=True)
categories = generate_categories(features=acs_task.features, definition_df=definition_df)
categories_cols = categories.keys()

# Non-categorical features: every remaining column of dataX.
non_categorical_cols = [col for col in dataX.columns if col not in categories_cols]
# Standard-scale the non-categorical columns. The guard avoids calling the
# scaler on an empty column selection, which it cannot fit.
pipe = Pipeline([('scaler', StandardScaler())])
if non_categorical_cols:
    dataX[non_categorical_cols] = dataX[non_categorical_cols].astype(float)
    dataX[non_categorical_cols] = pipe.fit_transform(dataX[non_categorical_cols])

# One-hot encode the categorical columns. The keys view is materialized as a
# list so that get_dummies receives a plain list-like of column labels.
dataX = pd.get_dummies(dataX, columns=list(categories_cols))

# Remember the original column labels of the label and group frames.
columnsY = dataY.columns
columnsA = dataA.columns
# Replace every column of dataA / dataY by its integer category codes.
dataA = dataA.apply(lambda x: x.astype('category').cat.codes)
dataY = dataY.apply(lambda x: x.astype('category').cat.codes)

# `create_dataset_ref` is not defined in this cell; it has to exist in the
# kernel namespace already (defined or imported by another cell).
df = create_dataset_ref(dataX, dataA, dataY)

# Create the per-task output folder if needed. exist_ok=True makes this a
# single call with no check-then-create race, and safe to re-run.
os.makedirs(task_name, exist_ok=True)
path = '{}/dataset_ref.csv'.format(task_name)
df.to_csv(path, index=False)

{'SCHL': {1.0: 'No schooling completed', 2.0: 'Nursery school, preschool', 3.0: 'Kindergarten', 4.0: 'Grade 1', 5.0: 'Grade 2', 6.0: 'Grade 3', 7.0: 'Grade 4', 8.0: 'Grade 5', 9.0: 'Grade 6', 10.0: 'Grade 7', 11.0: 'Grade 8', 12.0: 'Grade 9', 13.0: 'Grade 10', 14.0: 'Grade 11', 15.0: '12th grade - no diploma', 16.0: 'Regular high school diploma', 17.0: 'GED or alternative credential', 18.0: 'Some college, but less than 1 year', 19.0: '1 or more years of college credit, no degree', 20.0: "Associate's degree", 21.0: "Bachelor's degree", 22.0: "Master's degree", 23.0: "Professional degree beyond a bachelor's degree", 24.0: 'Doctorate degree', nan: 'N/A (less than 3 years old)'}, 'MAR': {1: 'Married', 2: 'Widowed', 3: 'Divorced', 4: 'Separated', 5: 'Never married or under 15 years old', nan: 'N/A'}, 'SEX': {1: 'Male', 2: 'Female', nan: 'N/A'}, 'DIS': {1: 'With a disability', 2: 'Without a disability', nan: 'N/A'}, 'ESP': {1.0: 'Both parents in labor force', 2.0: 'Father only in labor force