In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from folktables import ACSDataSource, ACSEmployment, ACSIncome, ACSPublicCoverage, ACSMobility, ACSTravelTime
import os

pd.set_option('display.max_columns', None)  

### ACS Datasets

In [4]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["NY"], download=True)

folktables = {
    "ACSEmployment": ACSEmployment,
    "ACSIncome": ACSIncome,
    "ACSMobility": ACSMobility,
    "ACSPublicCoverage": ACSPublicCoverage,
    "ACSTravelTime": ACSTravelTime
}

#race_agg_names = {1: 'White',
#                  2: 'Black or African American alone',
#                  3: 'Asian alone',
#                  4: 'Other'}

RAC1P_mapper = {1:1,
                2:2,
                3:4,
                4:4,
                5:4,
                6:3,
                7:4,
                8:4,
                9:4}

#SEX_mapper = {1: 1,
#              2: 2}

#HISP_mapper = {x: 1 if x == 0 else 2 for x in range(0, 24)}

# New codes:
# 1 - White alone
# 2 - Black or African American alone
# 3 - Asian alone
# 4 - Other

for name in list(folktables.keys()):
    df = None
    
    # Add HISP
    folktables[name].features.append('HISP')
    
    features, label, group = folktables[name].df_to_numpy(acs_data)
    feature_names = folktables[name].features
    df = pd.DataFrame(features, columns = feature_names)
    df['RAC1P_recoded'] = df['RAC1P'].map(RAC1P_mapper)
    df['label'] = label

    outdir = f'matrices/{name}'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
        #os.chmod(outdir,rwx)

    # Save full datasets
    X = df.drop(columns=['label'])
    X.to_csv(f'{outdir}/X.csv',index=False)

    y = df['label'].apply(lambda x: 1 if x else 0)
    y.to_csv(f'{outdir}/y.csv',index=False)

    # Save sample datasets
    _ , dfs = train_test_split(df, test_size=0.05,random_state=42)
    dfs.reset_index(inplace=True,drop=True)
    
    X = dfs.drop(columns=['label'])
    X.to_csv(f'{outdir}/Xs.csv',index=False)

    y = dfs['label'].apply(lambda x: 1 if x else 0)
    y.to_csv(f'{outdir}/ys.csv',index=False)


### Portuguese Students

In [5]:
students_df_raw = pd.read_csv('data/students.csv',delimiter=',')
students_df = students_df_raw.copy()

# Pre-processing: make target, code sex, address, parents education
students_df['label'] = students_df['G3'].apply(lambda x: 1 if x < 10 else 0)
students_df['sex'] = students_df['sex'].apply(lambda x: 1 if x == 'F' else 0)
students_df['address'] = students_df['address'].apply(lambda x: 1 if x == 'R' else 0)

# Mapping for Education
# 1: Other
# 2: High school
# 3: University or greater
students_df['parents_education'] = students_df.apply(lambda row: max(row.Medu,row.Fedu), axis=1)

# From the Portuguese students dataset:
# (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education)

students_education_recoder = {
    0: 0,
    1: 0,
    2: 0,
    3: 1,
    4: 2,
}

students_df['parents_education'] = students_df['parents_education'].map(students_education_recoder)

def recode_to_binary(x):
    if x == 'yes':
        return 1
    elif x == 'no':
        return 0
    else:
        return x

students_df = students_df.applymap(recode_to_binary)

one_hot_variables = ['Subject','school','famsize','Pstatus','Mjob','Fjob','reason','guardian']
for v in one_hot_variables:
    temp = None # This is likely useless, but I have a reason for keeping it... it will remain a mystery to you, the reader of this code
    temp = pd.get_dummies(students_df[v])
    students_df = pd.merge(students_df,temp.add_suffix(f'_{v}'), how='left',left_index=True, right_index=True)
    students_df.drop(columns=[v],inplace=True)

students_df.drop(columns=['ID'],inplace=True)

outdir = f'matrices/students'
if not os.path.exists(outdir):
    os.makedirs(outdir)
    #os.chmod(outdir,rwx)

X = students_df.drop(columns=['label'])
X.to_csv(f'{outdir}/X.csv',index=False)

y = students_df['label'].apply(lambda x: 1 if x else 0)
y.to_csv(f'{outdir}/y.csv',index=False)

X.head()

Unnamed: 0,sex,age,address,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,parents_education,Math_Subject,Por_Subject,GP_school,MS_school,GT3_famsize,LE3_famsize,A_Pstatus,T_Pstatus,at_home_Mjob,health_Mjob,other_Mjob,services_Mjob,teacher_Mjob,at_home_Fjob,health_Fjob,other_Fjob,services_Fjob,teacher_Fjob,course_reason,home_reason,other_reason,reputation_reason,father_guardian,mother_guardian,other_guardian
0,1,16,0,3,4,1,1,0,0,0,0,0,1,1,1,0,3,2,1,1,4,5,12,15,13,14,2,0,1,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0
1,1,17,0,3,2,1,2,0,0,0,0,1,0,1,1,0,5,3,4,1,3,3,2,17,18,17,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
2,0,16,0,1,2,2,1,2,0,0,0,0,0,1,1,0,4,4,5,3,5,5,0,9,8,10,0,0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
3,0,17,0,2,3,2,2,0,0,0,0,1,1,1,1,0,4,4,3,1,1,3,4,14,15,16,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0
4,1,16,0,1,1,2,1,0,0,1,0,0,1,1,0,0,4,3,2,1,4,5,2,12,13,13,0,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0


### Tiawenese Loan Assessment

In [6]:
loans_df_raw = pd.read_csv('data/loans.csv',delimiter=',')
loans_df = loans_df_raw.copy()
loans_df['sex'] = loans_df['SEX'].apply(lambda x: 1 if x == 2 else 0)
loans_df.drop(columns=['SEX'],inplace=True)

# Mapping for Education
# 1: Other
# 2: High school
# 3: University or greater
loans_df['education'] = loans_df['EDUCATION']

# From the dataset description:
# Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)

loans_education_recoder = {
    1: 2,
    2: 2,
    3: 1,
    4: 0,
}

loans_df['education'] = loans_df['education'].map(loans_education_recoder)
loans_df['education'] = loans_df['education'].fillna(0)

loans_df['label'] = loans_df['default payment next month']

outdir = f'matrices/loans'
if not os.path.exists(outdir):
    os.makedirs(outdir)
    #os.chmod(outdir,rwx)

loans_df.drop(columns='ID',inplace=True)

X = loans_df.drop(columns=['label'])
X.to_csv(f'{outdir}/X.csv',index=False)

y = loans_df['label'].apply(lambda x: 1 if x else 0)
y.to_csv(f'{outdir}/y.csv',index=False)

X.head()

Unnamed: 0,LIMIT_BAL,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,sex,education
0,20000,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,1,2.0
1,120000,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,1,2.0
2,90000,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,1,2.0
3,50000,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,1,2.0
4,50000,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,0,2.0


### Diabetes

In [7]:
diabetes_df_raw = pd.read_csv('data/diabetes.csv',delimiter=',')
diabetes_df = diabetes_df_raw.copy()

# Pre-processing: make target, code sex, address, parents education
diabetes_df['label'] = diabetes_df['class'].apply(lambda x: 1 if x == 'Positive' else 0)
diabetes_df.drop(columns=['class'],inplace=True)

diabetes_df['SEX'] = diabetes_df['Gender'].apply(lambda x: 1 if x == 'Female' else 0)
diabetes_df.drop(columns=['Gender'],inplace=True)

def recode_to_binary(x):
    if x == 'Yes':
        return 1
    elif x == 'No':
        return 0
    else:
        return x

diabetes_df = diabetes_df.applymap(recode_to_binary)

outdir = f'matrices/diabetes'
if not os.path.exists(outdir):
    os.makedirs(outdir)
    #os.chmod(outdir,rwx)

X = diabetes_df.drop(columns=['label'])
X.to_csv(f'{outdir}/X.csv',index=False)

y = diabetes_df['label'].apply(lambda x: 1 if x else 0)
y.to_csv(f'{outdir}/y.csv',index=False)

X.head()

Unnamed: 0,Age,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,SEX
0,40,0,1,0,1,0,0,0,1,0,1,0,1,1,1,0
1,58,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,41,1,0,0,1,1,0,0,1,0,1,0,1,1,0,0
3,45,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0
4,60,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0


### Heart Disease

In [10]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
heart_disease_df = heart_disease.data.features 
heart_disease_df['label'] = heart_disease.data.targets

outdir = f'matrices/heart_disease'
if not os.path.exists(outdir):
    os.makedirs(outdir)
    #os.chmod(outdir,rwx)

X = heart_disease_df.drop(columns=['label'])
X.to_csv(f'{outdir}/X.csv',index=False)

y = heart_disease_df['label'].apply(lambda x: 1 if x else 0)
y.to_csv(f'{outdir}/y.csv',index=False)

X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
