In [46]:
import pandas as pd
import prince
from mihm.data.process import multi_cat_to_one_hot, binary_to_one_hot, standardize_continuous_cols, convert_categorical_to_ordinal
from mihm.data.trainutils import train_test_split
from mihm.model.mihm import MIHM, IndexPredictionModel
from mihm.model.mihm_dataset import MIHMDataset
from mihm.model.modelutils import get_index_prediction_weights

In [47]:
# Raw Stata variable names to load, in the order they should appear in the frame.
read_cols = [
    # outcome and exposure
    "zPCPhenoAge_acc",
    "m_HeatIndex_7d",
    # demographics / socioeconomic
    "age2016",
    "female",
    "racethn",
    "eduy",
    "ihs_wealthf2016",
    # cell-type composition columns (kept under their raw names)
    "pmono",
    "PNK_pct",
    "PBcell_pct",
    "PCD8_Plus_pct",
    "PCD4_Plus_pct",
    "PNCD8_Plus_pct",
    # behaviors, neighborhood, household
    "smoke2016",
    "drink2016",
    "bmi2016",
    "tractdis",
    "urban",
    "mar_cat2",
    # health conditions
    "psyche2016",
    "stroke2016",
    "hibpe2016",
    "diabe2016",
    "hearte2016",
    # activity, depression, functional limitations
    "ltactx2016",
    "mdactx2016",
    "vgactx2016",
    "dep2016",
    "adl2016",
    # living arrangement and geography
    "living2016",
    "division",
]

# Maps raw Stata variable names to human-readable column labels.
# Columns absent from this mapping keep their raw names after renaming.
colname_dict = {
    # outcome and exposure
    "zPCPhenoAge_acc": "Pheno Age Accel.",
    "m_HeatIndex_7d": "mean heat index over 7d",

    # demographics / socioeconomic
    "age2016": "age",
    "female": "female",
    "racethn": "race/ethnicity",
    "eduy": "education (in years)",
    "ihs_wealthf2016": "household wealth (ihs)",

    # behaviors, neighborhood, household
    "smoke2016": "smoking status",
    "drink2016": "drinking status",
    "bmi2016": "bmi",
    "tractdis": "tract disadvantage",
    "urban": "urbanicity",
    "mar_cat2": "marital status",

    # health conditions
    "psyche2016": "psychiatric conditions",
    "stroke2016": "stroke",
    "hibpe2016": "hypertension",
    "diabe2016": "diabetes",
    "hearte2016": "heart disease",

    # activity, depression, functional limitations
    "ltactx2016": "light activity",
    "mdactx2016": "moderate activity",
    "vgactx2016": "vigorous activity",
    "dep2016": "depressive symptoms",
    "adl2016": "adl limitations",

    # living arrangement and geography
    "living2016": "living alone",
    "division": "census division",
}

# Load only the needed columns from the Stata file and switch to readable labels in one step.
df = pd.read_stata('../HeatResilience.dta', columns=read_cols).rename(columns=colname_dict)

In [48]:
# Check the column labels after renaming (columns not in colname_dict keep their raw names).
df.columns

Index(['Pheno Age Accel.', 'mean heat index over 7d', 'age', 'female',
       'race/ethnicity', 'education (in years)', 'household wealth (ihs)',
       'pmono', 'PNK_pct', 'PBcell_pct', 'PCD8_Plus_pct', 'PCD4_Plus_pct',
       'PNCD8_Plus_pct', 'smoking status', 'drinking status', 'bmi',
       'tract disadvantage', 'urbanicity', 'marital status',
       'psychiatric conditions', 'stroke', 'hypertension', 'diabetes',
       'heart disease', 'light activity', 'moderate activity',
       'vigorous activity', 'depressive symptoms', 'adl limitations',
       'living alone', 'census division'],
      dtype='object')

In [49]:
# Column groups by measurement type; later preprocessing handles each group differently.
categorical_cols = [
    'female',
    'race/ethnicity',
    'urbanicity',
    'marital status',
    'psychiatric conditions',
    'stroke',
    'hypertension',
    'diabetes',
    'heart disease',
    'living alone',
    'census division',
]
ordinal_cols = [
    'smoking status',
    'drinking status',
    'light activity',
    'moderate activity',
    'vigorous activity',
]
continuous_cols = [
    'education (in years)',
    'household wealth (ihs)',
    'age',
    'bmi',
    'tract disadvantage',
    'depressive symptoms',
    'adl limitations',
    'mean heat index over 7d',
    'pmono',
    "PNK_pct",
    "PBcell_pct",
    "PCD8_Plus_pct",
    "PCD4_Plus_pct",
    "PNCD8_Plus_pct",
]

# Cast every categorical column to pandas' category dtype.
df = df.astype({col: "category" for col in categorical_cols})

# Split categoricals by number of distinct (non-null) levels: at most two vs. more than two.
level_counts = {col: df[col].nunique() for col in categorical_cols}
binary_cats = [col for col in categorical_cols if level_counts[col] <= 2]
multi_cats = [col for col in categorical_cols if level_counts[col] > 2]

In [50]:
# Preprocess df for model
df = binary_to_one_hot(df, binary_cats, dtype="category") # convert binary to one hot
df = multi_cat_to_one_hot(df, multi_cats, dtype="category") # convert multi cat to one hot
df = convert_categorical_to_ordinal(df, ordinal_cols) # convert ordinal to ordinal
df_norm, mean_std_dict = standardize_continuous_cols(df, continuous_cols) # standardize continuous cols
df_norm.dropna(inplace=True) # drop Nan rows

In [60]:
# Columns in the "controlled" group: the heat index plus the cell-type composition columns.
controlled_cols = [
    'mean heat index over 7d',
    "pmono", "PNK_pct", "PBcell_pct",
    "PCD8_Plus_pct", "PCD4_Plus_pct", "PNCD8_Plus_pct",
]

# Columns fed to the interaction part of the model (and to FAMD below).
# The one-hot names must match the columns present in df_norm.
interaction_predictors = [
    # demographics / socioeconomic
    'female',
    'education (in years)',
    'household wealth (ihs)',
    # behaviors, neighborhood, household
    'smoking status',
    'drinking status',
    'bmi',
    'tract disadvantage',
    'marital status',
    # health conditions
    'psychiatric conditions',
    'stroke',
    'hypertension',
    'diabetes',
    'heart disease',
    # activity, depression, functional limitations
    'light activity',
    'moderate activity',
    'vigorous activity',
    'depressive symptoms',
    'adl limitations',
    'living alone',
    # one-hot: race/ethnicity
    'race/ethnicity_0. NHW',
    'race/ethnicity_1. NHB',
    'race/ethnicity_2. Hispanic',
    'race/ethnicity_3. Others',
    # one-hot: urbanicity
    'urbanicity_1. urban',
    'urbanicity_2. suurban (code 2)',
    'urbanicity_3. ex-urban',
    # one-hot: census division
    'census division_Northeast',
    'census division_Midwest',
    'census division_South',
    'census division_West',
]

## Dimensionality reduction using Factor Analysis of Mixed Data (FAMD)

In [73]:
# Fit FAMD on the interaction predictors only.
df_inter_pred = df_norm[interaction_predictors]
famd = prince.FAMD(
    # Number of components to keep.
    # NOTE(review): earlier comment cited Kaiser's criterion (eigenvalue > 1) — confirm 30 against the eigenvalue summary.
    n_components=30,
    n_iter=4,            # number of iterations for the algorithm
    copy=True,           # copy the input data rather than operating in place
    check_input=True,    # check the input data's consistency
    engine='sklearn',    # computation backend, explicitly set to 'sklearn'
    random_state=42,     # fixed seed for reproducibility
).fit(df_inter_pred)
famd.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11.563,9.25%,9.25%
1,7.674,6.14%,15.39%
2,6.919,5.54%,20.93%
3,6.148,4.92%,25.84%
4,5.99,4.79%,30.64%
5,5.693,4.55%,35.19%
6,5.501,4.40%,39.59%
7,5.011,4.01%,43.60%
8,4.894,3.91%,47.51%
9,4.717,3.77%,51.29%


In [74]:
# Show each variable's contribution to each component, rendered as whole-number percentages.
famd.column_contributions_.style.format('{:.0%}')

component,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
female,0%,0%,0%,0%,1%,0%,0%,0%,0%,0%,0%,1%,0%,0%,0%,0%,0%,0%,1%,0%,0%,1%,0%,2%,2%,0%,0%,0%,0%,15%
education (in years),1%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,1%,3%,1%,8%,2%,3%,1%,6%,3%,6%
household wealth (ihs),1%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,2%,3%,2%,0%,0%,4%,0%,0%,0%,5%
bmi,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,3%,3%,3%,0%,22%
tract disadvantage,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,1%,0%,3%,7%,2%,0%,3%,1%,0%,0%,0%,2%
marital status,0%,0%,0%,0%,0%,0%,1%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,17%,13%,2%,4%,0%,2%,2%
depressive symptoms,1%,0%,0%,0%,0%,0%,1%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,1%,0%,0%,2%,0%,0%,0%,0%,0%
adl limitations,1%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,1%,0%,1%,0%,0%,5%
living alone,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,0%,15%,16%,5%,7%,1%,2%,2%
race/ethnicity_0. NHW,0%,0%,0%,0%,0%,0%,0%,0%,0%,1%,0%,0%,1%,0%,0%,0%,0%,0%,1%,0%,7%,13%,0%,0%,0%,4%,0%,0%,4%,1%


In [68]:
# Plot the fitted FAMD for the interaction predictors on components 0 (x) and 3 (y).
famd.plot(df_inter_pred, x_component=0, y_component=3)

In [69]:
# Project every row of the interaction predictors onto the fitted FAMD components.
df_inter_transformed = famd.row_coordinates(df_inter_pred)

In [70]:
# Display the projected coordinates (one row per observation, one column per component).
df_inter_transformed

component,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.998073,3.875075,1.456238,-0.533709,1.260456,-1.435903,-0.507146,0.204791,-1.388225,-1.713411,...,-2.581673,1.021606,1.241558,0.048496,-0.695234,0.793830,-1.624965,1.646382,0.499644,-0.645205
1,2.945516,-0.896892,0.711006,-2.152067,-2.347087,-2.762774,-2.743948,-0.552824,0.536482,3.128305,...,1.013907,2.178156,2.108249,1.186039,-0.058089,0.237311,-1.599999,2.335083,-0.297185,1.099169
2,-0.863629,3.778378,-5.031652,2.368894,-0.677704,-4.072045,-1.372091,4.858731,1.251206,-1.910866,...,2.607132,0.093474,1.515635,-0.408020,-1.743879,0.776212,0.360007,0.020615,-0.776207,0.558376
3,0.750371,-0.507606,-2.291870,-4.251955,-1.528758,-1.090041,-1.619425,-0.294994,0.668391,2.291167,...,-0.834345,-0.571992,2.868289,0.491270,-0.334024,0.901504,3.023664,-0.951553,4.722729,1.792466
4,-1.515112,-1.059290,-1.164544,0.105683,0.087472,-0.513139,-1.304934,0.178390,-2.677204,1.504490,...,-2.494935,-2.263043,1.316306,0.169417,-0.826753,0.570469,-1.691044,1.830218,0.311373,-0.537828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3320,-1.533744,-1.390245,-1.323388,1.472275,1.077700,-2.676630,0.155001,-0.031543,0.588241,0.959122,...,3.168677,2.295818,-1.391058,2.057188,-0.313211,1.113933,-0.190316,0.170093,-0.181620,0.796007
3321,-2.373459,3.300079,0.896326,0.080264,-3.107007,-0.449235,-2.382957,1.225878,1.139427,-1.410517,...,-1.515173,-1.344800,-0.312886,-1.969842,-0.199992,1.279907,1.337284,0.405532,-1.239534,-0.372516
3322,0.656594,-1.534844,1.631477,-0.452333,-0.367436,-1.467778,4.691968,-2.401192,-1.515711,3.248907,...,-3.399370,0.625866,0.089704,-3.035900,0.917236,-0.290393,-1.216095,1.691499,0.312228,-0.756436
3323,-0.708933,3.060906,0.359805,-1.393168,-2.420109,-2.132159,-1.982417,1.102070,0.798479,0.381095,...,-2.525011,-0.664064,0.303614,-1.524544,-0.078181,-2.172811,-1.475270,-1.412454,0.086910,0.456827


In [72]:
# Try to map the component coordinates back to the original feature space.
# This call raises NotImplementedError for FAMD in the recorded run; catch it and
# report the message so a Restart & Run All does not stop at this cell.
try:
    df_inter_reconstructed = famd.inverse_transform(df_inter_transformed)
except NotImplementedError as err:
    df_inter_reconstructed = None
    print(f"NotImplementedError: {err}")
# Shows the reconstruction if one was produced; displays nothing when it is None.
df_inter_reconstructed

NotImplementedError: FAMD inherits from PCA, but this method is not implemented yet

### Train MLP

In [71]:
# List the columns of the preprocessed frame (renamed labels plus one-hot columns) before building model inputs.
df_norm.columns

Index(['Pheno Age Accel.', 'mean heat index over 7d', 'age', 'female',
       'education (in years)', 'household wealth (ihs)', 'pmono', 'PNK_pct',
       'PBcell_pct', 'PCD8_Plus_pct', 'PCD4_Plus_pct', 'PNCD8_Plus_pct',
       'smoking status', 'drinking status', 'bmi', 'tract disadvantage',
       'marital status', 'psychiatric conditions', 'stroke', 'hypertension',
       'diabetes', 'heart disease', 'light activity', 'moderate activity',
       'vigorous activity', 'depressive symptoms', 'adl limitations',
       'living alone', 'race/ethnicity_0. NHW', 'race/ethnicity_1. NHB',
       'race/ethnicity_2. Hispanic', 'race/ethnicity_3. Others',
       'urbanicity_1. urban', 'urbanicity_2. suurban (code 2)',
       'urbanicity_3. ex-urban', 'census division_Northeast',
       'census division_Midwest', 'census division_South',
       'census division_West'],
      dtype='object')

In [None]:
# interactor
heat_cont_np = df_norm['mean heat index over 7d'].to_numpy()
# controlled vars
controlled_vars_np = df_norm[controlled_cols].to_numpy()
# interaction input vars
interaction_vars_np = df_norm[interaction_predictors].to_numpy()
# dependent var (label)
pheno_epi_np = df_norm["zPCPhenoAge_acc"].to_numpy()