In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import copy
from collections import defaultdict
from functools import partial
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import operator

def pkl_load(file_name):
    with open(file_name, "rb") as f:
        data = pkl.load(f)
    return data
    
def pkl_save(data, file_name):
    with open(file_name, "wb") as f:
        pkl.dump(data, f)

In [2]:
project_root = "/Users/alexgre/workspace/py3/NAFLD/clean_project/"

In [3]:
'''
In the dataset here, we changed the previous four patients without NAFLD outcomes to NAFLD outcomes of 1.
The four patients are 4172, 1243x, 1073Tx, 4161.
We made the changing decision based on the data since all other patients around these four patients are with outcomes as 1. 
Therefore, we imputated that these four patients should have NAFLD outcomes as 1. 
'''
dataset = project_root + "NAFLD_change_nan_nafld_to_1.csv"

In [4]:
'''
In the data set we have several columns should not be used in the study:
1. STUDY_CO -> ID
2. WT__KG_ and HEIGHT__ -> use BMI to represent these two variables
3. BODY_SUR -> body surface is not
4. NASH_OUT, FIBROSIS -> unwanted outcomes
'''
exclude_columns = {
    'STUDY_CO', 'WT__KG_', 'HEIGHT__', 'BODY_SUR', 'NASH_OUT', 'FIBROSIS'
}

In [5]:
'''
load csv as dataframe 
'''

df = pd.read_csv(dataset, usecols=lambda col: col not in exclude_columns)
df.head()

Unnamed: 0,NGT_IFG_,NGT__PRE,DM_STATU,MS,AGE,GNDR,ETHNICIT,BMI,HOMA,SBP,...,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG,NASH_OU2,FIBROSI2,NAFLD_OU
0,DIAB,DIAB,1.0,,43.0,0.0,AA,28.926031,,139.0,...,14.0,11.0,,313.0,4.9,0.8,,0,0,0
1,IFG,PreDM,0.0,1.0,64.0,1.0,Caucasian,35.660156,1.993351,126.0,...,36.0,28.0,0.6,257.0,3.8,2.76,123.5,0,0,0
2,DIAB,DIAB,1.0,1.0,61.0,1.0,Caucasian,28.132415,,104.0,...,14.0,15.0,0.9,289.0,4.4,1.12,,0,0,0
3,DIAB,DIAB,1.0,,53.0,0.0,AA,30.413217,3.283951,157.0,...,20.0,23.0,0.2,316.0,4.4,0.5,133.0,0,0,0
4,DIAB,DIAB,1.0,,71.0,1.0,AA,28.605187,2.955556,143.0,...,20.0,21.0,0.5,174.0,4.4,2.53,133.0,0,0,0


In [13]:
df.describe()

Unnamed: 0,DM_STATU,MS,AGE,GNDR,BMI,HOMA,SBP,DBP,ON_BP_ME,CHOL,...,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG,NASH_OU2,FIBROSI2,NAFLD_OU
count,492.0,328.0,492.0,492.0,492.0,442.0,467.0,467.0,411.0,492.0,...,490.0,490.0,401.0,438.0,419.0,396.0,463.0,492.0,492.0,492.0
mean,0.660569,0.838415,54.254065,0.723577,33.415089,4.399167,132.233405,77.655246,0.690998,173.965447,...,35.891837,47.377551,0.734165,230.687215,4.167064,2.093955,129.338013,0.402439,0.121951,0.703252
std,0.473998,0.368632,10.627103,0.447684,5.208212,4.183055,15.773175,9.803378,0.462645,41.405707,...,21.973923,34.030996,0.389493,58.616587,0.367994,2.31014,39.636838,0.490889,0.327563,0.457289
min,0.0,0.0,24.0,0.0,18.690085,0.290329,93.0,46.0,0.0,80.0,...,9.0,9.0,0.1,95.0,3.3,0.1,63.0,0.0,0.0,0.0
25%,0.0,1.0,47.0,0.0,29.836113,1.573109,122.0,71.0,0.0,144.0,...,22.0,23.0,0.5,190.0,3.9,1.1575,103.0,0.0,0.0,0.0
50%,1.0,1.0,56.0,1.0,33.077604,3.130597,131.0,78.0,1.0,170.0,...,29.0,38.0,0.7,228.0,4.2,1.7,118.0,0.0,0.0,1.0
75%,1.0,1.0,63.0,1.0,36.876387,5.399716,141.0,84.0,1.0,199.0,...,44.0,59.0,0.9,266.0,4.4,2.4275,142.375,1.0,0.0,1.0
max,1.0,1.0,74.0,1.0,48.132641,27.332099,205.0,106.0,1.0,359.0,...,183.0,192.0,3.9,487.0,5.4,35.7,332.0,1.0,1.0,1.0


In [14]:
imputated_df.describe()

Unnamed: 0,DM_STATU,MS,AGE,GNDR,BMI,HOMA,SBP,DBP,ON_BP_ME,CHOL,...,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG,NASH_OU2,FIBROSI2,NAFLD_OU
count,492.0,492.0,492.0,492.0,492.0,492.0,492.0,492.0,492.0,492.0,...,492.0,492.0,492.0,492.0,492.0,492.0,492.0,492.0,492.0,492.0
mean,0.660569,0.890244,54.254065,0.723577,33.415089,4.692267,133.609756,78.487805,0.691057,173.965447,...,36.004065,47.686992,0.835366,244.914634,4.180488,2.154809,130.534045,0.402439,0.121951,0.703252
std,0.473998,0.312904,10.627103,0.447684,5.208212,4.31947,16.578215,10.209708,0.462528,41.405707,...,22.027847,34.328525,0.436484,73.064021,0.354007,2.100605,40.085502,0.490889,0.327563,0.457289
min,0.0,0.0,24.0,0.0,18.690085,0.290329,93.0,46.0,0.0,80.0,...,9.0,9.0,0.1,95.0,3.3,0.1,63.0,0.0,0.0,0.0
25%,0.0,1.0,47.0,0.0,29.836113,1.619136,122.0,72.0,0.0,144.0,...,22.0,23.0,0.5,193.0,3.9,1.26,104.0,0.0,0.0,0.0
50%,1.0,1.0,56.0,1.0,33.077604,3.173825,132.0,79.0,1.0,170.0,...,29.0,38.0,0.7,233.0,4.2,1.97,118.75,0.0,0.0,1.0
75%,1.0,1.0,63.0,1.0,36.876387,5.862742,143.0,85.0,1.0,199.0,...,44.25,59.0,1.1,280.0,4.4,2.61,145.0,1.0,0.0,1.0
max,1.0,1.0,74.0,1.0,48.132641,27.332099,205.0,106.0,1.0,359.0,...,183.0,192.0,3.9,487.0,5.4,35.7,332.0,1.0,1.0,1.0


In [6]:
'''
remove patients with too many missing values
patients have missing value in the dataset, to improve the data quality, we set two rules to fileter patients
1. a patients missing variable cannot overpass 15 (total variable number is 29, about half)
2. 
'''

miss_threshold = 15
must_have_variable = {}

In [7]:
def count_nan(row):
    c = 0
    for each in row:
        if pd.isnull(each):
            c += 1
    if c >= miss_threshold:
        return 1
    return 0

df['mc'] = df.apply(count_nan, axis=1)
print("using missing threshold: ", miss_threshold, "filtered # of data: ", len(df[df['mc'] == 1]), "total # of data: ", len(df))
df = df[df['mc'] == 0]
df = df.drop(columns='mc')
df.shape

using missing threshold:  15 filtered # of data:  3 total # of data:  495


(492, 32)

In [8]:
'''
missing data imputation

general methods:
1. random picked from same column (two sub-options: <a> using all values in column; <b> using the values from same outcomes) 
(we adopted this method in this work)
2. median of the column (not a good practice for this research)
3. average of the column (not a good practice for this research)
4. cloest case based on all physical factors (heuristic measure) (need to develop matching algorithm - entropy loss)
'''
def create_imputated_data_lookup(dataframe, value_dict):
    for key in dataframe.keys():
        value_dict[key] = {each for each in dataframe[key] if not pd.isnull(each)}

unique_values_dict = defaultdict(set)
create_imputated_data_lookup(df, unique_values_dict)

In [9]:
def imputate_column_random_pick(row):
    random.seed(13)
    for k, v in row.items():
        if pd.isnull(v):
            row[k] = random.choice(list(unique_values_dict[k]))
    return row

imputated_df = pd.DataFrame()
imputated_df = imputated_df.append(df.apply(imputate_column_random_pick, axis=1))

In [10]:
df.head()

Unnamed: 0,NGT_IFG_,NGT__PRE,DM_STATU,MS,AGE,GNDR,ETHNICIT,BMI,HOMA,SBP,...,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG,NASH_OU2,FIBROSI2,NAFLD_OU
0,DIAB,DIAB,1.0,,43.0,0.0,AA,28.926031,,139.0,...,14.0,11.0,,313.0,4.9,0.8,,0,0,0
1,IFG,PreDM,0.0,1.0,64.0,1.0,Caucasian,35.660156,1.993351,126.0,...,36.0,28.0,0.6,257.0,3.8,2.76,123.5,0,0,0
2,DIAB,DIAB,1.0,1.0,61.0,1.0,Caucasian,28.132415,,104.0,...,14.0,15.0,0.9,289.0,4.4,1.12,,0,0,0
3,DIAB,DIAB,1.0,,53.0,0.0,AA,30.413217,3.283951,157.0,...,20.0,23.0,0.2,316.0,4.4,0.5,133.0,0,0,0
4,DIAB,DIAB,1.0,,71.0,1.0,AA,28.605187,2.955556,143.0,...,20.0,21.0,0.5,174.0,4.4,2.53,133.0,0,0,0


In [11]:
#using random seed as 13 for repeating results
imputated_df.head()

Unnamed: 0,NGT_IFG_,NGT__PRE,DM_STATU,MS,AGE,GNDR,ETHNICIT,BMI,HOMA,SBP,...,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG,NASH_OU2,FIBROSI2,NAFLD_OU
0,DIAB,DIAB,1.0,1.0,43.0,0.0,AA,28.926031,11.061728,139.0,...,14.0,11.0,1.1,313.0,4.9,0.8,224.5,0,0,0
1,IFG,PreDM,0.0,1.0,64.0,1.0,Caucasian,35.660156,1.993351,126.0,...,36.0,28.0,0.6,257.0,3.8,2.76,123.5,0,0,0
2,DIAB,DIAB,1.0,1.0,61.0,1.0,Caucasian,28.132415,1.619136,104.0,...,14.0,15.0,0.9,289.0,4.4,1.12,126.5,0,0,0
3,DIAB,DIAB,1.0,1.0,53.0,0.0,AA,30.413217,3.283951,157.0,...,20.0,23.0,0.2,316.0,4.4,0.5,133.0,0,0,0
4,DIAB,DIAB,1.0,1.0,71.0,1.0,AA,28.605187,2.955556,143.0,...,20.0,21.0,0.5,174.0,4.4,2.53,133.0,0,0,0


In [14]:
'''
We will create two different type of data: 1. leave all variables as them are; 2. convert all the variables to categorical

We will use the categorization criteria as follow defined by Dr. Fernaldo:

ngt_ifg_: NGT=normal glucose tolerance; IFG: impaired fasting glucose; IGT: impaired glucose tolerance; DIAB: T2DM
ngt__pre: NGT=normal glucose tolerance; PreDM: prediabetes; DIAB: T2DM
dm_statu: diabetes status 1=yes 0=no
ms: metabolic syndrome 1=yes 0=no
dyslipid: dyslipidemia 1=yes 0=no
on_stati: use of statins 1=yes 0=no
combinat: use of fibrates, niacin or omega-3 1=yes 0=no
on_metfo: use of metformin 1=yes 0=no
on_su: use of sulfonylureas 1=yes 0=no
gndr: 1=male 0=female
ageI would try >50, >60, >70
ethnicit Not continuous
bmi
body_sur: body surface (I would remove this variable as it will not be available in epic to then extrapolate the model)
homa (I would remove this variable as it will not be available in epic to then extrapolate the model)
sbp: systolic blood pressure >130 or >140 (try 130 first)
dbp: diastolic: >80 or >90 (try 80 first)
on_bp_me: use of BP meds (1= yes 0=no) Not continuous
chol: total cholesterol: >200
tg: triglycerides: >150
ldl: >100, >130, >160
hdl: <40 for males and <50 for females
a1c: We can try >=5.7%, >=6.0%, >=6.5%, or >=7.0  (using <5.7, 5.7-6.5, >6.5)
ast_m0: AST. Use >=40
alt_m0: ALT. Use >=40, You can also try with >=30 for male and >=19 for female
bilirrub: maybe >=1 (all patients may have lower results than this).
platelet: We can try >=100 or >=150
albumin: >=4
tsh:>=4
mean_fpg: fasting plasma glucose. We can try >=100, >=126, or >=150

categorization using the conf below:
create config json cat -> categorical; ex -> exclude variable; continuous variable represent as list -> [1,2] define 3 ranges as (, 1] & (1, 2] & (2, ) 
same variable is gender depended using dictionary -> {1:[] #male, 0:[]#female}
outcome identify this is the predictor
'''

conf = {
 'NGT_IFG_': 'cat',
 'NGT__PRE': 'cat',
 'DM_STATU': 'cat',
 'MS': 'cat',
 'AGE': [50.0, 60.0, 70.0],
 'GNDR': 'cat',
 'ETHNICIT': 'cat',
 'BMI': [18.5, 24.9, 29.9],
 'HOMA': [2.6, 3.8],
 'SBP': [130.0],
 'DBP': [80.0],
 'ON_BP_ME': 'cat',
 'CHOL': [200.0],
 'TG': [150.0],
 'LDL': [100.0, 130.0, 160.0],
 'HDL': {'1':[40.0], '0':[50.0]},
 'DYSLIPID': 'cat',
 'ON_STATI': 'cat',
 'COMBINAT': 'cat',
 'A1C': [5.7, 6.5],
 'ON_METFO': 'cat',
 'ON_SU': 'cat',
 'AST_M0': [40.0],
 'ALT_M0': {'1':[19.0], '0':[30.0]},
 'BILIRRUB': [1.0],
 'PLATELET': [150.0],
 'ALBUMIN': [4.0],
 'TSH': [4.0],
 'MEAN_FPG': [100.0, 125.0],
 'NAFLD_OU': 'outcome',
 'NASH_OU2':'outcome',
 'FIBROSI2': 'outcome'}

pkl_save(conf, project_root+"NAFLD_categorization_criteria.pkl")

In [15]:
'''
create data using the variables as they are
train_X, test_X can be shared with all outcomes, each outcome should have its own train_y and test_y
we need to prepared two set of data: one with HOMA; one without HOMA
'''
def fc(s):
    return {each:i for i, each in enumerate(s)}

def xys(l):
    return list(zip(*l))

In [16]:
# map variables to column idx and vice versa
var2idx = {e:i for i, e in enumerate(imputated_df.keys()) if conf[e] != "outcome"}
idx2var = {v:k for k,v in var2idx.items()}

# create categorical mapping table to map string to int
cat_feature_level_dict = defaultdict(set)

#convert categorical to OHE 
for k, v in conf.items():
    if v == "cat":
        cat_feature_level_dict[k] = fc(set(imputated_df[k]))   

In [17]:
#create features for each variable in the mix_data_X
mix_features = []

for col in imputated_df.keys():
    if conf[col] == 'outcome':
        continue
    
    if col in cat_feature_level_dict:
        sub_dict = cat_feature_level_dict[col]
        sorted_sub_dict = sorted(sub_dict.items(), key=operator.itemgetter(1))
        for each in sorted_sub_dict:
            mix_features.append(f"{col}_{each[0]}")
    else:
        mix_features.append(col)
        
print(mix_features, len(mix_features))

['NGT_IFG__IFG', 'NGT_IFG__NGT', 'NGT_IFG__IGT', 'NGT_IFG__DIAB', 'NGT__PRE_PreDM', 'NGT__PRE_NGT', 'NGT__PRE_DIAB', 'DM_STATU_0.0', 'DM_STATU_1.0', 'MS_0.0', 'MS_1.0', 'AGE', 'GNDR_0.0', 'GNDR_1.0', 'ETHNICIT_PI', 'ETHNICIT_Hispanic', 'ETHNICIT_Asian', 'ETHNICIT_Indian', 'ETHNICIT_Caucasian', 'ETHNICIT_AA', 'BMI', 'HOMA', 'SBP', 'DBP', 'ON_BP_ME_0.0', 'ON_BP_ME_1.0', 'CHOL', 'TG', 'LDL', 'HDL', 'DYSLIPID_0.0', 'DYSLIPID_1.0', 'ON_STATI_0.0', 'ON_STATI_1.0', 'COMBINAT_0.0', 'COMBINAT_1.0', 'A1C', 'ON_METFO_0.0', 'ON_METFO_1.0', 'ON_SU_0.0', 'ON_SU_1.0', 'AST_M0', 'ALT_M0', 'BILIRRUB', 'PLATELET', 'ALBUMIN', 'TSH', 'MEAN_FPG'] 48


In [18]:
#convert data
mix_data = []

cols = imputated_df.keys()
for idx, row in imputated_df.iterrows():
    data_point = []
    
    for col in cols:
        val = row[col]
        if col in cat_feature_level_dict:
            cat = list(np.zeros(shape=len(cat_feature_level_dict[col]), dtype=int))
            cat[cat_feature_level_dict[col][val]] = 1
            data_point.extend(cat)
        else:
            data_point.append(val)
            
    mix_data.append(data_point)

In [19]:
#create all the x values
mix_data_X = np.array([each[:-3] for each in mix_data])

homa_idx = mix_features.index('HOMA')
mix_data_no_homa_X = np.array([each[:homa_idx] + each[homa_idx+1:-3] for each in mix_data])

mix_data_X[0], mix_data_X.shape, mix_data_no_homa_X.shape

(array([  0.        ,   0.        ,   0.        ,   1.        ,
          0.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,   1.        ,  43.        ,
          1.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   1.        ,
         28.92603107,  11.0617284 , 139.        , 103.        ,
          1.        ,   0.        , 177.        ,  77.        ,
        122.        ,  40.        ,   0.        ,   1.        ,
          0.        ,   1.        ,   1.        ,   0.        ,
          6.1       ,   1.        ,   0.        ,   1.        ,
          0.        ,  14.        ,  11.        ,   1.1       ,
        313.        ,   4.9       ,   0.8       , 224.5       ]),
 (492, 48),
 (492, 47))

In [20]:
#create all y values
mix_data_nafld = np.array([each[-1] for each in mix_data])
mix_data_fib = np.array([each[-2] for each in mix_data])
mix_data_nash = np.array([each[-3] for each in mix_data])
mix_data_nafld[:5], mix_data_nafld.shape

(array([0, 0, 0, 0, 0]), (492,))

In [21]:
#save all data as pkl
pkl_save(mix_data_X, project_root+"train_test_data/mix_X.pkl")
pkl_save(mix_data_no_homa_X, project_root+"train_test_data/mix_no_homa_X.pkl")
pkl_save(mix_data_nafld, project_root+"train_test_data/y_nafld.pkl")
pkl_save(mix_data_nash, project_root+"train_test_data/y_nash.pkl")
pkl_save(mix_data_fib, project_root+"train_test_data/y_fib.pkl")
pkl_save(mix_features, project_root+"train_test_data/mix_features.pkl")
pkl_save(mix_features[:homa_idx] + mix_features[homa_idx+1:], project_root+"train_test_data/mix_features_no_homa.pkl")

In [22]:
'''
convert all variables to categorical features based on the pre-defined range in conf
each level using a unique number
then the data will be convert to One-Hot encoding format
'''

def cat2idx(data:float, bound:list, g:int) -> int:
    idx = 0
    if isinstance(bound, list):
        bound = bound
    elif isinstance(bound, dict):
        bound = bound[str(int(g))]

    while idx < len(bound) and data >= bound[idx]:
        idx += 1
    return idx

def float2cat(col, col_name, genders):
    n_col = []
    for i, each in enumerate(col):
        g = genders[i]
        n_col.append(cat2idx(float(each), conf[col_name], g))
    return np.array(n_col)

def cat2cat(col, col_name, cat_dict):
    le = LabelEncoder()
    encoded = le.fit_transform(col)
    cat_dict[k] = le.classes_
    return encoded

In [23]:
gnrs = list(copy.deepcopy(imputated_df['GNDR']))
categorical_data_dict = dict()
categorical_data_dict_no_homa = dict()

In [24]:
# process data with homa
ohe_df = copy.deepcopy(imputated_df)

for k, v in conf.items():
    if v == "outcome":
        continue
    
    if v =='cat':
        ohe_df[k] = cat2cat(ohe_df[k], k, categorical_data_dict)
    else:
        ohe_df[k] = float2cat(ohe_df[k], k, gnrs)

In [25]:
print(ohe_df.shape)
ohe_df.head()

(492, 32)


Unnamed: 0,NGT_IFG_,NGT__PRE,DM_STATU,MS,AGE,GNDR,ETHNICIT,BMI,HOMA,SBP,...,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG,NASH_OU2,FIBROSI2,NAFLD_OU
0,0,0,1,1,0,0,0,2,2,1,...,0,0,1,1,1,0,2,0,0,0
1,1,2,0,1,2,1,2,3,0,0,...,0,1,0,1,0,0,1,0,0,0
2,0,0,1,1,2,1,2,2,0,0,...,0,0,0,1,1,0,2,0,0,0
3,0,0,1,1,1,0,0,3,1,1,...,0,0,0,1,1,0,2,0,0,0
4,0,0,1,1,3,1,0,2,1,1,...,0,1,0,1,1,0,2,0,0,0


In [26]:
train_ohe = ohe_df.drop(columns=['NASH_OU2', 'FIBROSI2', 'NAFLD_OU'])
train_ohe.shape

(492, 29)

In [27]:
# convert to OHE and output data as pkl file
ohe = OneHotEncoder(sparse=False, categories='auto')
ohe_X = ohe.fit_transform(train_ohe)
ohe_X.shape

(492, 73)

In [28]:
pkl_save(ohe_X, project_root+"train_test_data/ohe_X.pkl")

In [29]:
# process data without homa
ohe_df_no_homa = imputated_df.drop(columns=['NASH_OU2', 'FIBROSI2', 'NAFLD_OU', 'HOMA'])
print(imputated_df.shape, ohe_df_no_homa.shape)

for k, v in conf.items():
    if v == "outcome" or k == 'HOMA':
        continue
    
    if v =='cat':
        ohe_df_no_homa[k] = cat2cat(ohe_df_no_homa[k], k, categorical_data_dict)
    else:
        ohe_df_no_homa[k] = float2cat(ohe_df_no_homa[k], k, gnrs)

(492, 32) (492, 28)


In [30]:
print(ohe_df_no_homa.shape)
ohe_df_no_homa.head()

(492, 28)


Unnamed: 0,NGT_IFG_,NGT__PRE,DM_STATU,MS,AGE,GNDR,ETHNICIT,BMI,SBP,DBP,...,A1C,ON_METFO,ON_SU,AST_M0,ALT_M0,BILIRRUB,PLATELET,ALBUMIN,TSH,MEAN_FPG
0,0,0,1,1,0,0,0,2,1,1,...,1,0,0,0,0,1,1,1,0,2
1,1,2,0,1,2,1,2,3,0,0,...,0,0,0,0,1,0,1,0,0,1
2,0,0,1,1,2,1,2,2,0,0,...,2,1,1,0,0,0,1,1,0,2
3,0,0,1,1,1,0,0,3,1,1,...,2,1,1,0,0,0,1,1,0,2
4,0,0,1,1,3,1,0,2,1,1,...,1,1,0,0,1,0,1,1,0,2


In [31]:
# convert to OHE and output data as pkl file
ohe = OneHotEncoder(sparse=False, categories='auto')
ohe_no_homa_X = ohe.fit_transform(ohe_df_no_homa)
ohe_no_homa_X.shape

(492, 70)

In [32]:
pkl_save(ohe_no_homa_X, project_root+"train_test_data/ohe_no_homa_X.pkl")

In [33]:
#generate features for ohe
def create_range(l, k):
    n = []
    for idx in range(len(l) + 1):
        if idx == 0:
            n.append(f"{k}_<{l[idx]}")
        elif idx == len(l):
            n.append(f"{k}_>={l[idx-1]}")
        else:
            n.append(f"{k}_{l[idx]-1}_{l[idx]}")
    return n
    
numeric_feature_dict = dict()

for k, v in conf.items():
    if isinstance(v, list):
        numeric_feature_dict[k] = create_range(v, k)
    elif isinstance(v, dict):
        numeric_feature_dict[k] = create_range([f"threshold_{i}" for i in range(len(v['1']))], k) 

In [34]:
ohe_features = []

for col in imputated_df.keys():
    if conf[col] == 'outcome':
        continue
    
    if col in cat_feature_level_dict:
        sub_dict = cat_feature_level_dict[col]
        sorted_sub_dict = sorted(sub_dict.items(), key=operator.itemgetter(1))
        for each in sorted_sub_dict:
            ohe_features.append(f"{col}_{each[0]}")
    else:
        ohe_features.extend(numeric_feature_dict[col])
        
print(ohe_features, len(ohe_features))

['NGT_IFG__IFG', 'NGT_IFG__NGT', 'NGT_IFG__IGT', 'NGT_IFG__DIAB', 'NGT__PRE_PreDM', 'NGT__PRE_NGT', 'NGT__PRE_DIAB', 'DM_STATU_0.0', 'DM_STATU_1.0', 'MS_0.0', 'MS_1.0', 'AGE_<50.0', 'AGE_59.0_60.0', 'AGE_69.0_70.0', 'AGE_>=70.0', 'GNDR_0.0', 'GNDR_1.0', 'ETHNICIT_PI', 'ETHNICIT_Hispanic', 'ETHNICIT_Asian', 'ETHNICIT_Indian', 'ETHNICIT_Caucasian', 'ETHNICIT_AA', 'BMI_<18.5', 'BMI_23.9_24.9', 'BMI_28.9_29.9', 'BMI_>=29.9', 'HOMA_<2.6', 'HOMA_2.8_3.8', 'HOMA_>=3.8', 'SBP_<130.0', 'SBP_>=130.0', 'DBP_<80.0', 'DBP_>=80.0', 'ON_BP_ME_0.0', 'ON_BP_ME_1.0', 'CHOL_<200.0', 'CHOL_>=200.0', 'TG_<150.0', 'TG_>=150.0', 'LDL_<100.0', 'LDL_129.0_130.0', 'LDL_159.0_160.0', 'LDL_>=160.0', 'HDL_<threshold_0', 'HDL_>=threshold_0', 'DYSLIPID_0.0', 'DYSLIPID_1.0', 'ON_STATI_0.0', 'ON_STATI_1.0', 'COMBINAT_0.0', 'COMBINAT_1.0', 'A1C_<5.7', 'A1C_5.5_6.5', 'A1C_>=6.5', 'ON_METFO_0.0', 'ON_METFO_1.0', 'ON_SU_0.0', 'ON_SU_1.0', 'AST_M0_<40.0', 'AST_M0_>=40.0', 'ALT_M0_<threshold_0', 'ALT_M0_>=threshold_0', 'BIL

In [35]:
for k, v in numeric_feature_dict.items():
    print(set(ohe_df[k]), len(v), k)

# two important observations:
# BMI_<18.5 has not data associated 
# HDL, ALT_M0 are male; female different. We assume both male female have same numbers of levels, using threshold (1,2,3) to represent

{0, 1, 2, 3} 4 AGE
{1, 2, 3} 4 BMI
{0, 1, 2} 3 HOMA
{0, 1} 2 SBP
{0, 1} 2 DBP
{0, 1} 2 CHOL
{0, 1} 2 TG
{0, 1, 2, 3} 4 LDL
{0, 1} 2 HDL
{0, 1, 2} 3 A1C
{0, 1} 2 AST_M0
{0, 1} 2 ALT_M0
{0, 1} 2 BILIRRUB
{0, 1} 2 PLATELET
{0, 1} 2 ALBUMIN
{0, 1} 2 TSH
{0, 1, 2} 3 MEAN_FPG


In [36]:
ohe_features.remove('BMI_<18.5') # remove this feature since no data are in this feature
print(len(ohe_features))
pkl_save(ohe_features, project_root+"train_test_data/ohe_features.pkl")

ohe_features_no_homa = [each for each in ohe_features if "HOMA_" not in each]
print(len(ohe_features_no_homa))
pkl_save(ohe_features_no_homa, project_root+"train_test_data/ohe_features_no_homa.pkl")

73
70
