In [None]:
import pandas as pd
import numpy as np
from scipy import special
import re
import pickle

In [None]:
# Data dictionary: one row per variable (used later to group columns by category).
data_dict = pd.read_csv("../data/DataDictionaryWiDS2021.csv")

# Labeled and unlabeled sets; the first CSV column becomes the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/TrainingWiDS2021.csv", "../data/UnlabeledWiDS2021.csv")
)

# Row counts are kept so the stacked frame can be split back apart at the end.
n_train, n_test = len(train), len(test)

# Stack train rows first, then test rows, so features are engineered on both at once.
train_test = pd.concat([train, test], axis=0)

## Exporting columns of exams

In [None]:
# Pull the two data-dictionary columns once, then filter the variable names by category label.
dict_category = data_dict["Category"]
dict_variable = data_dict["Variable Name"]

features_apache = dict_variable[dict_category == "APACHE covariate"].tolist()
features_vitals = dict_variable[dict_category == "vitals"].tolist()
features_labs = dict_variable[dict_category == "labs"].tolist()
features_labs_blood_gas = dict_variable[dict_category == "labs blood gas"].tolist()

# Single flat array with every exam-related variable name, in category order.
features_all_exams = np.concatenate(
    [features_apache, features_vitals, features_labs, features_labs_blood_gas]
)

# with open("../data/exam_cols.pkl", "wb") as f:
#     pickle.dump(features_all_exams, f)

## Correcting age 0 (treated as missing)

In [None]:
# An age of 0 is treated as a missing value.
# `train_test` was built with pd.concat, which copies the data, so editing `train`
# alone never reaches the combined frame that all features below (and the exported
# pickle) are derived from. Apply the correction to the combined frame as well.
train.loc[train["age"] == 0, "age"] = np.nan
train_test["age"] = train_test["age"].mask(train_test["age"] == 0)

## Creating new features

### Range and normalized values of min/max features

In [None]:
# Base names of every column that ends in "_max" (the suffix itself is stripped).
features_minmax = [name[: -len("_max")] for name in train_test.columns if name.endswith("_max")]

In [None]:
# For each min/max pair add the spread (max - min) and the spread relative to the max.
for base in features_minmax:
    upper = train_test[base + "_max"]
    lower = train_test[base + "_min"]
    spread = upper - lower
    train_test[base + "_range"] = spread
    # A max of 0 yields inf/NaN here, exactly as plain pandas division does.
    train_test[base + "_norm"] = spread / upper

### Convert apache diagnoses to strings and split them into main and sub diagnoses

In [None]:
# Split the textual form of each diagnosis code at the first '.':
# piece 0 is the part before the dot, piece 1 the part after it.
apache_3j_pieces = train_test['apache_3j_diagnosis'].astype('str').str.split('.', n=1, expand=True)
apache_2_pieces = train_test['apache_2_diagnosis'].astype('str').str.split('.', n=1, expand=True)

# NOTE: a missing code is stringified first, so its "main" value is the literal string 'nan'.
train_test['apache_3j_diagnosis_main'] = apache_3j_pieces[0]
train_test['apache_2_diagnosis_main'] = apache_2_pieces[0]

# The "sub" columns keep real NaN wherever the source code is missing.
apache_3j_missing = train_test['apache_3j_diagnosis'].isna()
apache_2_missing = train_test['apache_2_diagnosis'].isna()
train_test['apache_3j_diagnosis_sub'] = np.where(apache_3j_missing, np.nan, apache_3j_pieces[1])
# For apache_2 the sub-diagnosis is the last digit of the numeric code.
train_test['apache_2_diagnosis_sub'] = np.where(apache_2_missing, np.nan, train_test['apache_2_diagnosis'] % 10)

In [None]:
# Sum of the three GCS components; if any component is missing the sum is NaN, then set to 0.
gcs_total = (
    train_test['gcs_eyes_apache']
    .add(train_test['gcs_motor_apache'])
    .add(train_test['gcs_verbal_apache'])
)
train_test['gcs_sum'] = gcs_total.fillna(0)

# Bucket index of the sum in steps of 2.5 (nearest multiple of 2.5, expressed in units of 2.5).
# gcs_sum has no NaN at this point, so no further filling is needed.
train_test['gcs_sum_type'] = train_test['gcs_sum'].apply(lambda total: 2.5 * round(int(total) / 2.5)) / 2.5

### Variables based on counting of icu_id and age

In [None]:
# Frequency encoding: log1p of how many rows of the combined frame share the same value.
# Rows whose value is missing get NaN (value_counts does not count NaN).
for counted in ('icu_id', 'age'):
    occurrences = train_test[counted].value_counts().to_dict()
    train_test[counted + '_counts'] = np.log1p(train_test[counted].map(occurrences))

### Relation between age, bmi and weight

In [None]:
# Simple ratio features; a zero denominator gives inf/NaN as usual for pandas division.
train_test['age_bmi'] = train_test['age'].div(train_test['bmi'])
train_test['weight_age'] = train_test['weight'].div(train_test['age'])

### Logistic (sigmoid) transformation of pre_icu_los_days

In [None]:
# special.expit: logistic sigmoid, 1 / (1 + exp(-x)); it squashes the value into (0, 1).
# expit is a NumPy ufunc, so it is applied to the whole column in one vectorized call
# instead of once per row through .apply.
# NOTE: the column is overwritten in place, so re-running this cell applies the
# transform a second time; rerun from the loading cell instead.
train_test['pre_icu_los_days'] = special.expit(train_test['pre_icu_los_days'].to_numpy(dtype=float))

### Difference between given bmi and calculated bmi

In [None]:
# Keep the bmi that came with the data before it is replaced below.
train_test["diff_bmi"] = train_test['bmi']

# Recompute bmi from weight and height (height is divided by 100 before squaring).
height_scaled = train_test['height'] / 100
train_test['bmi'] = train_test['weight'] / height_scaled**2

# Given bmi minus recomputed bmi.
# NOTE: 'bmi' is overwritten, so re-running this cell yields a difference of 0.
train_test["diff_bmi"] -= train_test['bmi']

In [None]:
# d_cols = [c for c in train_test.columns if(c.startswith("d1"))]
# h_cols = [c for c in train_test.columns if(c.startswith("h1"))]
# train_test["dailyLabs_row_nan_count"] = train_test[d_cols].isna().sum(axis=1)
# train_test["hourlyLabs_row_nan_count"] = train_test[h_cols].isna().sum(axis=1)
# train_test["diff_labTestsRun_daily_hourly"] = train_test["dailyLabs_row_nan_count"] - train_test["hourlyLabs_row_nan_count"]
# test["dailyLabs_row_nan_count"] = test[d_cols].isna().sum(axis=1)
# test["hourlyLabs_row_nan_count"] = test[h_cols].isna().sum(axis=1)
# test["diff_labTestsRun_daily_hourly"] = test["dailyLabs_row_nan_count"] - test["hourlyLabs_row_nan_count"]

In [None]:
# Columns whose name starts with "h1" or "d1".
lab_col = [c for c in train.columns if c.startswith(("h1", "d1"))]

# Drop the first 3 and last 4 characters of each name to get the bare measurement name
# (assumes every such column looks like "d1_<name>_min" / "h1_<name>_max" - TODO confirm).
# sorted() fixes the order: iterating a set of strings can differ between interpreter
# runs (string hash randomization), which would shuffle the order of the feature
# columns generated from this list.
lab_col_names = sorted({c[3:-4] for c in lab_col})

print("len lab_col",len(lab_col))
print("len lab_col_names",len(lab_col_names))
print("lab_col_names\n",lab_col_names)

In [None]:
# first_h = []
# for v in lab_col_names:
#     first_h.append(v+"_started_after_firstHour")
#     colsx = [x for x in test.columns if v in x]
#     train_test[v+"_nans"] = train_test.loc[:, colsx].isna().sum(axis=1)
#     test[v+"_nans"] = test.loc[:, colsx].isna().sum(axis=1)
#     train_test[v+"_d1_value_range"] = train_test[f"d1_{v}_max"].subtract(train_test[f"d1_{v}_min"])    
#     train_test[v+"_h1_value_range"] = train_test[f"h1_{v}_max"].subtract(train_test[f"h1_{v}_min"])
#     train_test[v+"_d1_h1_max_eq"] = (train_test[f"d1_{v}_max"]== train_test[f"h1_{v}_max"]).astype(np.int8)
#     train_test[v+"_d1_h1_min_eq"] = (train_test[f"d1_{v}_min"]== train_test[f"h1_{v}_min"]).astype(np.int8)
#     train_test[v+"_d1_zero_range"] = (train_test[v+"_d1_value_range"] == 0).astype(np.int8)
#     train_test[v+"_h1_zero_range"] =(train_test[v+"_h1_value_range"] == 0).astype(np.int8)
#     train_test[v+"_tot_change_value_range_normed"] = abs((train_test[v+"_d1_value_range"].div(train_test[v+"_h1_value_range"])))#.div(df[f"d1_{v}_max"]))
#     train_test[v+"_started_after_firstHour"] = ((train_test[f"h1_{v}_max"].isna()) & (train_test[f"h1_{v}_min"].isna())) & (~train_test[f"d1_{v}_max"].isna())
#     train_test[v+"_day_more_extreme"] = ((train_test[f"d1_{v}_max"]>train_test[f"h1_{v}_max"]) | (train_test[f"d1_{v}_min"]<train_test[f"h1_{v}_min"]))
#     train_test[v+"_day_more_extreme"].fillna(False)

# train_test["total_Tests_started_After_firstHour"] = train_test[first_h].sum(axis=1)

### Create categories for height, weight, age and bmi

In [None]:
# Coarse buckets: missing values count as 0, the value is truncated to an int and then
# rounded to the nearest multiple of the bucket width.
# NOTE: Python's round() sends exact .5 ties to the even neighbour, so with width 10
# an age of 25 lands in bucket 20 while 35 lands in bucket 40.
bucket_widths = {'height': 5, 'weight': 5, 'age': 10, 'bmi': 5}

for source, width in bucket_widths.items():
    filled = train_test[source].fillna(0)
    train_test[source + '_type'] = filled.apply(lambda value: width * round(int(value) / width))

### Create differences in exams grouped by diagnostics and profile

In [None]:
IDENTIFYING_COLS = ['age_type', 'height_type',  'ethnicity', 'gender', 'bmi_type'] 

# One integer id per distinct combination of the identifying columns.
# The built-in hash() is not used: string hashes are salted per interpreter process
# (unless PYTHONHASHSEED is fixed), and on Python >= 3.10 a NaN hashes by object
# identity, so hash-based ids are not reproducible between runs and rows with a
# missing ethnicity/gender are not guaranteed to share a profile.
# Instead, every value is rendered as text (NaN becomes 'nan'), the pieces are joined
# into one key per row, and the keys are numbered in order of first appearance.
profile_key = train_test[IDENTIFYING_COLS].astype(str).apply('|'.join, axis=1)
train_test['profile'] = pd.factorize(profile_key)[0]

print(f'Number of unique Profiles : {train_test["profile"].nunique()}')

In [None]:
groupers = ['apache_3j_diagnosis', 'profile']

# For every grouping key and every measurement, add the difference between the row's
# value and the mean of that value within the row's group. Columns are created in the
# order d1 max, d1 min, h1 max, h1 min for each measurement.
for grouper in groupers:
    for lab in lab_col_names:
        for window in ('d1', 'h1'):
            for extreme in ('max', 'min'):
                source = f"{window}_{lab}_{extreme}"
                # Group key -> mean of the source column; rows whose key is missing map to NaN.
                group_mean = train_test.groupby(grouper)[source].mean().to_dict()
                train_test[f'mean_diff_{window}_{lab}_{grouper}_{extreme}'] = (
                    train_test[source] - train_test[grouper].map(group_mean)
                )

In [None]:
# train_test['diasbp_indicator'] = (
# (train_test['d1_diasbp_invasive_max'] == train_test['d1_diasbp_max']) & (train_test['d1_diasbp_noninvasive_max']==train_test['d1_diasbp_invasive_max'])|
# (train_test['d1_diasbp_invasive_min'] == train_test['d1_diasbp_min']) & (train_test['d1_diasbp_noninvasive_min']==train_test['d1_diasbp_invasive_min'])|
# (train_test['h1_diasbp_invasive_max'] == train_test['h1_diasbp_max']) & (train_test['h1_diasbp_noninvasive_max']==train_test['h1_diasbp_invasive_max'])|
# (train_test['h1_diasbp_invasive_min'] == train_test['h1_diasbp_min']) & (train_test['h1_diasbp_noninvasive_min']==train_test['h1_diasbp_invasive_min'])
# ).astype(np.int8)


# train_test['mbp_indicator'] = (
# (train_test['d1_mbp_invasive_max'] == train_test['d1_mbp_max']) & (train_test['d1_mbp_noninvasive_max']==train_test['d1_mbp_invasive_max'])|
# (train_test['d1_mbp_invasive_min'] == train_test['d1_mbp_min']) & (train_test['d1_mbp_noninvasive_min']==train_test['d1_mbp_invasive_min'])|
# (train_test['h1_mbp_invasive_max'] == train_test['h1_mbp_max']) & (train_test['h1_mbp_noninvasive_max']==train_test['h1_mbp_invasive_max'])|
# (train_test['h1_mbp_invasive_min'] == train_test['h1_mbp_min']) & (train_test['h1_mbp_noninvasive_min']==train_test['h1_mbp_invasive_min'])
# ).astype(np.int8)

# train_test['sysbp_indicator'] = (
# (train_test['d1_sysbp_invasive_max'] == train_test['d1_sysbp_max']) & (train_test['d1_sysbp_noninvasive_max']==train_test['d1_sysbp_invasive_max'])|
# (train_test['d1_sysbp_invasive_min'] == train_test['d1_sysbp_min']) & (train_test['d1_sysbp_noninvasive_min']==train_test['d1_sysbp_invasive_min'])|
#  (train_test['h1_sysbp_invasive_max'] == train_test['h1_sysbp_max']) & (train_test['h1_sysbp_noninvasive_max']==train_test['h1_sysbp_invasive_max'])|
# (train_test['h1_sysbp_invasive_min'] == train_test['h1_sysbp_min']) & (train_test['h1_sysbp_noninvasive_min']==train_test['h1_sysbp_invasive_min'])   
# ).astype(np.int8)

# train_test['d1_mbp_invnoninv_max_diff'] = train_test['d1_mbp_invasive_max'] - train_test['d1_mbp_noninvasive_max']
# train_test['h1_mbp_invnoninv_max_diff'] = train_test['h1_mbp_invasive_max'] - train_test['h1_mbp_noninvasive_max']
# train_test['d1_mbp_invnoninv_min_diff'] = train_test['d1_mbp_invasive_min'] - train_test['d1_mbp_noninvasive_min']
# train_test['h1_mbp_invnoninv_min_diff'] = train_test['h1_mbp_invasive_min'] - train_test['h1_mbp_noninvasive_min']
# train_test['d1_diasbp_invnoninv_max_diff'] = train_test['d1_diasbp_invasive_max'] - train_test['d1_diasbp_noninvasive_max']
# train_test['h1_diasbp_invnoninv_max_diff'] = train_test['h1_diasbp_invasive_max'] - train_test['h1_diasbp_noninvasive_max']
# train_test['d1_diasbp_invnoninv_min_diff'] = train_test['d1_diasbp_invasive_min'] - train_test['d1_diasbp_noninvasive_min']
# train_test['h1_diasbp_invnoninv_min_diff'] = train_test['h1_diasbp_invasive_min'] - train_test['h1_diasbp_noninvasive_min']
# train_test['d1_sysbp_invnoninv_max_diff'] = train_test['d1_sysbp_invasive_max'] - train_test['d1_sysbp_noninvasive_max']
# train_test['h1_sysbp_invnoninv_max_diff'] = train_test['h1_sysbp_invasive_max'] - train_test['h1_sysbp_noninvasive_max']
# train_test['d1_sysbp_invnoninv_min_diff'] = train_test['d1_sysbp_invasive_min'] - train_test['d1_sysbp_noninvasive_min']
# train_test['h1_sysbp_invnoninv_min_diff'] = train_test['h1_sysbp_invasive_min'] - train_test['h1_sysbp_noninvasive_min']

# for v in ['albumin','bilirubin','bun','glucose','hematocrit','pao2fio2ratio','arterial_ph','resprate','sodium','temp','wbc','creatinine']:
#     train_test[f'{v}_indicator'] = (((train_test[f'{v}_apache']==train_test[f'd1_{v}_max']) & (train_test[f'd1_{v}_max']==train_test[f'h1_{v}_max'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'd1_{v}_max']) & (train_test[f'd1_{v}_max']==train_test[f'd1_{v}_min'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'd1_{v}_max']) & (train_test[f'd1_{v}_max']==train_test[f'h1_{v}_min'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'h1_{v}_max']) & (train_test[f'h1_{v}_max']==train_test[f'd1_{v}_max'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'h1_{v}_max']) & (train_test[f'h1_{v}_max']==train_test[f'h1_{v}_min'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'h1_{v}_max']) & (train_test[f'h1_{v}_max']==train_test[f'd1_{v}_min'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'd1_{v}_min']) & (train_test[f'd1_{v}_min']==train_test[f'd1_{v}_max'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'd1_{v}_min']) & (train_test[f'd1_{v}_min']==train_test[f'h1_{v}_min'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'd1_{v}_min']) & (train_test[f'd1_{v}_min']==train_test[f'h1_{v}_max'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'h1_{v}_min']) & (train_test[f'h1_{v}_min']==train_test[f'h1_{v}_max'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'h1_{v}_min']) & (train_test[f'h1_{v}_min']==train_test[f'd1_{v}_min'])) |
#                  ((train_test[f'{v}_apache']==train_test[f'h1_{v}_min']) & (train_test[f'h1_{v}_min']==train_test[f'd1_{v}_max'])) 
#                 ).astype(np.int8)

In [None]:
# more_extreme_cols = [c for c in train_test.columns if(c.endswith("_day_more_extreme"))]
# train_test["total_day_more_extreme"] = train_test[more_extreme_cols].sum(axis=1)

# train_test["d1_resprate_div_mbp_min"] = train_test["d1_resprate_min"].div(train_test["d1_mbp_min"])
# train_test["d1_resprate_div_sysbp_min"] = train_test["d1_resprate_min"].div(train_test["d1_sysbp_min"])
# train_test["d1_lactate_min_div_diasbp_min"] = train_test["d1_lactate_min"].div(train_test["d1_diasbp_min"])
# train_test["d1_heartrate_min_div_d1_sysbp_min"] = train_test["d1_heartrate_min"].div(train_test["d1_sysbp_min"])
# train_test["d1_hco3_div"]= train_test["d1_hco3_max"].div(train_test["d1_hco3_min"])
# train_test["d1_resprate_times_resprate"] = train_test["d1_resprate_min"].multiply(train_test["d1_resprate_max"])
# train_test["left_average_spo2"] = (2*train_test["d1_spo2_max"] + train_test["d1_spo2_min"])/3

# train_test["total_chronic"] = train_test[["aids","cirrhosis", 'hepatic_failure']].sum(axis=1)
# train_test["total_cancer_immuno"] = train_test[[ 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']].sum(axis=1)

# train_test["has_complicator"] = train_test[["aids","cirrhosis", 'hepatic_failure',
#                             'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']].max(axis=1)

## Splitting data and saving

In [None]:
# The combined frame was stacked as train rows followed by test rows, so the first
# n_train rows are the training set and the last n_test rows the unlabeled set.
train_new = train_test.head(n_train)
# The target column only carries values for training rows; drop it from the test part.
test_new = train_test.tail(n_test).drop("diabetes_mellitus", axis=1)

# Guard against the row count having changed since the frames were stacked.
assert len(train_new) + len(test_new) == len(train_test), "train/test split does not cover train_test"

# Use a dedicated name: `data_dict` already holds the data-dictionary DataFrame, and
# rebinding it to this dict would break re-running the cell that reads
# data_dict["Category"].
data_fe = {"train": train_new, "test": test_new}

with open("../data/data_fe.pkl", "wb") as f:
    pickle.dump(data_fe, f)