In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import lightgbm as lgb


import wandb
from wandb.lightgbm import wandb_callback

pd.set_option("display.max_columns", 181)
pd.set_option("display.min_rows", 200)

In [2]:
data_dictionary = pd.read_csv("DataDictionaryWiDS2021.csv")
unlabeled = pd.read_csv("UnlabeledWiDS2021.csv")
training = pd.read_csv("TrainingWiDS2021.csv")

In [3]:
column_datatype_mapping = dict(zip(data_dictionary['Variable Name'], data_dictionary['Data Type']))

del training['Unnamed: 0']
del unlabeled['Unnamed: 0']

In [4]:
all_data = training.append(unlabeled).drop(['encounter_id', 
                                            'hospital_id', 
                                            'diabetes_mellitus'], axis=1)

all_data.shape

(140391, 177)

In [5]:
cat_cols = ['ethnicity', 'gender', 'hospital_admit_source',
           'icu_admit_source', 'icu_stay_type', 'icu_type',
           'apache_2_diagnosis', 'apache_3j_diagnosis','icu_id']


binary_cols = data_dictionary.loc[data_dictionary['Data Type'] == 'binary','Variable Name'].values
num_cols = data_dictionary.loc[data_dictionary['Data Type'] == 'numeric','Variable Name'].values

    

for col in all_data.columns:
    if col in cat_cols:
        print(col)
        all_data[col] = LabelEncoder().fit_transform(all_data[col].astype('str'))
        all_data[col]= all_data[col].astype('category')        

ethnicity
gender
hospital_admit_source
icu_admit_source
icu_id
icu_stay_type
icu_type
apache_2_diagnosis
apache_3j_diagnosis


In [6]:
all_data.head()

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
0,68.0,22.732803,0,2,1,180.3,4,1,318,0,2,0.541667,0,73.9,2.3,11,326,0,0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0,40.0,,,,,36.0,134.0,39.3,,0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0
1,77.0,27.421875,0,2,0,160.0,4,1,303,0,5,0.927778,0,70.2,,7,244,0,0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0,0,0,0,0,0,0
2,25.0,31.952749,0,2,0,172.7,3,0,325,0,5,0.000694,0,95.3,,20,369,0,0,,,,,3.0,6.0,0.0,5.0,,102.0,,0,68.0,,,,,37.0,,36.7,,0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0
3,81.0,22.635548,1,2,0,165.1,8,2,318,0,2,0.000694,0,61.7,,24,71,1,0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,,1,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,92.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,,,,,,,,,11.6,11.6,34.0,34.0,1.6,1.1,,,43.0,43.0,,,,,8.8,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,337.0,0,0,0,0,0,0,0
4,19.0,,0,2,1,188.0,15,0,313,0,5,0.073611,0,,,17,327,0,0,,,,,,,,,,60.0,,0,103.0,,,,,16.0,,36.7,,0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0


In [7]:
df_train = all_data[:len(training)]
df_pred = all_data[len(training):].reset_index(drop=True)
Y = training['diabetes_mellitus']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(df_train, Y, test_size=0.20, random_state=42,shuffle=True )

In [25]:
params = {'learning_rate':0.05,
          'num_leaves':30,
          'n_estimators':2000,
          'metric': 'auc',
          'objective': 'binary',
          'scale_pos_weight': 2}

watchlist = [(X_train, 'train'), (X_val, 'test')]
num_round = 5

# create dataset for lightgbm
lgb_train = lgb.Dataset(df_train, Y)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [26]:
import wandb
from wandb.lightgbm import wandb_callback

wandb.init(project="wids_2021", sync_tensorboard=True)
config = wandb.config

# train

wandb.config.update(params)

# add lightgbm callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                valid_names=('val'),
                callbacks=[wandb_callback()],
                early_stopping_rounds=10)

# predict
y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
# eval
#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
val_auc,0.864
_step,237.0
_runtime,19.0
_timestamp,1612041690.0


0,1
val_auc,▁▄▄▅▅▆▆▆▇▇▇▇▇▇▇▇████████████████████████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████


[34m[1mwandb[0m: wandb version 0.10.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




[LightGBM] [Info] Number of positive: 28151, number of negative: 102006
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25953
[LightGBM] [Info] Number of data points in the train set: 130157, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216285 -> initscore=-1.287449
[LightGBM] [Info] Start training from score -1.287449




[1]	val's auc: 0.819314
Training until validation scores don't improve for 10 rounds
[2]	val's auc: 0.827183
[3]	val's auc: 0.83053
[4]	val's auc: 0.835106
[5]	val's auc: 0.840305
[6]	val's auc: 0.843415
[7]	val's auc: 0.845263
[8]	val's auc: 0.846457
[9]	val's auc: 0.847991
[10]	val's auc: 0.849511
[11]	val's auc: 0.850359
[12]	val's auc: 0.851666
[13]	val's auc: 0.8527
[14]	val's auc: 0.854
[15]	val's auc: 0.854761
[16]	val's auc: 0.855723
[17]	val's auc: 0.85655
[18]	val's auc: 0.857514
[19]	val's auc: 0.858405
[20]	val's auc: 0.859262
[21]	val's auc: 0.860458
[22]	val's auc: 0.861456
[23]	val's auc: 0.862378
[24]	val's auc: 0.86335
[25]	val's auc: 0.864198
[26]	val's auc: 0.864899
[27]	val's auc: 0.865588
[28]	val's auc: 0.866464
[29]	val's auc: 0.867382
[30]	val's auc: 0.867999
[31]	val's auc: 0.868536
[32]	val's auc: 0.869217
[33]	val's auc: 0.869773
[34]	val's auc: 0.870522
[35]	val's auc: 0.871378
[36]	val's auc: 0.871933
[37]	val's auc: 0.872548
[38]	val's auc: 0.873188
[39]	v

In [27]:
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_val, y_pred))

0.9987500464065868


In [28]:
preds_proba = gbm.predict(X_val)
metrics.roc_auc_score(y_val, preds_proba)

0.9987500464065868

In [29]:
submittion = pd.DataFrame([unlabeled.encounter_id,gbm.predict(df_pred)]).T
submittion.encounter_id = submittion.encounter_id.astype('int32')
submittion.set_index('encounter_id',inplace=True)
submittion.columns = ['diabetes_mellitus']
submittion.to_csv('submissions/SolutionWiDS2021_LightGBM.csv')