In [1]:
import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import utils as ut

from sklearn.model_selection import train_test_split
       
import gc, os, re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
pd.set_option('max_rows', 300)
pd.set_option('display.max_columns', 300)
np.random.seed(566)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_colwidth', -1)
gc.collect()

  import pandas.util.testing as tm


### Load the Data

In [2]:
# filename = 'encode'
# filename = 'encode_simple_imp_skewless'
# filename = 'encode_simple_imp_scale_skewless'
# filename = 'encode_MICE_simple_imp_skewless_upsampling'
filename = 'pri_EDA_all'
# filename = 'pri_EDA_nanrow_90'

path = r"WiDS Datathon/kaggle/input/widsdatathon2020"
path = r"../data/transform/"

train = pd.read_csv(os.path.join(path, "train_"+filename+".csv"), sep=';')
test = pd.read_csv(os.path.join(path, "test_"+filename+".csv"), sep=';')

# Check if test data is ok
test.encounter_id.min()==135000, test.encounter_id.max()==146000

TARGET_COL = "diabetes_mellitus"


print(train.shape, test.shape)

In [4]:
categorical_cols = [
                         'acute_physiology_score',
                         'apache_2_diagnosis',
                         'apache_2_group', 'apache_3_group',
                         'apache_post_operative',
                         'elective_surgery', 'ethnicity',
                         'gender', 'has_fever_first_1h',
                         'has_fever_first_24h', 
                         'has_hyperlactatemia_first_24h', 'has_hypothermia_first_1h',
                         'has_hypothermia_first_24h', 
                         'hospital_admit_source', 'icu_admit_source',
                         'icu_type', 'intubated_apache', 'is_protect_bmi', 
                         'sum_opp_gcs_apache2', 'sum_opp_gcs_apache3',
                         'ventilated_apache', 'weightclass',
                         'is_missing_albumin_apache', 
                         'is_missing_bilirubin_apache', 'is_missing_bun_apache', 
                         'is_missing_creatinine_apache', 'is_missing_fio2_apache', 
                         'is_missing_glucose_apache', 'is_missing_heart_rate_apache',
                         'is_missing_hematocrit_apache', 'is_missing_intubated_apache',
                         'is_missing_map_apache', 'is_missing_paco2_apache', 
                         'is_missing_paco2_for_ph_apache', 'is_missing_pao2_apache', 
                         'is_missing_ph_apache', 'is_missing_resprate_apache', 
                         'is_missing_sodium_apache', 'is_missing_temp_apache', 
                         'is_missing_urineoutput_apache', 'is_missing_ventilated_apache', 
                         'is_missing_wbc_apache',  
                          #'age_group'
#                          'glicoseclass_apache',
#                          'glicoseclass_h1_max', 'glicoseclass_h1_min',
#                          'glicoseclass_d1_max', 'glicoseclass_d1_min',
#                          'glucose_apache_equal_d1_glucose_max',
#                          'd1_glucose_max_equal_h1_glucose_min'
    
                        ]

vif_cols = ['pre_icu_los_days', 'weight', 'map_apache', 'resprate_apache', 'urineoutput_apache', 
            'd1_resprate_max', 'd1_resprate_min', 'd1_spo2_max', 'd1_temp_max', 'd1_temp_min', 
            'h1_heartrate_min', 'h1_resprate_min', 'h1_spo2_max', 'h1_sysbp_max', 'h1_temp_max', 
            'd1_albumin_min', 'd1_bilirubin_min', 'd1_bun_min', 'd1_calcium_max', 'd1_creatinine_min', 
            'd1_glucose_max', 'd1_glucose_min', 'd1_hco3_min', 'd1_hematocrit_min', 'd1_platelets_min', 
            'd1_potassium_min', 'd1_sodium_max', 'd1_wbc_min', 'h1_albumin_min', 'h1_bilirubin_min', 
            'h1_bun_min', 'h1_creatinine_min', 'h1_glucose_min', 'h1_hco3_min', 'h1_hemaglobin_min', 
            'h1_inr_min', 'h1_lactate_min', 'h1_platelets_min', 'h1_potassium_min', 'h1_sodium_min', 
            'd1_arterial_pco2_min', 'd1_arterial_ph_max', 'd1_arterial_po2_min', 'h1_arterial_pco2_min', 
            'h1_arterial_ph_max', 'h1_arterial_po2_min', 'h1_pao2fio2ratio_min', 'num_nan', 'gcs_sum', 
            'h1_sys_minus_diastolic_min', 'd1_sys_minus_diastolic_min', 'd1_heartrate_range',
            'd1_mbp_range', 'd1_spo2_range', 'h1_heartrate_range', 'h1_resprate_range', 'h1_spo2_range', 
            'h1_sysbp_range', 'h1_temp_range', 'd1_albumin_range', 'd1_bilirubin_range', 'd1_bun_range', 
            'd1_calcium_range', 'd1_hco3_range', 'd1_hematocrit_range', 'd1_lactate_range', 
            'd1_platelets_range', 'd1_potassium_range', 'd1_sodium_range', 'h1_bilirubin_range', 
            'h1_calcium_range', 'h1_glucose_range', 'h1_hco3_range', 'h1_inr_range', 'h1_lactate_range', 
            'h1_potassium_range', 'h1_wbc_range', 'd1_arterial_ph_range', 'd1_arterial_po2_range', 
            'd1_pao2fio2ratio_range', 'h1_arterial_pco2_range', 'h1_pao2fio2ratio_range', 
            'h1_sys_minus_diastolic_range', 'd1_sys_minus_diastolic_range', 'pao2_fio2_saps', 'age_score', 'apache_3j_diagnosis_rounded']


categorical_cols =  ['elective_surgery','hospital_id','icu_id',
 'ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 
                          'icu_type','aids','cirrhosis','hepatic_failure','immunosuppression',
 'leukemia','lymphoma','solid_tumor_with_metastasis','elective_surgery','apache_post_operative',
                          'arf_apache','fio2_apache','gcs_unable_apache','gcs_eyes_apache',
 'gcs_motor_apache','gcs_verbal_apache','intubated_apache','ventilated_apache','solid_tumor_with_metastasis',
                          'age_group', 'bmi_cat']


vif_cols = [col for col in vif_cols if col in train.columns]
vif_feature_selection = 0
if vif_feature_selection:
    features = [col for col in train.columns if (col not in [TARGET_COL]) 
                and (col in vif_cols + categorical_cols) ]
else:
    features = [col for col in train.columns if (col not in [TARGET_COL])]

In [5]:
# Reduzir tamanho das features  
train = ut.change_storage_capacity(train, from_bit='64', to_bit='32')
test = ut.change_storage_capacity(test, from_bit='64', to_bit='32')

In [6]:
## Train Test split and remove Target values
X_train = train.drop([TARGET_COL],axis=1)
y_train = train[TARGET_COL]
del train
gc.collect()

In [7]:
alg = XGBClassifier(objective='binary:logistic')

parameters = {'nthread':[4], 
              'objective':['binary:logistic'],
              'learning_rate': [0.05], 
              'max_depth': [2,4,6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [50,100,200],
              'missing':[-999],
              'seed': [1337]}


grid_xgb = GridSearchCV(alg, parameters, n_jobs=3, 
                   cv=StratifiedKFold(n_splits=3, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, refit=True)

grid_xgb.fit(X_train, y_train)
grid_xgb.best_score_, grid_xgb.best_params_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


KeyboardInterrupt: 

In [None]:
import gc
gc.collect()

In [None]:
# predictions_xgb_gridsearch = grid_xgb.predict(test)
# print(classification_report(y_test,predictions_xgb_gridsearch))

### Submission File

In [None]:
[c for c in X_train.columns if c not in test.columns]

In [None]:
[c for c in test.columns if c not in X_train.columns]

In [None]:
if 'encounter_id' not in test.columns:
    path = r'../data/'
    test2 = pd.read_csv(os.path.join(path, "raw/UnlabeledWiDS2021.csv"))
    test['encounter_id']=test2['encounter_id']

In [None]:
test[TARGET_COL] = grid_xgb.predict_proba(test.drop('encounter_id', axis=1))[:,1]

In [None]:
test[["encounter_id","diabetes_mellitus"]].to_csv("../data/submissions/xgboost_nanrows_120.csv",index=False)
# 0.85124 EDA pri
# 0.84841 EDA pri nanrows_90
# 0.85003 EDA pri 120 all

## Logistic Regression

In [83]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

# predictions_logreb = logmodel.predict(test)
# print(classification_report(y_test,predictions_logreb))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [84]:
try:
    test.drop([TARGET_COL], axis=1, inplace=True)
except:
    pass

In [85]:
test[TARGET_COL] = logmodel.predict_proba(test.drop('encounter_id', axis=1))[:,1]
test[["encounter_id","diabetes_mellitus"]].to_csv("../data/submissions/logreg_nanrows_90.csv",index=False)
# 0.80565 EDA pri
# 0.80575 EDA pri 90
# 0.80612 EDA pri 120 all