### Importing all the useful libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from ml_metrics import rmsle
import scikitplot as skplt
from scikitplot.estimators import plot_learning_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pickle
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

%matplotlib inline

### Loading the data

In [742]:
train = pd.read_hdf('../input/diabetic_train.h5')
test = pd.read_hdf('../input/diabetic_test.h5')
df = pd.concat([train, test])

### Data preprocessing

In [2]:
# Replace all '?', representing unknown 
df = df.replace('?',np.nan)

In [743]:
def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [745]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'id'],
      dtype='object')

In [746]:
cols_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 
'number_emergency', 'number_inpatient', 'number_diagnoses']

In [747]:
cols_cat = ['race', 'gender', 'max_glu_serum', 'A1Cresult', 
            'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
            'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
            'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
            'tolazamide','insulin',
            'glyburide-metformin', 'glipizide-metformin',
            'glimepiride-pioglitazone', 'metformin-rosiglitazone',
            'metformin-pioglitazone', 'change', 'diabetesMed', 'payer_code']

In [748]:
df['race'] = df['race'].fillna('UNK')
df['payer_code'] = df['payer_code'].fillna('UNK')
df['medical_specialty'] = df['medical_specialty'].fillna('UNK')

In [749]:
top_10 = ['UNK','InternalMedicine','Family/GeneralPractice','Emergency/Trauma','Cardiology',
          'Surgery-General','Orthopedics','Nephrology','Orthopedics-Reconstructive','Radiologist']

In [750]:
df['med_spec'] = df['medical_specialty'].copy()
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,id,med_spec
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,UNK,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,0,0,Pediatrics-Endocrinology
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,UNK,UNK,11,5,13,2,0,1,648.0,250.0,V27,6,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,True,0,2,UNK
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,UNK,UNK,44,1,16,0,0,0,8.0,250.43,403,7,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,True,True,0,3,UNK
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,UNK,UNK,51,0,8,0,0,0,197.0,157.0,250,5,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,0,4,UNK
8,12522,48330783,Caucasian,Female,[80-90),,2,1,4,13,UNK,UNK,68,2,28,0,0,0,398.0,427.0,38,8,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,0,8,UNK


In [751]:
df.loc[~df.med_spec.isin(top_10), 'med_spec'] = "Other"

In [752]:
df['death'] = df.discharge_disposition_id.isin([11,13,14,19,20,21]).astype(int)
np.unique(df['death'])

array([0, 1])

In [753]:
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

In [754]:
df[cols_cat_num] = df[cols_cat_num].astype('str')

In [755]:
df_cat = pd.get_dummies(df[cols_cat + cols_cat_num + ['med_spec']], drop_first=True)

In [756]:
df = pd.concat([df,df_cat], axis=1)

In [757]:
cols_all_cat = list(df_cat.columns)

In [758]:
age_id = {
    '[0-10)':0,
    '[10-20)':1,
    '[20-30)':2,
    '[30-40)':3,
    '[40-50)':4,
    '[50-60)':5,
    '[60-70)':6,
    '[70-80)':7,
    '[80-90)':8,
    '[90-100)':9     
}

df['age_group'] = df.age.replace(age_id)

In [759]:
df['has_weight'] = df['weight'].notnull().astype(int)

In [760]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66221 entries, 0 to 101764
Columns: 166 entries, encounter_id to has_weight
dtypes: bool(4), int64(58), object(16), uint8(88)
memory usage: 43.7+ MB


In [761]:
cols_extra = ['age_group', 'has_weight']

In [762]:
cols_death = ['death']

In [763]:
col2use = cols_num + cols_all_cat + cols_extra + cols_death

In [764]:
df[col2use].head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,metformin.1,repaglinide,repaglinide.1,nateglinide,nateglinide.1,chlorpropamide,chlorpropamide.1,glimepiride,glimepiride.1,acetohexamide,acetohexamide.1,glipizide,glipizide.1,glyburide,glyburide.1,tolbutamide,tolbutamide.1,pioglitazone,pioglitazone.1,rosiglitazone,rosiglitazone.1,acarbose,acarbose.1,miglitol,miglitol.1,troglitazone,troglitazone.1,tolazamide,tolazamide.1,insulin,insulin.1,glyburide-metformin,glyburide-metformin.1,glipizide-metformin,glipizide-metformin.1,glimepiride-pioglitazone,glimepiride-pioglitazone.1,metformin-rosiglitazone,metformin-rosiglitazone.1,metformin-pioglitazone,metformin-pioglitazone.1,change,change.1,diabetesMed,diabetesMed.1,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_UNK,gender_Male,gender_Unknown/Invalid,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_FR,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_UNK,payer_code_WC,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_2,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_2,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,med_spec_Emergency/Trauma,med_spec_Family/GeneralPractice,med_spec_InternalMedicine,med_spec_Nephrology,med_spec_Orthopedics,med_spec_Orthopedics-Reconstructive,med_spec_Other,med_spec_Radiologist,med_spec_Surgery-General,med_spec_UNK,age_group,has_weight,death
0,1,41,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,11,5,13,2,0,1,6,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,True,True,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0
3,2,44,1,16,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0
4,1,51,0,8,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,4,0,0
8,13,68,2,28,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,0,0


In [765]:
train = df[df.readmitted != '']
test = df[df.readmitted == '']

In [766]:
print(train.shape)
print(test.shape)

(33051, 166)
(33170, 166)


In [767]:
train['OUTPUT_LABEL'] = (train.readmitted == 100).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [768]:
df_data = train[col2use + ['readmitted']]
df_data.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,metformin.1,repaglinide,repaglinide.1,nateglinide,nateglinide.1,chlorpropamide,chlorpropamide.1,glimepiride,glimepiride.1,acetohexamide,acetohexamide.1,glipizide,glipizide.1,glyburide,glyburide.1,tolbutamide,tolbutamide.1,pioglitazone,pioglitazone.1,rosiglitazone,rosiglitazone.1,acarbose,acarbose.1,miglitol,miglitol.1,troglitazone,troglitazone.1,tolazamide,tolazamide.1,insulin,insulin.1,glyburide-metformin,glyburide-metformin.1,glipizide-metformin,glipizide-metformin.1,glimepiride-pioglitazone,glimepiride-pioglitazone.1,metformin-rosiglitazone,metformin-rosiglitazone.1,metformin-pioglitazone,metformin-pioglitazone.1,change,change.1,diabetesMed,diabetesMed.1,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_UNK,gender_Male,gender_Unknown/Invalid,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_FR,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_UNK,payer_code_WC,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_2,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_2,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,med_spec_Emergency/Trauma,med_spec_Family/GeneralPractice,med_spec_InternalMedicine,med_spec_Nephrology,med_spec_Orthopedics,med_spec_Orthopedics-Reconstructive,med_spec_Other,med_spec_Radiologist,med_spec_Surgery-General,med_spec_UNK,age_group,has_weight,death,readmitted
0,1,41,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,2,11,5,13,2,0,1,6,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,True,True,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0
3,2,44,1,16,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0
4,1,51,0,8,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,4,0,0,0
8,13,68,2,28,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,0,0,0


In [769]:
df_data = df_data.sample(n= len(df_data), random_state=42)
df_data = df_data.reset_index(drop=True)

In [770]:
df_valid_test = df_data.sample(frac=0.30,random_state=42)
len(df_valid_test)/len(df_data)

0.29999092311881637

In [771]:
df_test = df_valid_test.sample(frac=0.50,random_state=42)
df_valid = df_valid_test.drop(df_test.index)

In [772]:
df_train_all = df_data.drop(df_valid_test.index)

In [773]:
print(calc_prevalence(df_test['readmitted'].values))
print(calc_prevalence(df_valid['readmitted'].values))
print(calc_prevalence(df_train_all['readmitted'].values))

17.365873336022588
17.207988702844464
17.31932918395574


In [774]:
# rows_pos = df_train_all.readmitted == 100
# df_train_pos = df_train_all.loc[rows_pos]
# df_train_neg = df_train_all.loc[~rows_pos]
df_train = df_train_all

In [775]:
# df_train_neg.shape

In [776]:
# df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state=42)], axis=0)
# df_train = df_train.sample(n = len(df_train), random_state=42).reset_index(drop=True)

In [777]:
# df_train.corr().nlargest(3, 'death')
# print(df_train.shape)
# df_train = df_train.loc[:,~df_train.columns.duplicated()]
# print(df_train.shape)
# df_train_all = df_train_all.loc[:,~df_train_all.columns.duplicated()]
# print(df_train_all.shape)
# df_test = df_test.loc[:,~df_test.columns.duplicated()]
# df_valid = df_valid.loc[:,~df_valid.columns.duplicated()]

# df_train.head()
# name = df_train.columns[-1]
# df_train_all.corr().nlargest(30, name)
# df_train['readmitted_cat'] = (df_train['readmitted'] > 0).astype(int)
# df_train_all['readmitted_cat'] = (df_train['readmitted'] > 0).astype(int)
# df_train.info()
# df_train.corr()

In [778]:
# print(df_train.head())
# asia = df_train.corr().nlargest(30, 'readmitted_cat')
feats_special = ['number_inpatient','number_diagnoses', 'number_emergency', 'time_in_hospital', 
'discharge_disposition_id_3', 'discharge_disposition_id_22', 'insulin', 'num_medications', 'number_outpatient',
'admission_source_id_7', 'diabetesMed', 'age_group', 'med_spec_Nephrology', 'discharge_disposition_id_6',
'discharge_disposition_id_2', 'discharge_disposition_id_5', 'med_spec_UNK', 'discharge_disposition_id_2',
'discharge_disposition_id_28', 'payer_code_MC', 'A1Cresult_None', 'race_Caucasian', 'has_weight',
'admission_type_id_6', 'max_glu_serum_>300', 'glipizide', 'repaglinide']
# asia

In [779]:
# calc_prevalence(df_train['OUTPUT_LABEL'].values)

In [780]:
# col2use
X_train = df_train[feats_special].values
X_train_all = df_train_all[feats_special].values
X_valid = df_valid[feats_special].values
X_test = df_test[feats_special].values

y_train = df_train['readmitted'].values.astype(np.int8)
y_valid = df_valid['readmitted'].values.astype(np.int8)
y_test = df_test['readmitted'].values.astype(np.int8)

In [781]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
print(X_train_all.shape)


(23136, 31)
(4957, 31)
(4958, 31)
(23136, 31)


In [782]:
# scaler = StandardScaler()
# scaler.fit(X_train_all)
# scalerfile = 'scaler.sav'
# pickle.dump(scaler, open(scalerfile, 'wb'))

In [783]:
# scaler = pickle.load(open(scalerfile, 'rb'))

In [784]:
# X_train_tf = scaler.transform(X_train)
# X_valid_tf = scaler.transform(X_valid)

In [785]:
# df_train[df_train.columns[1:]].corr()['id'][:-1]

In [786]:
model = xgb.XGBClassifier()
cv = StratifiedKFold(n_splits=10)
scores = []
for train_idx, test_idx in cv.split(X_train, y_train):
    X_fold_train, X_fold_test = X_train[train_idx], X_train[test_idx]
    y_fold_train, y_fold_test = y_train[train_idx], y_train[test_idx]
    
    y_log = np.log2(y_fold_train + 3)
    model.fit(X_fold_train, y_log)
    y_pred_log = model.predict(X_fold_test) 
    y_pred = np.exp2(y_pred_log) - 3
    y_pred[ y_pred < 0 ] = 0 #reset negative result
    
    score = rmsle(y_fold_test, y_pred)
    scores.append( score )
print('score: ', np.mean(scores), 'std: ', np.std(scores))

score:  1.88 std:  0.01733


In [787]:
y_pred2 = model.predict(X_test) 
score2 = rmsle(y_pred2.astype(np.int8), y_test.astype(np.int8))
score2

1.726

In [788]:
ultimate_test = test[feats_special]
ultimate_test = ultimate_test.loc[:,~ultimate_test.columns.duplicated()]
test[feats_special].head()
test[feats_special].shape
df_train[feats_special].shape

(23136, 31)

In [2]:
# readmitted = model.predict(ultimate_test_X)
np.unique(readmitted)

NameError: name 'np' is not defined

In [None]:
readmitted = readmitted

In [790]:
test = test.assign(readmitted = readmitted)

In [791]:
test['readmitted'].head()

6     1.584961
9     1.584961
13    1.584961
15    1.584961
16    1.584961
Name: readmitted, dtype: float16

In [792]:
test[ ['id', 'readmitted'] ].to_csv('../output/asda.csv', index=False)

In [582]:
from sklearn.model_selection import train_test_split

from functools import partial
from hyperopt import hp
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

XX_train, XX_test, yy_train, yy_test = train_test_split(X_train, y_train.astype(np.int8), test_size=0.3)

def objective(space):
    
    xgb_params = {
        'max_depth': int(space['max_depth']),
        'colsample_bytree': space['colsample_bytree'],
        'learning_rate': space['learning_rate'],
        'subsample': space['subsample'],
        'random_state': int(space['random_state']),
        'min_child_weight': int(space['min_child_weight']),
        'reg_alpha': space['reg_alpha'],
        'reg_lambda': space['reg_lambda'],
        'n_estimators': 100,
        'objective': 'reg:squarederror'
    }
    
    model = xgb.XGBRegressor(**xgb_params)
#     yy_log = np.log2(yy_train + 3)
    model.fit(XX_train, yy_train)
    yy_pred = model.predict(XX_test)
    yy_pred[ yy_pred < 0 ] = 0
#     yy_pred = np.exp2(y_pred_log) - 3

    
    score = rmsle(yy_test, yy_pred)
    print("Gini {:.3f} params {}".format(score, space))
    return score
    
space ={
    'max_depth': hp.quniform ('x_max_depth', 5, 20, 1),
    'colsample_bytree': hp.uniform ('x_colsample_bytree', 0.8, 1.),
    'learning_rate': hp.uniform ('x_learning_rate', 0.05, 0.2),
    'subsample': hp.uniform ('x_subsample', 0.7, 1.),
    'random_state': hp.quniform ('x_random_state', 0, 10000, 50),
    'min_child_weight': hp.quniform ('x_min_child_weight', 1, 10, 1),
    'reg_alpha': hp.loguniform ('x_reg_alpha', 0., 1.),
    'reg_lambda': hp.uniform ('x_reg_lambda', 0.7, 1.),
}


trials = Trials()
best_params = fmin(fn=objective,
            space=space,
            algo=partial(tpe.suggest, n_startup_jobs=1),
            max_evals=30,
            trials=trials)

print("The best params: ", best_params)

Gini 2.612 params {'colsample_bytree': 0.9921321304471834, 'learning_rate': 0.05020991486409177, 'max_depth': 18.0, 'min_child_weight': 6.0, 'random_state': 9850.0, 'reg_alpha': 2.257744992891465, 'reg_lambda': 0.9922128072135987, 'subsample': 0.7295801823833267}
Gini 2.613 params {'colsample_bytree': 0.9984340825412955, 'learning_rate': 0.05302953045031833, 'max_depth': 19.0, 'min_child_weight': 6.0, 'random_state': 9950.0, 'reg_alpha': 2.4436173239701366, 'reg_lambda': 0.9937818566163987, 'subsample': 0.7118485765214664}
Gini 2.629 params {'colsample_bytree': 0.8585922815928033, 'learning_rate': 0.058700841773904694, 'max_depth': 8.0, 'min_child_weight': 5.0, 'random_state': 2500.0, 'reg_alpha': 1.1761994417808743, 'reg_lambda': 0.7847029241394414, 'subsample': 0.9325325482679787}
Gini 2.653 params {'colsample_bytree': 0.9766623826083257, 'learning_rate': 0.1968027684081407, 'max_depth': 20.0, 'min_child_weight': 10.0, 'random_state': 9250.0, 'reg_alpha': 2.5530955566065536, 'reg_lam