### Importing all the useful libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from ml_metrics import rmsle
import scikitplot as skplt
from scikitplot.estimators import plot_learning_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pickle
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

%matplotlib inline

### Loading the data

In [2]:
train = pd.read_hdf('../input/diabetic_train.h5')
test = pd.read_hdf('../input/diabetic_test.h5')
df = pd.concat([train, test])

### Data preprocessing

In [3]:
# Replace all '?', representing unknown, with NaN
df = df.replace('?',np.nan)

In [4]:
# Take a look at the available features
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'id'],
      dtype='object')

In [5]:
# Numeric features
cols_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 
'number_emergency', 'number_inpatient', 'number_diagnoses']

In [6]:
# Categorical features
cols_cat = ['race', 'gender', 'max_glu_serum', 'A1Cresult', 
            'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
            'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
            'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
            'tolazamide','insulin',
            'glyburide-metformin', 'glipizide-metformin',
            'glimepiride-pioglitazone', 'metformin-rosiglitazone',
            'metformin-pioglitazone', 'change', 'diabetesMed', 'payer_code']

In [7]:
# Fill in missing values
df['race'] = df['race'].fillna('UNK')
df['payer_code'] = df['payer_code'].fillna('UNK')
df['medical_specialty'] = df['medical_specialty'].fillna('UNK')

In [8]:
# For medical_specialty, pick 10 most common ones - as we have many categories and some are not very common
top_10 = ['UNK','InternalMedicine','Family/GeneralPractice','Emergency/Trauma','Cardiology',
          'Surgery-General','Orthopedics','Nephrology','Orthopedics-Reconstructive','Radiologist']

In [9]:
df['med_spec'] = df['medical_specialty'].copy()
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,id,med_spec
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,UNK,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,0,0,Pediatrics-Endocrinology
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,UNK,UNK,11,5,13,2,0,1,648.0,250.0,V27,6,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,True,0,2,UNK
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,UNK,UNK,44,1,16,0,0,0,8.0,250.43,403,7,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,True,True,0,3,UNK
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,UNK,UNK,51,0,8,0,0,0,197.0,157.0,250,5,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,0,4,UNK
8,12522,48330783,Caucasian,Female,[80-90),,2,1,4,13,UNK,UNK,68,2,28,0,0,0,398.0,427.0,38,8,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,0,8,UNK


In [10]:
# Substitute less common medical_specialty with "Other" category
df.loc[~df.med_spec.isin(top_10), 'med_spec'] = "Other"

In [11]:
# Some discharge disposition ids unfortunately indicate that a patient has passed away. 
# That may make for a useful feature for readmittion prediction
df['death'] = df.discharge_disposition_id.isin([11,13,14,19,20,21]).astype(int)
np.unique(df['death'])

array([0, 1])

In [12]:
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

In [13]:
df[cols_cat_num] = df[cols_cat_num].astype('str')

In [14]:
df_cat = pd.get_dummies(df[cols_cat + cols_cat_num + ['med_spec']], drop_first=True)

In [15]:
df = pd.concat([df,df_cat], axis=1)

In [16]:
cols_all_cat = list(df_cat.columns)

In [17]:
age_id = {
    '[0-10)':0,
    '[10-20)':1,
    '[20-30)':2,
    '[30-40)':3,
    '[40-50)':4,
    '[50-60)':5,
    '[60-70)':6,
    '[70-80)':7,
    '[80-90)':8,
    '[90-100)':9     
}

df['age_group'] = df.age.replace(age_id)

In [18]:
df['has_weight'] = df['weight'].notnull().astype(int)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66221 entries, 0 to 101764
Columns: 166 entries, encounter_id to has_weight
dtypes: bool(4), int64(58), object(16), uint8(88)
memory usage: 43.7+ MB


In [20]:
cols_extra = ['age_group', 'has_weight']

In [21]:
cols_death = ['death']

In [22]:
col2use = cols_num + cols_all_cat + cols_extra + cols_death

In [23]:
#Let's have a look at all the feature columns that can be used
df[col2use].head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,metformin.1,repaglinide,repaglinide.1,nateglinide,nateglinide.1,chlorpropamide,chlorpropamide.1,glimepiride,glimepiride.1,acetohexamide,acetohexamide.1,glipizide,glipizide.1,glyburide,glyburide.1,tolbutamide,tolbutamide.1,pioglitazone,pioglitazone.1,rosiglitazone,rosiglitazone.1,acarbose,acarbose.1,miglitol,miglitol.1,troglitazone,troglitazone.1,tolazamide,tolazamide.1,insulin,insulin.1,glyburide-metformin,glyburide-metformin.1,glipizide-metformin,glipizide-metformin.1,glimepiride-pioglitazone,glimepiride-pioglitazone.1,metformin-rosiglitazone,metformin-rosiglitazone.1,metformin-pioglitazone,metformin-pioglitazone.1,change,change.1,diabetesMed,diabetesMed.1,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_UNK,gender_Male,gender_Unknown/Invalid,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_FR,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_UNK,payer_code_WC,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_2,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_2,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,med_spec_Emergency/Trauma,med_spec_Family/GeneralPractice,med_spec_InternalMedicine,med_spec_Nephrology,med_spec_Orthopedics,med_spec_Orthopedics-Reconstructive,med_spec_Other,med_spec_Radiologist,med_spec_Surgery-General,med_spec_UNK,age_group,has_weight,death
0,1,41,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,11,5,13,2,0,1,6,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,True,True,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0
3,2,44,1,16,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0
4,1,51,0,8,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,4,0,0
8,13,68,2,28,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,0,0


In [24]:
#Dividing the data into test and train again
train = df[df.readmitted != '']
test = df[df.readmitted == '']

In [25]:
print(train.shape)
print(test.shape)

(33051, 166)
(33170, 166)


In [26]:
df_data = train[col2use + ['readmitted']]
df_data.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,metformin.1,repaglinide,repaglinide.1,nateglinide,nateglinide.1,chlorpropamide,chlorpropamide.1,glimepiride,glimepiride.1,acetohexamide,acetohexamide.1,glipizide,glipizide.1,glyburide,glyburide.1,tolbutamide,tolbutamide.1,pioglitazone,pioglitazone.1,rosiglitazone,rosiglitazone.1,acarbose,acarbose.1,miglitol,miglitol.1,troglitazone,troglitazone.1,tolazamide,tolazamide.1,insulin,insulin.1,glyburide-metformin,glyburide-metformin.1,glipizide-metformin,glipizide-metformin.1,glimepiride-pioglitazone,glimepiride-pioglitazone.1,metformin-rosiglitazone,metformin-rosiglitazone.1,metformin-pioglitazone,metformin-pioglitazone.1,change,change.1,diabetesMed,diabetesMed.1,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_UNK,gender_Male,gender_Unknown/Invalid,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_FR,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_UNK,payer_code_WC,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_2,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_2,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,med_spec_Emergency/Trauma,med_spec_Family/GeneralPractice,med_spec_InternalMedicine,med_spec_Nephrology,med_spec_Orthopedics,med_spec_Orthopedics-Reconstructive,med_spec_Other,med_spec_Radiologist,med_spec_Surgery-General,med_spec_UNK,age_group,has_weight,death,readmitted
0,1,41,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,False,False,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,2,11,5,13,2,0,1,6,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,False,True,True,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0
3,2,44,1,16,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0
4,1,51,0,8,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,4,0,0,0
8,13,68,2,28,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,True,True,True,True,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,0,0,0


In [27]:
#Mix data into random order
df_data = df_data.sample(n= len(df_data), random_state=42)
df_data = df_data.reset_index(drop=True)

In [28]:
df_valid_test = df_data.sample(frac=0.30,random_state=42)
len(df_valid_test)/len(df_data)

0.29999092311881637

In [29]:
#Divade data into train,test and validation sets
df_test = df_valid_test.sample(frac=0.50,random_state=42)
df_valid = df_valid_test.drop(df_test.index)

In [30]:
df_train_all = df_data.drop(df_valid_test.index)

In [31]:
#Determine the prevalence of readmitted cases in each dataset
def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [32]:
print(calc_prevalence(df_test['readmitted'].values))
print(calc_prevalence(df_valid['readmitted'].values))
print(calc_prevalence(df_train_all['readmitted'].values))

17.365873336022588
17.207988702844464
17.31932918395574


In [33]:
df_train = df_train_all

In [34]:
#Select features of interest based on correlation with readmitted_cat column
feats_special = ['number_inpatient','number_diagnoses', 'number_emergency', 'time_in_hospital', 
'discharge_disposition_id_3', 'discharge_disposition_id_22', 'insulin', 'num_medications', 'number_outpatient',
'admission_source_id_7', 'diabetesMed', 'age_group', 'med_spec_Nephrology', 'discharge_disposition_id_6',
'discharge_disposition_id_2', 'discharge_disposition_id_5', 'med_spec_UNK', 'discharge_disposition_id_2',
'discharge_disposition_id_28', 'payer_code_MC', 'A1Cresult_None', 'race_Caucasian', 'has_weight',
'admission_type_id_6', 'max_glu_serum_>300', 'glipizide', 'repaglinide']

In [35]:
X_train = df_train[feats_special].values
X_train_all = df_train_all[feats_special].values
X_valid = df_valid[feats_special].values
X_test = df_test[feats_special].values

y_train = df_train['readmitted'].values.astype(np.int8)
y_valid = df_valid['readmitted'].values.astype(np.int8)
y_test = df_test['readmitted'].values.astype(np.int8)

In [36]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
print(X_train_all.shape)

(23136, 31)
(4957, 31)
(4958, 31)
(23136, 31)


### Training the model

In [37]:
model = xgb.XGBClassifier()
cv = StratifiedKFold(n_splits=10)
scores = []
for train_idx, test_idx in cv.split(X_train, y_train):
    X_fold_train, X_fold_test = X_train[train_idx], X_train[test_idx]
    y_fold_train, y_fold_test = y_train[train_idx], y_train[test_idx]
    
    y_log = np.log2(y_fold_train + 3)
    model.fit(X_fold_train, y_log)
    y_pred_log = model.predict(X_fold_test) 
    y_pred = np.exp2(y_pred_log) - 3
    y_pred[ y_pred < 0 ] = 0 #reset negative result
    
    score = rmsle(y_fold_test, y_pred)
    scores.append( score )
print('score: ', np.mean(scores), 'std: ', np.std(scores))

score:  1.88 std:  0.01733


In [38]:
y_pred2 = model.predict(X_test) 
score2 = rmsle(y_pred2.astype(np.int8), y_test.astype(np.int8))
score2

1.726

### Making the final prediction for the test data

In [43]:
data = test[feats_special]
readmitted = model.predict(data.values) 
test = test.assign(readmitted = readmitted)
test.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,id,med_spec,death,metformin.1,repaglinide.1,nateglinide.1,chlorpropamide.1,glimepiride.1,acetohexamide.1,glipizide.1,glyburide.1,tolbutamide.1,pioglitazone.1,rosiglitazone.1,acarbose.1,miglitol.1,troglitazone.1,tolazamide.1,insulin.1,glyburide-metformin.1,glipizide-metformin.1,glimepiride-pioglitazone.1,metformin-rosiglitazone.1,metformin-pioglitazone.1,change.1,diabetesMed.1,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_UNK,gender_Male,gender_Unknown/Invalid,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_FR,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_UNK,payer_code_WC,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_2,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_2,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,med_spec_Emergency/Trauma,med_spec_Family/GeneralPractice,med_spec_InternalMedicine,med_spec_Nephrology,med_spec_Orthopedics,med_spec_Orthopedics-Reconstructive,med_spec_Other,med_spec_Radiologist,med_spec_Surgery-General,med_spec_UNK,age_group,has_weight
6,55842,84259809,Caucasian,Male,[60-70),,3,1,2,4,UNK,UNK,70,1,21,0,0,0,414,411,V45,7,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,1.584961,6,UNK,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,0
9,15738,63555939,Caucasian,Female,[90-100),,3,3,4,12,UNK,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,True,True,1.584961,9,InternalMedicine,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,True,True,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,9,0
13,42570,77586282,Caucasian,Male,[80-90),,1,6,7,10,UNK,Family/GeneralPractice,55,1,31,0,0,0,428,411,427,8,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,False,True,1.584961,13,Family/GeneralPractice,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,False,True,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,8,0
15,73578,86328819,AfricanAmerican,Male,[60-70),,1,3,7,12,UNK,UNK,75,5,13,0,0,0,999,507,996,9,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,True,True,1.584961,15,UNK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,True,True,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,6,0
16,77076,92519352,AfricanAmerican,Male,[50-60),,1,1,7,4,UNK,UNK,45,4,17,0,0,0,410,411,414,8,,,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,1.584961,16,UNK,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,True,True,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,5,0


In [44]:
test['readmitted'].head()

6     1.584961
9     1.584961
13    1.584961
15    1.584961
16    1.584961
Name: readmitted, dtype: float16

In [45]:
#Saving results to the .csv file
test[ ['id', 'readmitted'] ].to_csv('../output/lets_improve_it.csv', index=False)