In [1]:
# system utility packages
import os
import pickle
import re
import subprocess

import numpy as np
import pandas as pd

# global settings
pd.options.display.max_columns = 999 # enable display of all columns in notebook
#np.random.seed(12345) # for reproducibility of the many things that call numpy


  from numpy.core.umath_tests import inner1d


#  Explore and prepare diabetes data

#### Import data and examine Kaggle diabetes data set
The original data is available here: https://www.kaggle.com/brandao/diabetes. The data set contains demographic and medical information about many patients. It will be used to create a white-box classifier for predicting who will be readmitted to a hospital within 30 days of discharge. This notebook attempts to showcase the use of a complex, but transparent, nonlinear classifier as an alternative to more traditional linear model approaches.

In [2]:
# import CSV file
# set numeric categoricals to objects in train and test

train = pd.read_csv('data/diabetes_train.csv')
train['discharge_disposition_id'] = train['discharge_disposition_id'].astype('object')
train['admission_type_id'] = train['admission_type_id'].astype('object')
train['admission_source_id'] = train['admission_source_id'].astype('object')

test = pd.read_csv('data/diabetes_test.csv')
test['discharge_disposition_id'] = test['discharge_disposition_id'].astype('object')
test['admission_type_id'] = test['admission_type_id'].astype('object')
test['admission_source_id'] = test['admission_source_id'].astype('object')

#### Examine data to assess quality issues
`XGBoost` handles missing values nicely. But a few other problems can be seen:
* High cardinality categorical variables 
* Constant variables 
* Character variables (`XGBoost` handles character variables in a naive way)

In [3]:
train.head(n=10) # display first n= rows

Unnamed: 0,id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
2,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
3,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
4,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,?,?,70,1,21,0,0,0,414.0,411,V45,7,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,?,?,68,2,28,0,0,0,398.0,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
6,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,?,InternalMedicine,33,3,18,0,0,0,434.0,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,36900,77391171,AfricanAmerican,Male,[60-70),?,2,1,4,7,?,?,62,0,11,0,0,0,157.0,288,197,7,,,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,<30
8,40926,85504905,Caucasian,Female,[40-50),?,1,3,7,7,?,Family/GeneralPractice,60,0,15,0,1,0,428.0,250.43,250.6,8,,,Steady,Up,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
9,42570,77586282,Caucasian,Male,[80-90),?,1,6,7,10,?,Family/GeneralPractice,55,1,31,0,0,0,428.0,411,427,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO


#### Examine high cardinality inputs
One obvious problem with the data is the high cardinality categorical variables:

* `discharge_disposition_id`
* `medical_specialty`
* `diag_1`
* `diag_2`
* `diag_3`

have the most levels and will be binned.

In [4]:
for col in train.select_dtypes(include=['object']):    # loop through categorical variables
    n_levels = len(train[col].unique())                # determine number of unique levels
    if n_levels > 20:                                  # if unique levels > 20, then print
        print(col + ' levels: ', train[col].unique())
        print()

discharge_disposition_id levels:  [25 1 3 6 11 5 2 4 10 7 14 8 18 13 12 22 17 23 9 16 20 15 24 28 19 27]

medical_specialty levels:  ['Pediatrics-Endocrinology' '?' 'InternalMedicine'
 'Family/GeneralPractice' 'Cardiology' 'Surgery-General' 'Orthopedics'
 'Gastroenterology' 'Nephrology' 'Orthopedics-Reconstructive'
 'Surgery-Cardiovascular/Thoracic' 'Pulmonology' 'Psychiatry'
 'Emergency/Trauma' 'Surgery-Neuro' 'Obsterics&Gynecology-GynecologicOnco'
 'ObstetricsandGynecology' 'Pediatrics' 'Otolaryngology'
 'Surgery-Colon&Rectal' 'Urology' 'Psychiatry-Child/Adolescent'
 'Endocrinology' 'Neurology' 'Anesthesiology-Pediatric'
 'Pediatrics-CriticalCare' 'Radiology' 'Pediatrics-Hematology-Oncology'
 'Psychology' 'Hematology/Oncology' 'Podiatry' 'Gynecology' 'Oncology'
 'Pediatrics-Neurology' 'Surgery-Plastic' 'Surgery-Thoracic'
 'Surgery-Pediatric' 'Pediatrics-EmergencyMedicine'
 'PhysicalMedicineandRehabilitation' 'Pediatrics-Pulmonology'
 'Anesthesiology' 'Ophthalmology' 'InfectiousDiseas

 '750']

diag_3 levels:  ['?' 'V27' '403' '250' 'V45' '38' '486' '197' '250.6' '427' '996' '414'
 '714' '428' '582' '250.42' '998' '250.01' '618' '250.02' '305' '496'
 '599' '424' '553' '401' '794' '511' '276' '490' '562' '482' '250.7' '518'
 '411' '784' '491' '420' '8' '730' '131' '707' '41' '493' '529' '263'
 'E888' '425' '595' '560' '711' '296' '789' 'V43' '250.4' '654' 'V70'
 '625' '681' '250.51' 'V10' '280' '440' '244' '581' '569' '272' '250.43'
 '918' '54' '250.41' '788' '196' '461' '535' '584' '891' '332' '648'
 '250.03' '780' '182' '285' '278' '997' '413' '664' '201' 'V15' '292'
 '416' '473' '564' 'E932' '357' '348' 'E878' '437' '733' '525' '250.53'
 '397' '572' '805' '453' '331' '736' '250.8' '465' '533' '787' '349' '315'
 '658' '608' '786' '284' '382' '300' 'V42' '492' '438' '571' '536' '585'
 '644' '578' '682' 'V11' 'E885' '162' '198' '303' '412' '402' '396' 'V14'
 '570' '433' 'E934' '882' '288' '458' '785' '577' '729' '836' '799' '281'
 '616' '304' '250.83' '291' '512' '660

#### Utility function for simple binning of less common categorical levels
This function handles high cardinality categorical variables by binning all less common levels into an 'Other' bin. This is a simple, interpretable method of handling high cardinality categorical variables. Each of these levels will end up being it's own input variable into the readmission classifier. So it's important there are not too many levels and that the levels have a direct, transparent meaning.

In [5]:
def bin_less_common_levels(train, test, col_name, in_list):
    
    """ Places less common categorical levels into an 'Other' bin.
    
    :param train: Training set.
    :param test: Test set.
    :param col_name: Name of column in which to create 'Other' bin.
    :param in_list: List of levels NOT to be binned.
    
    """
    
    # if the level is not in in_list, set it to 'Other'
    train.loc[~train[col_name].isin(in_list), col_name] = 'Other' 
    test.loc[~test[col_name].isin(in_list), col_name] = 'Other'   

    # print summary of changes
    print('Train levels after binning:\n', train[col_name].value_counts())
    print()
    print('Test levels after binning:\n', test[col_name].value_counts())
    

#### Bin `discharge_disposition_id`

In [6]:
in_list = list(train['discharge_disposition_id'].value_counts()[:14].index) # first 14 levels contain reasonable amount of info
bin_less_common_levels(train, test, 'discharge_disposition_id', in_list)    # set all other levels to 'Other'

Train levels after binning:
 1        28733
3         6599
6         5414
18        2656
11        1329
2         1116
22        1076
5          741
25         606
4          475
7          307
Other      257
14         246
13         227
23         219
Name: discharge_disposition_id, dtype: int64

Test levels after binning:
 1        9985
3        2439
6        2126
22        398
2         344
11        313
13        136
14        119
7          95
5          93
4          62
23         49
Other      47
18          8
25          7
Name: discharge_disposition_id, dtype: int64


#### Bin `medical_specialty`

In [7]:
in_list = list(train['medical_specialty'].value_counts()[:10].index) # first 10 levels contain reasonable amount of info
bin_less_common_levels(train, test, 'medical_specialty', in_list)    # set all other levels to 'Other'

Train levels after binning:
 ?                             20893
InternalMedicine               8899
Other                          5162
Family/GeneralPractice         4370
Cardiology                     3037
Emergency/Trauma               2824
Surgery-General                1649
Orthopedics-Reconstructive      898
Nephrology                      883
Orthopedics                     839
Psychiatry                      547
Name: medical_specialty, dtype: int64

Test levels after binning:
 ?                             10841
Emergency/Trauma               1735
InternalMedicine               1013
Other                           974
Cardiology                      462
Family/GeneralPractice          407
Surgery-General                 404
Orthopedics                     234
Nephrology                       64
Psychiatry                       45
Orthopedics-Reconstructive       42
Name: medical_specialty, dtype: int64


#### Bin `diag_1`

In [8]:
in_list = list(train['diag_1'].value_counts()[:20].index) # first 20 levels contain reasonable amount of info
bin_less_common_levels(train, test, 'diag_1', in_list)    # set all other levels to 'Other'

Train levels after binning:
 Other    25298
414       3464
428       2924
786       2071
410       1992
486       1662
427       1256
715       1103
434       1103
780        974
682        936
996        909
276        895
38         799
491        791
250.8      709
599        689
584        677
V57        651
820        590
518        508
Name: diag_1, dtype: int64

Test levels after binning:
 Other    8061
414       993
428       849
410       557
427       536
786       527
715       480
486       477
491       411
434       337
38        335
682       332
584       319
780       315
599       289
996       287
276       281
250.8     269
518       240
820       195
V57       131
Name: diag_1, dtype: int64


#### Bin `diag_2`

In [9]:
in_list = list(train['diag_2'].value_counts()[:20].index) # first 20 levels contain reasonable amount of info
bin_less_common_levels(train, test, 'diag_2', in_list)    # set all other levels to 'Other'

Train levels after binning:
 Other     20558
250        3425
276        3399
428        3233
427        2564
401        2094
496        1681
599        1570
403        1373
411        1358
414        1248
250.02     1060
250.01      944
707         879
780         794
285         667
682         646
518         641
425         639
491         630
424         598
Name: diag_2, dtype: int64

Test levels after binning:
 Other     8346
250        966
276        961
428        750
427        643
401        607
599        555
414        503
285        350
518        311
707        302
411        285
496        283
250.02     255
491        217
682        210
425        199
780        171
403        164
424         88
250.01      55
Name: diag_2, dtype: int64


#### Bin `diag_3`

In [10]:
in_list = list(train['diag_3'].value_counts()[:20].index) # first 20 levels contain reasonable amount of info
bin_less_common_levels(train, test, 'diag_3', in_list)    # set all other levels to 'Other'

Train levels after binning:
 Other     20338
250        6212
401        4473
276        2516
428        2102
427        1906
414        1701
496        1336
403        1023
272        1019
?           985
599         927
585         725
V45         686
780         662
707         640
250.02      631
285         558
250.01      541
424         522
250.6       498
Name: diag_3, dtype: int64

Test levels after binning:
 Other     7415
250       1777
401       1331
276        868
414        612
428        603
427        545
585        412
272        388
403        347
599        319
496        270
285        228
250.02     220
780        207
707        191
V45        143
250.6      124
424        101
?          100
250.01      20
Name: diag_3, dtype: int64


#### Check binning results
Now all of the categorical variables in the data set have a reasonable number of unique levels for modeling.

In [11]:
for col in train.select_dtypes(include=['object']): # loop through categorical vars
    n_levels = len(train[col].unique())             # find number of unique levels
    print(col, n_levels)                            # print

race 6
gender 3
age 10
weight 9
admission_type_id 8
discharge_disposition_id 15
admission_source_id 14
payer_code 17
medical_specialty 11
diag_1 21
diag_2 21
diag_3 21
max_glu_serum 4
A1Cresult 4
metformin 4
repaglinide 4
nateglinide 4
chlorpropamide 4
glimepiride 4
acetohexamide 1
glipizide 4
glyburide 4
tolbutamide 2
pioglitazone 4
rosiglitazone 4
acarbose 4
miglitol 4
troglitazone 2
tolazamide 2
examide 1
citoglipton 1
insulin 4
glyburide-metformin 4
glipizide-metformin 2
glimepiride-pioglitazone 1
metformin-rosiglitazone 1
metformin-pioglitazone 1
change 2
diabetesMed 2
readmitted 2


#### Define target and variables to be dropped from the analysis
Different lists of variables are assigned to handle converting character variables into numeric dummy variables. It's pointless to encode constant variables and the categorical target variable, `readmit`, will be handled on it's own later.

In [12]:
# constant column
constants = ['acetohexamide', 'examide', 'citoglipton', 'citoglipton', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']

y = 'readmitted' # modeling prediction target

# python sets allow for subtraction, lists do not
# used here to find the categorical variables that should be dummy-encoded for modeling
# convert back to a list for later use 
encodes = list(set(train.select_dtypes(include=['object']).columns) - set(constants + [y]))

#### Dummy-encode categorical variables into numeric variables
The built-in `Pandas` function `get_dummies()` creates new binary variables for each unique level in an original categorical variable. Each of these columns can be used directly in `XGBoost`.

In [13]:
# drop the original categorical variables
# then join the dummy-encoded versions of the same categorical variables back into the data

train = pd.concat([train.drop(encodes, axis=1),
                   pd.get_dummies(train[encodes])],
                   axis = 1)

test = pd.concat([test.drop(encodes, axis=1),
                  pd.get_dummies(test[encodes])],
                   axis = 1)

#### Ensure train and test have the same input variables after encoding
If a variable has different levels in the train and test sets this will result in different variables in the new, encoded train and test sets. This can cause problems with over-optimistic accuracy in training and errors when attempting to score unseen levels in the test set. 

In [14]:
# in case there were any categorical variable levels in test and not in train
# (or vice-versa)
# drop them

in_train_not_test = list(set(train.columns)-set(test.columns)) # set subtraction
train.drop(in_train_not_test, axis=1, inplace=True)

in_test_not_train = list(set(test.columns)-set(train.columns)) # set subtraction
test.drop(in_test_not_train, axis=1, inplace=True)

#### Ensure names are safe for `XGBoost`
`XGBoost` and `Pandas` have different variable name requirements. As a consequence of the dummy encoding process, some encoded variables were created with non-alphanumeric characters in their names.

In [15]:
for name in train.columns:
    
    # use python replace function to replace common '_?' suffix
    # use regex to catch everything else
    
    train.rename(columns={name: name.replace('_?', '_q')}, inplace=True)   
    train.rename(columns={name: re.sub('[^0-9a-zA-Z]+', '_', name)}, inplace=True)
    
    test.rename(columns={name: name.replace('_?', '_q')}, inplace=True)
    test.rename(columns={name: re.sub('[^0-9a-zA-Z]+', '_', name)}, inplace=True)
    

#### Check encodig results

In [16]:
print('All train and test columns match:', all(train.columns == test.columns)) # test all names match
print('Train set shape:', train.shape)                                         # test number of columns match
print('Test set shape:', test.shape)                                           # test number of columns match

All train and test columns match: True
Train set shape: (50001, 233)
Test set shape: (16221, 233)


#### Assign modeling roles
The constants list must be redefined after handling the names for `XGBoost`. The inputs to the classifier, `X`, are defined to be all of the original numeric variables and all of the new encoded variables that are not patient identifiers or constant.

In [17]:
# names of drops were changed in steps above, must redefine ('-' became '_')
constants = ['acetohexamide', 'examide', 'citoglipton', 'citoglipton', 'glimepiride_pioglitazone', 'metformin_rosiglitazone', 
             'metformin_pioglitazone'] 

# everything that is not constant, an identifier, or the modeling target will be a modeling input
X = [name for name in train.columns if name not in [y] + constants + ['id', 'patient_nbr']]

# print summary
print('y =', y)
print('X =', X)

y = readmitted
X = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'glipizide_Down', 'glipizide_No', 'glipizide_Steady', 'glipizide_Up', 'miglitol_Down', 'miglitol_No', 'miglitol_Steady', 'A1Cresult_7', 'A1Cresult_8', 'A1Cresult_None', 'A1Cresult_Norm', 'insulin_Down', 'insulin_No', 'insulin_Steady', 'insulin_Up', 'chlorpropamide_No', 'chlorpropamide_Steady', 'glimepiride_Down', 'glimepiride_No', 'glimepiride_Steady', 'glimepiride_Up', 'gender_Female', 'gender_Male', 'gender_Unknown_Invalid', 'tolbutamide_No', 'tolbutamide_Steady', 'nateglinide_Down', 'nateglinide_No', 'nateglinide_Steady', 'nateglinide_Up', 'diag_1_250_8', 'diag_1_276', 'diag_1_38', 'diag_1_410', 'diag_1_414', 'diag_1_427', 'diag_1_428', 'diag_1_434', 'diag_1_486', 'diag_1_491', 'diag_1_518', 'diag_1_584', 'diag_1_599', 'diag_1_682', 'diag_1_715', 'diag_1_780', 'diag_1_786', 'diag_1_820', 'diag_1_996', 'dia

#### Ensure Pandas treats all input variables as numeric

In [18]:
# xgboost treats all columns as numeric - no matter what
# any values that can't be converted easily will be NaN - XGBoost does handle NaN elegantly
train[X] = train[X].apply(pd.to_numeric, errors='coerce', axis=1) 
test[X] = test[X].apply(pd.to_numeric, errors='coerce', axis=1)

#### Manually convert target variable to numeric

Using `get_dummies()` would have resulted in two perfectly negatively correlated copies of the target variable. Instead the target is manually transformed so that patient readmission results in higher numeric probabilities being generated by the classifier.

In [19]:
# convert target to numeric value
# readmit = NO -> 0
# readmit = YES -> 1
train.loc[train[y] == 'NO', y] = '0'
train.loc[train[y] != '0', y] = '1'
train[y] = train[y].apply(pd.to_numeric)

test.loc[test[y] == 'NO', y] = '0'
test.loc[test[y] != '0', y] = '1'
test[y] = test[y].apply(pd.to_numeric)

#### Investigate all data preprocessing for train set

In [20]:
train[X + [y]].head() # print only columns used in modeling

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,glipizide_Down,glipizide_No,glipizide_Steady,glipizide_Up,miglitol_Down,miglitol_No,miglitol_Steady,A1Cresult_7,A1Cresult_8,A1Cresult_None,A1Cresult_Norm,insulin_Down,insulin_No,insulin_Steady,insulin_Up,chlorpropamide_No,chlorpropamide_Steady,glimepiride_Down,glimepiride_No,glimepiride_Steady,glimepiride_Up,gender_Female,gender_Male,gender_Unknown_Invalid,tolbutamide_No,tolbutamide_Steady,nateglinide_Down,nateglinide_No,nateglinide_Steady,nateglinide_Up,diag_1_250_8,diag_1_276,diag_1_38,diag_1_410,diag_1_414,diag_1_427,diag_1_428,diag_1_434,diag_1_486,diag_1_491,diag_1_518,diag_1_584,diag_1_599,diag_1_682,diag_1_715,diag_1_780,diag_1_786,diag_1_820,diag_1_996,diag_1_Other,diag_1_V57,diag_2_250,diag_2_250_01,diag_2_250_02,diag_2_276,diag_2_285,diag_2_401,diag_2_403,diag_2_411,diag_2_414,diag_2_424,diag_2_425,diag_2_427,diag_2_428,diag_2_491,diag_2_496,diag_2_518,diag_2_599,diag_2_682,diag_2_707,diag_2_780,diag_2_Other,repaglinide_Down,repaglinide_No,repaglinide_Steady,repaglinide_Up,troglitazone_No,race_q,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,glyburide_Down,glyburide_No,glyburide_Steady,glyburide_Up,tolazamide_No,glyburide_metformin_No,glyburide_metformin_Steady,weight_q,weight_0_25_,weight_100_125_,weight_125_150_,weight_150_175_,weight_175_200_,weight_25_50_,weight_50_75_,weight_75_100_,glipizide_metformin_No,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,change_Ch,change_No,acarbose_No,acarbose_Steady,diag_3_250,diag_3_250_01,diag_3_250_02,diag_3_250_6,diag_3_272,diag_3_276,diag_3_285,diag_3_401,diag_3_403,diag_3_414,diag_3_424,diag_3_427,diag_3_428,diag_3_496,diag_3_585,diag_3_599,diag_3_707,diag_3_780,diag_3_q,diag_3_Other,diag_3_V45,discharge_disposition_id_1,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_11,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_18,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_25,discharge_disposition_id_Other,pioglitazone_Down,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,max_glu_serum_200,max_glu_serum_300,max_glu_serum_None,max_glu_serum_Norm,diabetesMed_No,diabetesMed_Yes,age_0_10_,age_10_20_,age_20_30_,age_30_40_,age_40_50_,age_50_60_,age_60_70_,age_70_80_,age_80_90_,age_90_100_,payer_code_q,payer_code_BC,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_WC,admission_source_id_1,admission_source_id_2,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,admission_source_id_14,admission_source_id_17,admission_source_id_22,medical_specialty_q,medical_specialty_Cardiology,medical_specialty_Emergency_Trauma,medical_specialty_Family_GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Nephrology,medical_specialty_Orthopedics,medical_specialty_Orthopedics_Reconstructive,medical_specialty_Other,medical_specialty_Psychiatry,medical_specialty_Surgery_General,rosiglitazone_Down,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,metformin_Down,metformin_No,metformin_Steady,metformin_Up,readmitted
0,1,41,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
1,2,11,5,13,2,0,1,6,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
2,2,44,1,16,0,0,0,7,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
3,1,51,0,8,0,0,0,5,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
4,4,70,1,21,0,0,0,7,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


#### Investigate all data preprocessing for test set 

In [23]:
test[X + [y]].head() # print only columns used in modeling

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,insulin_Down,insulin_No,insulin_Steady,insulin_Up,discharge_disposition_id_1,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_11,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_18,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_25,discharge_disposition_id_Other,A1Cresult_7,A1Cresult_8,A1Cresult_None,A1Cresult_Norm,metformin_Down,metformin_No,metformin_Steady,metformin_Up,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,medical_specialty_q,medical_specialty_Cardiology,medical_specialty_Emergency_Trauma,medical_specialty_Family_GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Nephrology,medical_specialty_Orthopedics,medical_specialty_Orthopedics_Reconstructive,medical_specialty_Other,medical_specialty_Psychiatry,medical_specialty_Surgery_General,payer_code_q,payer_code_BC,payer_code_CH,payer_code_CM,payer_code_CP,payer_code_DM,payer_code_HM,payer_code_MC,payer_code_MD,payer_code_MP,payer_code_OG,payer_code_OT,payer_code_PO,payer_code_SI,payer_code_SP,payer_code_UN,payer_code_WC,glyburide_Down,glyburide_No,glyburide_Steady,glyburide_Up,nateglinide_Down,nateglinide_No,nateglinide_Steady,nateglinide_Up,race_q,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,miglitol_Down,miglitol_No,miglitol_Steady,age_0_10_,age_10_20_,age_20_30_,age_30_40_,age_40_50_,age_50_60_,age_60_70_,age_70_80_,age_80_90_,age_90_100_,gender_Female,gender_Male,gender_Unknown_Invalid,tolazamide_No,tolbutamide_No,tolbutamide_Steady,repaglinide_Down,repaglinide_No,repaglinide_Steady,repaglinide_Up,glipizide_Down,glipizide_No,glipizide_Steady,glipizide_Up,glimepiride_Down,glimepiride_No,glimepiride_Steady,glimepiride_Up,max_glu_serum_200,max_glu_serum_300,max_glu_serum_None,max_glu_serum_Norm,chlorpropamide_No,chlorpropamide_Steady,acarbose_No,acarbose_Steady,glipizide_metformin_No,admission_source_id_1,admission_source_id_2,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,admission_source_id_14,admission_source_id_17,admission_source_id_22,diag_3_250,diag_3_250_01,diag_3_250_02,diag_3_250_6,diag_3_272,diag_3_276,diag_3_285,diag_3_401,diag_3_403,diag_3_414,diag_3_424,diag_3_427,diag_3_428,diag_3_496,diag_3_585,diag_3_599,diag_3_707,diag_3_780,diag_3_q,diag_3_Other,diag_3_V45,change_Ch,change_No,diag_1_250_8,diag_1_276,diag_1_38,diag_1_410,diag_1_414,diag_1_427,diag_1_428,diag_1_434,diag_1_486,diag_1_491,diag_1_518,diag_1_584,diag_1_599,diag_1_682,diag_1_715,diag_1_780,diag_1_786,diag_1_820,diag_1_996,diag_1_Other,diag_1_V57,diag_2_250,diag_2_250_01,diag_2_250_02,diag_2_276,diag_2_285,diag_2_401,diag_2_403,diag_2_411,diag_2_414,diag_2_424,diag_2_425,diag_2_427,diag_2_428,diag_2_491,diag_2_496,diag_2_518,diag_2_599,diag_2_682,diag_2_707,diag_2_780,diag_2_Other,glyburide_metformin_No,glyburide_metformin_Steady,rosiglitazone_Down,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,diabetesMed_No,diabetesMed_Yes,weight_q,weight_0_25_,weight_100_125_,weight_125_150_,weight_150_175_,weight_175_200_,weight_25_50_,weight_50_75_,weight_75_100_,pioglitazone_Down,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,troglitazone_No,readmitted
0,1,50,6,25,0,0,0,9,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,2,1,1,1,0,0,0,6,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2,5,47,6,13,3,0,0,9,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,7,75,6,50,0,0,0,9,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
4,3,42,0,18,0,0,0,6,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0


## Train using AML AutoML

In [89]:
#train.to_csv('mytraindata.csv')
#test.to_csv('mytestdata.csv')

In [None]:
#train = pd.read_csv('mytraindata.csv')
#test = pd.read_csv('mytestdata.csv')

In [21]:
import logging

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
import multiprocessing

In [22]:
y_train_array = np.array(train[y].values.tolist())

In [74]:
ws = Workspace.from_config()


experiment = Experiment(ws, "Diabetes_prediction2")
num_iterations = 10
automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             primary_metric = 'AUC_weighted',
                             iteration_timeout_minutes = 10,
                             iterations = num_iterations,
                            # n_cross_validations = 3,
                             max_concurrent_iterations = multiprocessing.cpu_count(),
                             verbosity = logging.INFO,
                             X = train[X], 
                             preprocess = False,
                             y = y_train_array,
                             X_valid = test[X],
                             y_valid = test[y],
                             model_explainability = False,
                             blacklist_models=["LightGBM", "LogisticRegression"],
                             path = './')

Found the config file in: C:\Users\joyadmin\notebooks\AzureML\how-to-use-azureml\config.json


In [75]:
local_run = experiment.submit(automl_config, show_output = True)

Running on local machine
Parent Run ID: AutoML_910c2a8e-58db-4b9d-b425-400f5cef01c8
********************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
SAMPLING %: Percent of the training data to sample.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
********************************************************************************************************************

 ITERATION   PIPELINE                                       SAMPLING %  DURATION      METRIC      BEST
         0   MaxAbsScaler ExtremeRandomTrees                100.0000    0:00:38       0.7123    0.7123
         1   StandardScalerWrapper XGBoostClassifier        100.0000    0:00:22       0.7286    0.7286
         2   StandardScalerWrapper XGBoostClassifier        100

In [76]:
best_run, fitted_model = local_run.get_output()

In [77]:
fitted_model.steps

In [106]:
#explain_model = fitted_model.steps[-1][1].model
explain_model = fitted_model

## Saving the model as pkl file

In [161]:
type(explain_model)

sklearn.pipeline.Pipeline

In [164]:
explain_model

Pipeline(memory=None,
     steps=[('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(classification_labels=None,
               estimators=[('XGBoostClassifier_7', Pipeline(memory=None,
     steps=[('StandardScalerWrapper', <automl.client.core.common.model_wrappers.StandardScalerWrapper object at 0x000002031E0529...x0000020314892A58>)]))],
               flatten_transform=None, weights=[0.1, 0.3, 0.4, 0.1, 0.1]))])

In [165]:
from sklearn.externals import joblib

In [168]:
# save model
joblib.dump(explain_model, 'readmit_xgboost.pkl')
# load model
mymodel= joblib.load('readmit_xgboost.pkl')

In [169]:
mymodel

Pipeline(memory=None,
     steps=[('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(classification_labels=None,
               estimators=[('XGBoostClassifier_7', Pipeline(memory=None,
     steps=[('StandardScalerWrapper', <automl.client.core.common.model_wrappers.StandardScalerWrapper object at 0x00000203152AAE...x000002038BEAABA8>)]))],
               flatten_transform=None, weights=[0.1, 0.3, 0.4, 0.1, 0.1]))])

## Explain the model

In [138]:
from azureml.contrib.explain.model.tabular_explainer import TabularExplainer

In [170]:
classes = [0,1] 
classes

In [139]:
tabular_explainer = TabularExplainer(explain_model, test[X], features=train[X].columns.values, classes=classes)

In [140]:
# Note this step take a long time to run!
global_explanation = tabular_explainer.explain_global(test[X])

In [145]:
# unsorted feature shap values for all features and all data points in the training data; reflects the original feature order

print(len(global_explanation.local_importance_values))
print(len(global_explanation.local_importance_values[0]))
print(type(global_explanation.local_importance_values[0]))
global_explanation.local_importance_values[0][0]

2
16221
<class 'list'>


## Saving the global_explanation.local_importance_values for later use

In [149]:
localexplaindf = pd.DataFrame(global_explanation.local_importance_values[1])

In [150]:
localexplaindf.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223
0,-0.01,0.0,-0.0,0.01,-0.0,0.0,-0.07,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.0,-0.0,0.0,-0.01,-0.0,0.0,-0.07,-0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,-0.01,-0.0,0.03,0.0,-0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,-0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.01,0.0,-0.0,-0.02,-0.0,0.0,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02,0.0,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.01,-0.01,0.01,0.01,-0.0,0.0,-0.08,-0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
localexplaindf.to_csv('ReadmitYesExplain.csv')

In [152]:
localexplain_no_df = pd.DataFrame(global_explanation.local_importance_values[0])

In [154]:
localexplain_no_df.to_csv('ReadmitNoExplain.csv')

In [156]:
import pickle

In [157]:
with open('global_explanation.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(global_explanation.local_importance_values, f, pickle.HIGHEST_PROTOCOL)

In [158]:
with open('global_explanation.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    myexplanationdata = pickle.load(f)

In [160]:
print(len(myexplanationdata))
print(len(myexplanationdata[0]))
print(type(myexplanationdata[0]))
myexplanationdata[0][0]

2
16221
<class 'list'>


## Checking the global importance values

In [171]:
sorted_global_importance_values = global_explanation.get_ranked_global_values()
sorted_global_importance_names = global_explanation.get_ranked_global_names()

print('sorted global importance values: {}'.format(sorted_global_importance_values))
print('sorted global importance names: {}'.format(sorted_global_importance_names))

sorted global importance values: [0.07520926497852265, 0.009818214038844697, 0.009795514711674638, 0.008835057848822773, 0.008508543544864456, 0.00822354029404887, 0.006994728702400003, 0.006079999101156303, 0.004827963878245382, 0.004709573024116339, 0.004203094580776454, 0.0038425836293325263, 0.003703982182238702, 0.0032907226211641894, 0.0028840433899796365, 0.002853674007420738, 0.0025057005687768373, 0.0021586577225460035, 0.0021504886147014042, 0.002128423821091786, 0.0020297374182013843, 0.0020275226487959006, 0.001889068616516817, 0.00152975334585118, 0.0015162496965315474, 0.0014816566233089288, 0.0014005748928385873, 0.0013754089568021946, 0.0013248882534694641, 0.00124970623101172, 0.0012432449941682457, 0.0011578458531866502, 0.0011261097592575769, 0.0009645051022838162, 0.0009492837187660105, 0.0009388007444040007, 0.0009332326907642941, 0.0008960147613386113, 0.0008413008624645391, 0.0008384891747306182, 0.0008337052975257519, 0.0007973782852224618, 0.0007329811000339886

sorted global importance names: ['number_inpatient', 'num_medications', 'discharge_disposition_id_1', 'admission_source_id_7', 'number_outpatient', 'number_emergency', 'number_diagnoses', 'medical_specialty_q', 'time_in_hospital', 'diabetesMed_No', 'discharge_disposition_id_11', 'discharge_disposition_id_22', 'num_lab_procedures', 'metformin_No', 'diag_1_428', 'num_procedures', 'insulin_Down', 'diag_3_250', 'age_50_60_', 'metformin_Steady', 'admission_type_id_2', 'discharge_disposition_id_3', 'race_Caucasian', 'diag_3_Other', 'age_80_90_', 'payer_code_MC', 'admission_source_id_1', 'discharge_disposition_id_13', 'insulin_Up', 'diag_2_250', 'age_70_80_', 'A1Cresult_Norm', 'diag_3_403', 'payer_code_q', 'insulin_No', 'diag_3_585', 'discharge_disposition_id_14', 'A1Cresult_None', 'insulin_Steady', 'diabetesMed_Yes', 'payer_code_BC', 'admission_type_id_1', 'admission_source_id_4', 'diag_1_486', 'diag_3_401', 'diag_3_250_6', 'age_60_70_', 'diag_1_786', 'payer_code_SP', 'payer_code_HM', 'race_