### Imports

In [2]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
sns.set_palette("tab20")

#!pip install imblearn
# !pip uninstall scikit-learn
# !pip install scikit-learn==1.2.2
#from imblearn.over_sampling import SMOTENC
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import  LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score
from tqdm import tqdm
import random
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold

import category_encoders as ce
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
sns.set_palette("tab20")

random_state = 42
np.random.seed(42)

## Additional Preprocessing for Multiclass

In [3]:
df_train = pd.read_csv('Data/df_train_preprocessed.csv')

In [4]:
#encoding for multiclass variable
mapping = {'No': 0, '>30 days': 1, '<30 days': 2}
df_train['readmitted_multiclass'] = df_train['readmitted_multiclass'].map(mapping)

### Admission_Source

In [5]:
#Fill NA's with Unknown
df_train['admission_source'] = df_train['admission_source'].fillna("Unknown")

In [6]:
#Calculate the overall proportion for each class in 'readmitted_multiclass'
overall_proportion = df_train['readmitted_multiclass'].value_counts(normalize=True)
overall_proportion

0    0.539123
1    0.349276
2    0.111601
Name: readmitted_multiclass, dtype: float64

In [7]:
#Calculate the proportion of each class within each 'admission_source' and create columns
counts_per_admission_source = df_train.groupby(['admission_source', 'readmitted_multiclass']).size()
total_counts_per_admission_source = df_train['admission_source'].value_counts()
admission_source_proportion = counts_per_admission_source.div(total_counts_per_admission_source, level='admission_source').unstack(fill_value=0)
admission_source_proportion

readmitted_multiclass,0,1,2
admission_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Court/Law Enforcement,0.636364,0.181818,0.181818
Emergency Room,0.506982,0.376398,0.11662
Extramural Birth,1.0,0.0,0.0
Not Available,0.772727,0.147727,0.079545
Physician Referral,0.569881,0.324935,0.105184
Sick Baby,1.0,0.0,0.0
Transfer from Ambulatory Surgery Center,1.0,0.0,0.0
Transfer from a Skilled Nursing Facility (SNF),0.589916,0.282353,0.127731
Transfer from another health care facility,0.727273,0.175416,0.097311
Transfer from critial access hospital,0.857143,0.142857,0.0


In [8]:
df_train['admission_source'].value_counts()

 Emergency Room                                               40319
 Physician Referral                                           20678
Unknown                                                        4825
Transfer from a hospital                                       2230
 Transfer from another health care facility                    1562
Clinic Referral                                                 779
 Transfer from a Skilled Nursing Facility (SNF)                 595
HMO Referral                                                    129
 Not Available                                                   88
 Court/Law Enforcement                                           11
 Transfer from hospital inpt/same fac reslt in a sep claim        8
 Transfer from critial access hospital                            7
 Transfer from Ambulatory Surgery Center                          2
 Extramural Birth                                                 1
Normal Delivery                                 

Group all small categories into 'other'

In [9]:
admission_source_proportion_reset = admission_source_proportion.reset_index()
df_train = df_train.merge(admission_source_proportion_reset, on='admission_source', how='left')

df_train.rename(columns={0: 'admission_source_0', 1:'admission_source_1', 2:'admission_source_2'}, inplace=True)

In [10]:
#create bins comparing the average of readmitted multiclass and the average of each class in admission source
df_train['admission_source_0_high'] = df_train['admission_source_0'].apply(lambda x: 1 
                                                                           if x >= 0.6
                                                                            else 0)

df_train['admission_source_0_low'] = df_train['admission_source_0'].apply(lambda x: 1 
                                                                           if x < 0.5
                                                                            else 0)

df_train['admission_source_1_high'] = df_train['admission_source_1'].apply(lambda x: 1 
                                                                           if x >= 0.4
                                                                            else 0)

df_train['admission_source_1_low'] = df_train['admission_source_1'].apply(lambda x: 1 
                                                                           if x < 0.3
                                                                            else 0)

df_train['admission_source_2_high'] = df_train['admission_source_2'].apply(lambda x: 1 
                                                                           if x >= 0.12
                                                                            else 0)

df_train['admission_source_2_low'] = df_train['admission_source_2'].apply(lambda x: 1 
                                                                           if x < 0.1
                                                                            else 0)

In [11]:
print(df_train['admission_source_0_high'].value_counts())
print(df_train['admission_source_0_low'].value_counts())
print(df_train['admission_source_1_high'].value_counts())
print(df_train['admission_source_1_low'].value_counts())
print(df_train['admission_source_2_high'].value_counts())
print(df_train['admission_source_2_low'].value_counts())


0    66546
1     4690
Name: admission_source_0_high, dtype: int64
0    71236
Name: admission_source_0_low, dtype: int64
0    71236
Name: admission_source_1_high, dtype: int64
0    65951
1     5285
Name: admission_source_1_low, dtype: int64
0    70493
1      743
Name: admission_source_2_high, dtype: int64
0    67344
1     3892
Name: admission_source_2_low, dtype: int64


#### Medical_specialty

In [12]:
# Replacing missing values in 'medical_specialty' with a placeholder 'Unknown'
df_train['medical_specialty'].fillna('Unknown', inplace=True)

In [13]:
#calculate the proportion of each class within each 'medical_specialty' and create columns
counts_per_medical_specialty = df_train.groupby(['medical_specialty', 'readmitted_multiclass']).size()
total_counts_per_medical_specialty = df_train['medical_specialty'].value_counts()
medical_specialty_proportion = counts_per_medical_specialty.div(total_counts_per_medical_specialty, level='medical_specialty').unstack(fill_value=0)
medical_specialty_proportion

readmitted_multiclass,0,1,2
medical_specialty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AllergyandImmunology,0.166667,0.333333,0.500000
Anesthesiology,0.800000,0.200000,0.000000
Anesthesiology-Pediatric,0.666667,0.333333,0.000000
Cardiology,0.572659,0.347955,0.079386
Cardiology-Pediatric,0.200000,0.600000,0.200000
...,...,...,...
Surgery-Thoracic,0.614458,0.289157,0.096386
Surgery-Vascular,0.498630,0.353425,0.147945
SurgicalSpecialty,0.640000,0.280000,0.080000
Unknown,0.518441,0.365701,0.115858


In [14]:
#reset the index
medical_specialty_proportion_reset = medical_specialty_proportion.reset_index()
df_train = df_train.merge(medical_specialty_proportion_reset, on='medical_specialty', how='left')

#rename the columns
df_train.rename(columns={0: 'medical_specialty_0', 1: 'medical_specialty_1', 2: 'medical_specialty_2'}, inplace=True)

In [15]:
#create bins comparing the proportion of each readmitted multiclass and the proportion of each class in medical specialty
df_train['medical_specialty_0_high'] = df_train['medical_specialty_0'].apply(lambda x: 1 
                                                                           if x >= 0.6
                                                                            else 0)

df_train['medical_specialty_0_low'] = df_train['medical_specialty_0'].apply(lambda x: 1 
                                                                           if x < 0.5
                                                                            else 0)

df_train['medical_specialty_1_high'] = df_train['medical_specialty_1'].apply(lambda x: 1 
                                                                           if x >= 0.4
                                                                            else 0)

df_train['medical_specialty_1_low'] = df_train['medical_specialty_1'].apply(lambda x: 1 
                                                                           if x < 0.3
                                                                            else 0)

df_train['medical_specialty_2_high'] = df_train['medical_specialty_2'].apply(lambda x: 1 
                                                                           if x >= 0.12
                                                                            else 0)

df_train['medical_specialty_2_low'] = df_train['medical_specialty_2'].apply(lambda x: 1 
                                                                           if x < 0.1
                                                                            else 0)


#### Discharge_disposition

In [16]:
df_train['discharge_disposition'] = df_train['discharge_disposition'].fillna("Unknown")

In [17]:
# Same logic as before, calculate the proportion of each class within each 'discharge_disposition' and create columns accordingly
counts_per_discharge_disposition = df_train.groupby(['discharge_disposition', 'readmitted_multiclass']).size()
total_counts_per_discharge_disposition = df_train['discharge_disposition'].value_counts()

discharge_disposition_proportion = counts_per_discharge_disposition.div(total_counts_per_discharge_disposition, level='discharge_disposition').unstack(fill_value=0)
discharge_disposition_proportion

readmitted_multiclass,0,1,2
discharge_disposition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Admitted as an inpatient to this hospital,0.384615,0.076923,0.538462
Discharged to home,0.550099,0.357156,0.092744
Discharged/transferred to ICF,0.532399,0.343257,0.124343
Discharged/transferred to SNF,0.499387,0.354397,0.146217
Discharged/transferred to a federal health care facility.,1.0,0.0,0.0
Discharged/transferred to a long term care hospital.,0.592857,0.332143,0.075
Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.,0.5,0.34375,0.15625
Discharged/transferred to another rehab fac including rehab units of a hospital .,0.468055,0.25341,0.278536
Discharged/transferred to another short term hospital,0.524194,0.310484,0.165323
Discharged/transferred to another type of inpatient care institution,0.507299,0.283455,0.209246


In [18]:
df_train[['discharge_disposition', 'readmitted_multiclass']].groupby(['discharge_disposition', 'readmitted_multiclass']).value_counts().unstack()

readmitted_multiclass,0,1,2
discharge_disposition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Admitted as an inpatient to this hospital,5.0,1.0,7.0
Discharged to home,23245.0,15092.0,3919.0
Discharged/transferred to ICF,304.0,196.0,71.0
Discharged/transferred to SNF,4884.0,3466.0,1430.0
Discharged/transferred to a federal health care facility.,3.0,,
Discharged/transferred to a long term care hospital.,166.0,93.0,21.0
Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.,16.0,11.0,5.0
Discharged/transferred to another rehab fac including rehab units of a hospital .,652.0,353.0,388.0
Discharged/transferred to another short term hospital,780.0,462.0,246.0
Discharged/transferred to another type of inpatient care institution,417.0,233.0,172.0


In [19]:
discharge_disposition_proportion_reset = discharge_disposition_proportion.reset_index()

df_train = df_train.merge(discharge_disposition_proportion_reset, on='discharge_disposition', how='left')

df_train.rename(columns={0: 'discharge_disposition_0', 1: 'discharge_disposition_1', 2: 'discharge_disposition_2'}, inplace=True)

In [20]:
#create bins comparing the proportion of each readmitted multiclass and the proportion of each class in discharge disposition
df_train['discharge_disposition_0_high'] = df_train['discharge_disposition_0'].apply(lambda x: 1 
                                                                           if x >= 0.6
                                                                            else 0)

df_train['discharge_disposition_0_low'] = df_train['discharge_disposition_0'].apply(lambda x: 1 
                                                                           if x < 0.5
                                                                            else 0)

df_train['discharge_disposition_1_high'] = df_train['discharge_disposition_1'].apply(lambda x: 1 
                                                                           if x >= 0.4
                                                                            else 0)

df_train['discharge_disposition_1_low'] = df_train['discharge_disposition_1'].apply(lambda x: 1 
                                                                           if x < 0.3
                                                                            else 0)

df_train['discharge_disposition_2_high'] = df_train['discharge_disposition_2'].apply(lambda x: 1 
                                                                           if x >= 0.12
                                                                            else 0)

df_train['discharge_disposition_2_low'] = df_train['discharge_disposition_2'].apply(lambda x: 1 
                                                                           if x < 0.1
                                                                            else 0)



In [21]:
print(df_train['discharge_disposition_0_high'].value_counts())
print(df_train['discharge_disposition_0_low'].value_counts())
print(df_train['discharge_disposition_1_high'].value_counts())
print(df_train['discharge_disposition_1_low'].value_counts())
print(df_train['discharge_disposition_2_high'].value_counts())
print(df_train['discharge_disposition_2_low'].value_counts())



0    69564
1     1672
Name: discharge_disposition_0_high, dtype: int64
0    50394
1    20842
Name: discharge_disposition_0_low, dtype: int64
0    62222
1     9014
Name: discharge_disposition_1_high, dtype: int64
0    63923
1     7313
Name: discharge_disposition_1_low, dtype: int64
0    44217
1    27019
Name: discharge_disposition_2_high, dtype: int64
1    44217
0    27019
Name: discharge_disposition_2_low, dtype: int64


### Diagnosis variables 

Primary Diagnosis

In [22]:
#same logic as before, calculate the proportion of each class within each 'primary diagnosis' and create columns accordingly
counts_per_primary_diagnosis = df_train.groupby(['primary_diagnosis', 'readmitted_multiclass']).size()
total_counts_per_primary_diagnosis = df_train['primary_diagnosis'].value_counts()

primary_diagnosis_proportion = counts_per_primary_diagnosis.div(total_counts_per_primary_diagnosis, level='primary_diagnosis').unstack(fill_value=0)
primary_diagnosis_proportion

readmitted_multiclass,0,1,2
primary_diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,1.000000,0.000000,0.000000
11,0.857143,0.142857,0.000000
110,0.000000,1.000000,0.000000
112,0.456140,0.350877,0.192982
114,0.000000,1.000000,0.000000
...,...,...,...
V56,0.500000,0.416667,0.083333
V57,0.512167,0.351101,0.136732
V58,0.286667,0.313333,0.400000
V63,0.800000,0.200000,0.000000


In [23]:
primary_diagnosis_proportion_reset = primary_diagnosis_proportion.reset_index()

df_train = df_train.merge(primary_diagnosis_proportion_reset, on='primary_diagnosis', how='left')

df_train.rename(columns={0: 'primary_diagnosis_0', 1: 'primary_diagnosis_1', 2: 'primary_diagnosis_2'}, inplace=True)

In [24]:
#Create bins comparing the proportion of each readmitted multiclass and the proportion of each class in primary diagnosis
df_train['primary_diagnosis_0_high'] = df_train['primary_diagnosis_0'].apply(lambda x: 1 
                                                                           if x >= 0.6
                                                                            else 0)

df_train['primary_diagnosis_0_low'] = df_train['primary_diagnosis_0'].apply(lambda x: 1 
                                                                           if x < 0.5
                                                                            else 0)

df_train['primary_diagnosis_1_high'] = df_train['primary_diagnosis_1'].apply(lambda x: 1 
                                                                           if x >= 0.4
                                                                            else 0)

df_train['primary_diagnosis_1_low'] = df_train['primary_diagnosis_1'].apply(lambda x: 1 
                                                                           if x < 0.3
                                                                            else 0)

df_train['primary_diagnosis_2_high'] = df_train['primary_diagnosis_2'].apply(lambda x: 1 
                                                                           if x >= 0.12
                                                                            else 0)

df_train['primary_diagnosis_2_low'] = df_train['primary_diagnosis_2'].apply(lambda x: 1 
                                                                           if x < 0.1
                                                                            else 0)

Secondary Diagnosis

In [25]:
# Same logic as before, calculate the proportion of each class within each 'discharge_disposition' and create columns accordingly
counts_per_secondary_diagnosis = df_train.groupby(['secondary_diagnosis', 'readmitted_multiclass']).size()
total_counts_per_secondary_diagnosis = df_train['secondary_diagnosis'].value_counts()

secondary_diagnosis_proportion = counts_per_secondary_diagnosis.div(total_counts_per_secondary_diagnosis, level='secondary_diagnosis').unstack(fill_value=0)
secondary_diagnosis_proportion_reset = secondary_diagnosis_proportion.reset_index()
secondary_diagnosis_proportion

readmitted_multiclass,0,1,2
secondary_diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,0.333333,0.333333,0.333333
110,0.428571,0.428571,0.142857
111,1.000000,0.000000,0.000000
112,0.521429,0.342857,0.135714
114,0.000000,0.000000,1.000000
...,...,...,...
V69,1.000000,0.000000,0.000000
V70,0.600000,0.400000,0.000000
V72,0.800000,0.200000,0.000000
V85,0.628319,0.247788,0.123894


In [26]:
df_train = df_train.merge(secondary_diagnosis_proportion_reset, on='secondary_diagnosis', how='left')

df_train.rename(columns={0: 'secondary_diagnosis_0', 1: 'secondary_diagnosis_1', 2: 'secondary_diagnosis_2'}, inplace=True)

In [27]:
#Create bins comparing the proportion of each readmitted multiclass and the proportion of each class in primary diagnosis
df_train['secondary_diagnosis_0_high'] = df_train['secondary_diagnosis_0'].apply(lambda x: 1 
                                                                           if x >= 0.6
                                                                            else 0)

df_train['secondary_diagnosis_0_low'] = df_train['secondary_diagnosis_0'].apply(lambda x: 1 
                                                                           if x < 0.5
                                                                            else 0)

df_train['secondary_diagnosis_1_high'] = df_train['secondary_diagnosis_1'].apply(lambda x: 1 
                                                                           if x >= 0.4
                                                                            else 0)

df_train['secondary_diagnosis_1_low'] = df_train['secondary_diagnosis_1'].apply(lambda x: 1 
                                                                           if x < 0.3
                                                                            else 0)

df_train['secondary_diagnosis_2_high'] = df_train['secondary_diagnosis_2'].apply(lambda x: 1 
                                                                           if x >= 0.12
                                                                            else 0)

df_train['secondary_diagnosis_2_low'] = df_train['secondary_diagnosis_2'].apply(lambda x: 1 
                                                                           if x < 0.1
                                                                            else 0)


Additional Diagnosis

In [28]:
# Same logic as before, calculate the proportion of each class within each 'additional_diagnosis' and create columns accordingly
counts_per_additional_diagnosis = df_train.groupby(['additional_diagnosis', 'readmitted_multiclass']).size()
total_counts_per_additional_diagnosis = df_train['additional_diagnosis'].value_counts()

additional_diagnosis_proportion = counts_per_additional_diagnosis.div(total_counts_per_additional_diagnosis, level='additional_diagnosis').unstack(fill_value=0)
additional_diagnosis_proportion

readmitted_multiclass,0,1,2
additional_diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,0.500000,0.000000,0.500000
110,0.666667,0.250000,0.083333
112,0.539007,0.361702,0.099291
115,1.000000,0.000000,0.000000
117,0.200000,0.400000,0.400000
...,...,...,...
V66,0.666667,0.250000,0.083333
V70,1.000000,0.000000,0.000000
V72,0.666667,0.166667,0.166667
V85,0.666667,0.231884,0.101449


In [29]:

additional_diagnosis_proportion_reset = additional_diagnosis_proportion.reset_index()

df_train = df_train.merge(additional_diagnosis_proportion_reset, on='additional_diagnosis', how='left')

df_train.rename(columns={0: 'additional_diagnosis_0', 1: 'additional_diagnosis_1', 2: 'additional_diagnosis_2'}, inplace=True)

In [30]:
#Create bins comparing the proportion of each readmitted multiclass and the proportion of each class in primary diagnosis
df_train['additional_diagnosis_0_high'] = df_train['additional_diagnosis_0'].apply(lambda x: 1 
                                                                           if x >= 0.6
                                                                            else 0)

df_train['additional_diagnosis_0_low'] = df_train['additional_diagnosis_0'].apply(lambda x: 1 
                                                                           if x < 0.5
                                                                            else 0)

df_train['additional_diagnosis_1_high'] = df_train['additional_diagnosis_1'].apply(lambda x: 1 
                                                                           if x >= 0.4
                                                                            else 0)

df_train['additional_diagnosis_1_low'] = df_train['additional_diagnosis_1'].apply(lambda x: 1 
                                                                           if x < 0.3
                                                                            else 0)

df_train['additional_diagnosis_2_high'] = df_train['additional_diagnosis_2'].apply(lambda x: 1 
                                                                           if x >= 0.12
                                                                            else 0)

df_train['additional_diagnosis_2_low'] = df_train['additional_diagnosis_2'].apply(lambda x: 1 
                                                                           if x < 0.1
                                                                            else 0)

## Model and feature selection

### Set train and test data

In [None]:

#replace columns names 

categorical_columns = ['race',
       'payer_code',  'admission_type',
       'medical_specialty',  'discharge_disposition',
       'admission_source', 'primary_diagnosis',
       'secondary_diagnosis', 'additional_diagnosis', 
       'change_in_meds_during_hospitalization', 'prescribed_diabetes_meds',
       # 'readmitted_binary', 'readmitted_multiclass',
       'has_weight', 'race_Asian', 'race_Caucasian', 'race_Hispanic',
       'race_Other', 'race_nan', 'payer_code_HM', 'payer_code_MC',
       'payer_code_MD', 'payer_code_No provider', 'payer_code_Other provider',
       'payer_code_SP', 
       'more_than_one_lab_test', 'high_admission_source',
       'medium_admission_source', 'low_admission_source',
       'admission_type_big_share', 'admission_type_big_share_Elective',
       'admission_type_big_share_Emergency', 'admission_type_big_share_Urgent',
       'no_medical_specialty', 'high_medical_specialty',
       'medium_medical_specialty', 'low_medical_specialty',
       'discharged_home_hospice', 'expired', 'very_high_discharge_disposition', 'high_discharge_disposition',
       'medium_discharge_disposition', 'primary_diagnosis_category',
       'secondary_diagnosis_category', 'additional_diagnosis_category',
       'high_primary_diagnosis_category', 'medium_primary_diagnosis_category',
       'low_primary_diagnosis_category', 'high_secondary_diagnosis_category',
       'medium_secondary_diagnosis_category',
       'low_secondary_diagnosis_category',
       'high_additional_diagnosis_category',
       'medium_additional_diagnosis_category',
       'low_additional_diagnosis_category', 'diabetes_diagnosis',
       'diabetes_diagnosis_4_digits', 'diabetes_type',
       'diabetes_severity_group', 'diabetes_severity_group_Mild',
       'diabetes_severity_group_Moderate', 'diabetes_severity_group_Severe',
       'diabetes_type_I - controlled', 'diabetes_type_I - uncontrolled',
       'diabetes_type_II - controlled', 'diabetes_type_II - uncontrolled',
       'glucose_test_performed', 'a1c_test_performed', 
       'metformin', 'rosiglitazone', 'glyburide', 'insulin', 'glipizide',
       'repaglinide', 'glimepiride', 'pioglitazone',
        'high_primary_diagnosis',
       'medium_primary_diagnosis', 'low_primary_diagnosis',
       'high_secondary_diagnosis', 'medium_secondary_diagnosis',
       'low_secondary_diagnosis', 'high_additional_diagnosis',
       'medium_additional_diagnosis', 'low_additional_diagnosis']

df_train[categorical_columns] = df_train[categorical_columns].astype('category')

In [61]:
columns_to_use = ['encounter_id', 'patient_id', 
                  
                  'gender', 'age', 'has_weight',
                  
                  'outpatient_visits_in_previous_year',
                  'emergency_visits_in_previous_year',
                  'inpatient_visits_in_previous_year', 
                  'total_visits', 'max_visits_of_one_type',
                  
                  'average_pulse_bpm', 
                  'length_of_stay_in_hospital', 

                  'medical_specialty_0_high', 'medical_specialty_0_low', 'medical_specialty_1_high', 
                  'medical_specialty_1_low', 'medical_specialty_2_high', 'medical_specialty_2_low',

                  'admission_source_0_high',	'admission_source_0_low',	'admission_source_1_high',	'admission_source_1_low',
                  'admission_source_2_high',	'admission_source_2_low',

                  'discharge_disposition_0_high', 'discharge_disposition_0_low',
                  'discharge_disposition_1_high', 'discharge_disposition_1_low',
                  'discharge_disposition_2_high', 'discharge_disposition_2_low',
                  
                  'primary_diagnosis_2_low', 'primary_diagnosis_2_high','primary_diagnosis_1_low',
                  'primary_diagnosis_1_high','primary_diagnosis_0_low','primary_diagnosis_0_high',

                  'secondary_diagnosis_2_low', 'secondary_diagnosis_2_high','secondary_diagnosis_1_low',
                  'secondary_diagnosis_1_high','secondary_diagnosis_0_low','secondary_diagnosis_0_high',                   
                   
                  'additional_diagnosis_0_high', 'additional_diagnosis_0_low',
                  'additional_diagnosis_1_high', 'additional_diagnosis_1_low',
                  'additional_diagnosis_2_high', 'additional_diagnosis_2_low',
                                
                  'number_lab_tests',
                  'non_lab_procedures', 
                  'number_of_medications', 
                  'number_diagnoses',
                  
                  'glucose_test_performed', 'glucose_test_result',
                   'a1c_test_result',
        
                  'change_in_meds_during_hospitalization', 
                  'prescribed_diabetes_meds', 
                  
                  'race_Asian', 'race_Caucasian', 'race_Hispanic',
                  'race_Other', 'race_nan', 
                  
                  'payer_code_No provider',
                  'payer_code_Other provider',        
                   
                  'diabetes_severity_group_Mild', 'diabetes_severity_group_Moderate', 'diabetes_severity_group_Severe',
                  
                  'diabetes_type_I - controlled', 'diabetes_type_I - uncontrolled',
                  'diabetes_type_II - controlled', 'diabetes_type_II - uncontrolled',
                  
                  'glimepiride', 'pioglitazone', 'repaglinide', 'metformin', 'glyburide',
                  'rosiglitazone', 'glipizide', 'insulin']
                 

In [62]:
data = df_train[columns_to_use]
target = df_train['readmitted_multiclass']
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

### Functions for Model

In [40]:
def target_encoder(fit_data, transform_data, target, features_for_encoding, smoothing=100):
    target_encoder = ce.TargetEncoder(cols=features_for_encoding, smoothing=smoothing)  
    target_encoder.fit(fit_data[features_for_encoding], fit_data[target].astype(int))
    encoded_df = target_encoder.transform(transform_data[features_for_encoding])
    features_for_encoding_pass = [i + '_target' for i in features_for_encoding]
    transform_data[features_for_encoding_pass] = np.array(encoded_df)
    encoded_df = transform_data.drop(columns = features_for_encoding)
    # encoded_df =  pd.concat((transform_data[transform_data.columns.difference(features_for_encoding)],
    #                     pd.DataFrame(np.array(encoded_df), columns = features_for_encoding_pass)), axis=1)
    return encoded_df, features_for_encoding_pass

In [41]:
def scaler(fit_data, transform_data, features_for_scaling, scaling_type = 'minmax'):
    if scaling_type == 'minmax':
        sc = MinMaxScaler()
        sc.fit(fit_data[features_for_scaling])
        scaled_df = sc.transform(transform_data[features_for_scaling])
        scaled_df = pd.concat((transform_data[transform_data.columns.difference(features_for_scaling)],
                        pd.DataFrame(scaled_df, columns = features_for_scaling)), axis=1) 
        return scaled_df
    if scaling_type == 'standard':
        sc = StandardScaler()
        sc.fit(fit_data[features_for_scaling])
        scaled_df = sc.transform(transform_data[features_for_scaling])
        scaled_df = pd.concat((transform_data[transform_data.columns.difference(features_for_scaling)],
                        pd.DataFrame(scaled_df, columns = features_for_scaling)), axis=1)
        return scaled_df

In [42]:
def patient_features(df_train, df_test, multiclass = False):
    df_all = pd.concat([df_train, df_test])

    list_for_scaling = []
    
    # total visits of one patient in dataset, excluding current visit
    patient_total_visits = df_train[['patient_id', 'encounter_id']].groupby('patient_id').count().rename(columns={'encounter_id': 'patient_total_visits'}).reset_index()
    patient_total_visits['patient_total_visits'] = (patient_total_visits['patient_total_visits'] - 1).astype('int')
    df_train = pd.merge(df_train, patient_total_visits, how = 'left', on = 'patient_id')
    df_train['patient_total_visits'].fillna(0, inplace = True)

    patient_total_visits = df_all[['patient_id', 'encounter_id']].groupby('patient_id').count().rename(columns={'encounter_id': 'patient_total_visits'}).reset_index()
    patient_total_visits['patient_total_visits'] = (patient_total_visits['patient_total_visits'] - 1).astype('int')
    df_test = pd.merge(df_test, patient_total_visits, how = 'left', on = 'patient_id')
    df_test['patient_total_visits'].fillna(0, inplace = True)

    list_for_scaling.append('patient_total_visits')

    if multiclass == False:
        # check if we know that this person was readmitted on any other visit we know and calculate number of readmissions
        was_readmitted_encounter = df_train[['patient_id', 
                                   'encounter_id',
                                   'readmitted_binary']].groupby(['patient_id','encounter_id']).max(numeric_only=True).rename(columns={'readmitted_binary': 'was_readmitted_encounter'}).reset_index()
        was_readmitted_patient = df_train[['patient_id', 
                                       'readmitted_binary']].groupby(['patient_id']).sum(numeric_only=True).rename(columns={'readmitted_binary': 'was_readmitted_patient'}).reset_index()
        was_readmitted = pd.merge(was_readmitted_encounter, was_readmitted_patient, on = 'patient_id')
        was_readmitted['patient_num_readmitted'] = (was_readmitted['was_readmitted_patient'] - was_readmitted['was_readmitted_encounter']).astype('int')
        was_readmitted['patient_was_readmitted'] = was_readmitted['patient_num_readmitted'].apply(lambda x: 1 if x > 0 else 0).astype('category')
        df_train = pd.merge(df_train, was_readmitted[['patient_num_readmitted', 'patient_was_readmitted', 'encounter_id']], how = 'left', on = 'encounter_id')
        df_train[['patient_num_readmitted', 'patient_was_readmitted']] = df_train[['patient_num_readmitted', 'patient_was_readmitted']].fillna(0)
    
        df_test = pd.merge(df_test, was_readmitted[['patient_num_readmitted', 'patient_was_readmitted', 'encounter_id']], how = 'left', on = 'encounter_id')
        df_test[['patient_num_readmitted', 'patient_was_readmitted']] = df_test[['patient_num_readmitted', 'patient_was_readmitted']].fillna(0).astype(int)
        list_for_scaling.append('patient_num_readmitted')

    return df_train, df_test, list_for_scaling

In [43]:
def upsampler(data, 
              target,  
              upsample_type = 'simple', 
              upsample_size = 1):
    majority_class_label = data[target].value_counts().idxmax()
    minority_class_labels = data[target].value_counts().index.drop(majority_class_label)
    balanced_data = pd.DataFrame()
    if upsample_type == 'simple':
        for label in minority_class_labels:
            if len(data[data[target] == majority_class_label])*upsample_size > len(data[data[target] == label]):
                minority_upsampled = resample(data[data[target] == label], replace=True, n_samples=int(len(data[data[target] == majority_class_label])*upsample_size), 
                                                  random_state = random_state)
                balanced_data = pd.concat([balanced_data, minority_upsampled])
            else:
                balanced_data = pd.concat([balanced_data, data[data[target] == label]])
        balanced_data = pd.concat([balanced_data, data[data[target] == majority_class_label]])
    if upsample_type == 'SMOTE':
        X = data.drop(columns = target)
        categorical_features = X.select_dtypes(include='category').columns
        cat_ind = [X.columns.get_loc(col) for col in categorical_features]
        smote_nc = SMOTENC(categorical_features=cat_ind, random_state=random_state)
        X_resampled, y_resampled = smote_nc.fit_resample(X, data[target])  
        balanced_data = pd.concat([X_resampled, y_resampled], axis = 1)
    return balanced_data

In [70]:
def cross_validation(data, 
                     target, 
                     model, 
                     params, 
                     multiclass = False,
                     
                     scaling = False,
                     features_for_scaling_minmax = [],
                     features_for_scaling_standard = [],

                     target_encoding = False,
                     features_for_encoding = [],
                     smoothing = 100,
                     
                     upsample = False, 
                     upsample_size = 1,
                     upsample_type = 'simple', 
                     cv=5
                     ):
    # Defining some lists to collect data
    feature_imp = []
    f1 = []
    confusion_matrix_list = []
    precision_score_list = []
    recall_score_list = []
    roc_auc_score_list = []

    # A bit of resampling just to mix data
    data = data.sample(frac=1, random_state=random_state)
    data[target] = data[target].astype(int)
    

    # create stratified folds
    skf = StratifiedKFold(n_splits=cv, shuffle = True, random_state = random_state)
    split = list(skf.split(data.drop(columns = target), data[target]))
    all_train_index = [i[0] for i in split]
    all_val_index = [i[1] for i in split]
    
    for i in tqdm(range(cv)):
        val_index = all_val_index[i]
        train_index = all_train_index[i]
        train_data, val_data = data.loc[train_index], data.loc[val_index]

        #Target encoding
        if target_encoding == True:
                val_data = target_encoder(train_data, val_data, features_for_encoding = features_for_encoding, target = target, smoothing = smoothing)[0]
                train_data, features_for_encoding_pass = target_encoder(train_data, train_data, features_for_encoding = features_for_encoding, target = target, smoothing = smoothing)
                features_for_scaling_minmax_full = features_for_scaling_minmax + list(features_for_encoding_pass)
        else:
            features_for_scaling_minmax_full = features_for_scaling_minmax

        #creating features on patient level that we will use
        train_data, val_data, pat_features_scale = patient_features(train_data, val_data, multiclass = multiclass)
        features_for_scaling_minmax_full = features_for_scaling_minmax_full + pat_features_scale
 
        # Scaling features
        if scaling == True:
            if len(features_for_scaling_minmax) > 0:
                val_data = scaler(train_data, val_data, features_for_scaling_minmax_full, scaling_type = 'minmax')
                train_data = scaler(train_data, train_data, features_for_scaling_minmax_full, scaling_type = 'minmax')
            if len(features_for_scaling_standard) > 0:
                val_data = scaler(train_data, val_data, features_for_scaling_standard, scaling_type = 'standard')
                train_data = scaler(train_data, train_data, features_for_scaling_standard, scaling_type = 'standard')
                
        # Upsampling only train data 
        if upsample == True:
            train_data = upsampler(train_data, target, upsample_type = upsample_type,  upsample_size = upsample_size)

        # Defining train and val datasets
        X_train = train_data.drop(target, axis=1).drop(columns = ['encounter_id', 'patient_id'])
        y_train = train_data[target]
        
        X_val = val_data.drop(target, axis=1).drop(columns = ['encounter_id', 'patient_id'])
        y_val = val_data[target]


        # Fit and predict
        model.set_params(**params)
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_val = model.predict(X_val)

        #Feature importances if our model can do this
        try:
            try:
                feature_imp.append(model.feature_importances_)
            except:
                pass
            try:
                feature_imp.append(model.coef_)
            except:
                pass
        except:
            pass

        # Calculating f1 scores and other metrics
        if multiclass == False:
            f1.append([f1_score(y_train, y_pred_train), f1_score(y_val, y_pred_val)])
            recall_score_list.append([recall_score(y_train, y_pred_train), recall_score(y_val, y_pred_val)])
            precision_score_list.append([precision_score(y_train, y_pred_train), precision_score(y_val, y_pred_val)])
            roc_auc_score_list.append([roc_auc_score(y_train, y_pred_train), roc_auc_score(y_val, y_pred_val)])
        if multiclass == True:
            f1.append([f1_score( y_train, y_pred_train, average = 'weighted'), f1_score(y_val, y_pred_val, average = 'weighted')])
            recall_score_list.append([recall_score( y_train, y_pred_train, average = 'weighted'), recall_score(y_val, y_pred_val, average = 'weighted')])
            precision_score_list.append([precision_score(y_train, y_pred_train, average = 'weighted'), precision_score(y_val, y_pred_val, average = 'weighted')])
            # roc_auc_score_list.append([roc_auc_score( y_train, y_pred_train.reshape(-1, 1), multi_class='ovr', average = 'micro'), roc_auc_score(y_val, y_pred_val.reshape(-1, 1), multi_class='ovr', average = 'micro')])

        
        confusion_matrix_list.append([confusion_matrix(y_train, y_pred_train), confusion_matrix(y_val, y_pred_val)])

    # Printing results
    print(model)
    print('F1 train: {:.3f}, F1_val: {:.3f}'.format(np.array(f1).mean(axis = 0)[0],
                                                   np.array(f1).mean(axis = 0)[1]))
    return [model.get_params(), 
            {'f1' : np.array(f1).mean(axis = 0), 
             'recall_score' : np.array(recall_score_list).mean(axis = 0),
             'precision_score' : np.array(precision_score_list).mean(axis = 0),
             'roc_auc_score' : np.array(roc_auc_score_list).mean(axis = 0)
            }, 
            {'f1' : f1, 
             'recall_score' : recall_score_list,
             'precision_score' : precision_score_list,
             'roc_auc_score' : roc_auc_score_list,
             'confusion_matrix': confusion_matrix_list
            },
            X_train.columns, 
            np.median(np.array(feature_imp), axis = 0),
            ]

### Important functions

### Model selection

In [65]:
features_for_scaling_minmax = ['outpatient_visits_in_previous_year',
       'emergency_visits_in_previous_year',
       'inpatient_visits_in_previous_year', 'total_visits',
       'max_visits_of_one_type', 'average_pulse_bpm','length_of_stay_in_hospital',
       'non_lab_procedures', 'number_diagnoses', 'glucose_test_result',
       'a1c_test_result']
features_for_scaling_standard = ['number_lab_tests', 'number_of_medications']

data = pd.concat([X_train, y_train], axis = 1)
data.reset_index(inplace = True)

target = 'readmitted_multiclass'

**Random Forest**

In [71]:
model = RandomForestClassifier(random_state = random_state)
params = {
    'n_estimators': np.random.randint(10, 20, 10),
    'max_depth': np.random.randint(1, 5, 10),
    'min_samples_split': np.random.randint(2, 10, 10),
    'min_samples_leaf': np.random.randint(5, 20, 10),
    # 'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', 'balanced_subsample'],
    'max_features': ['sqrt', 'log2', None],
    # 'max_leaf_nodes': [None, 10, 20]
}

result = []
for i in tqdm(range(20)):
    random_params = {}
    for key, values in params.items():
        random_params[key] = random.choice(values)
    result.append(cross_validation(data, 
                     target, 
                     model,
                     random_params,  
                     multiclass = True,
                     scaling = True,
                     features_for_scaling_minmax = features_for_scaling_minmax,
                     features_for_scaling_standard = features_for_scaling_standard,
                     upsample = False, 
                     upsample_size = 1,
                     cv=5))

  0%|                                                    | 0/20 [00:00<?, ?it/s]
  0%|                                                     | 0/5 [00:00<?, ?it/s][A
 20%|█████████                                    | 1/5 [00:01<00:05,  1.49s/it][A
 40%|██████████████████                           | 2/5 [00:02<00:04,  1.46s/it][A
 60%|███████████████████████████                  | 3/5 [00:04<00:02,  1.47s/it][A
 80%|████████████████████████████████████         | 4/5 [00:05<00:01,  1.45s/it][A
100%|█████████████████████████████████████████████| 5/5 [00:07<00:00,  1.46s/it][A
  'roc_auc_score' : np.array(roc_auc_score_list).mean(axis = 0)
  ret = ret.dtype.type(ret / rcount)
  5%|██▏                                         | 1/20 [00:07<02:19,  7.35s/it]

RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=4, max_features=None, min_samples_leaf=18,
                       min_samples_split=6, n_estimators=16, random_state=42)
F1 train: 0.527, F1_val: 0.537



  0%|                                                     | 0/5 [00:00<?, ?it/s][A
 20%|█████████                                    | 1/5 [00:01<00:05,  1.47s/it][A
 40%|██████████████████                           | 2/5 [00:02<00:04,  1.47s/it][A
 60%|███████████████████████████                  | 3/5 [00:04<00:02,  1.48s/it][A
 80%|████████████████████████████████████         | 4/5 [00:07<00:01,  1.79s/it][A
  5%|██▏                                         | 1/20 [00:14<04:36, 14.53s/it]


KeyboardInterrupt: 

In [None]:
model = RandomForestClassifier(random_state = random_state)
params = {
    'n_estimators': np.random.randint(100, 200, 10),
    'max_depth': np.random.randint(10, 20, 10),
    'min_samples_split': np.random.randint(2, 10, 10),
    'min_samples_leaf': np.random.randint(5, 20, 10),
    # 'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', 'balanced_subsample'],
    'max_features': ['sqrt', 'log2', None]
    # 'max_leaf_nodes': [None, 10, 20]
}

model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = model.predict(X_test)

# Calculate accuracy as an example metric
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print(f1_rf)

0.5531480457912444


**Logistic Regression**

In [None]:
lr_classifier = LogisticRegression(max_iter=900)
lr_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_lr = lr_classifier.predict(X_test)

# Calculate accuracy as an example metric
f1_lr = f1_score(y_test, y_pred_lr, average='weighted')

print(f1_lr)

0.3786433311008068


**MLP CLassifier**

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create and train a Multi-layer Perceptron (MLP) classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred_mlp = mlp_classifier.predict(X_test)

f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')

print(f1_mlp)


0.2992154477669934


**Decision Trees**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

y_pred_dt = dt_classifier.predict(X_test)

f1_dt = f1_score(y_test, y_pred_dt, average='weighted')

print(f1_dt)

0.49647508418342984
