## ML GROUP ASSIGNMENT:
### Predicting readmissions by leveraging "diabetic_data.csv"
MBD Section 2, Group 6

Group members & notebook credit:
- Ignacio Ferro
- Jose Carranque
- Maica Muñoz
- Maria Jose Perez
- Mohammed Alotaibi
- Rodrigo Reyes Sanchez


## Step 0 - Loading key libraries & dataset

In [1]:
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Loading Dataset
df = pd.read_csv("diabetic_readmission_data.csv")
df.shape

(101766, 50)

## Step 1 - Exploratory Data Analysis (EDA)

In [3]:
df.isna().sum()

#Identify number of missing values (represented as '?' sign) for each feature
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())

race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


In [2]:
# Identify number of null values for each feature
df.isna().sum()

#Identify number of missing values (represented as '?' sign) for each feature
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())

# Analyze the distribution of values for Numerical features (identify outliers)
df['time_in_hospital'].describe()
df['num_lab_procedures'].describe()
df['num_medications'].describe()
df['number_outpatient'].describe()
df['number_emergency'].describe()
df['number_inpatient'].describe()
df['number_diagnoses'].describe()

# Analyze the distribution of values for Categorical features (identify outliers)
df['race'].value_counts()
df['gender'].value_counts()
df['age'].value_counts()
df['weight'].value_counts()
df['num_procedures'].value_counts()
df['admission_type_id'].value_counts()
df['discharge_disposition_id'].value_counts()
df['admission_source_id'].value_counts()
df['payer_code'].value_counts()
df['medical_specialty'].value_counts()
df['diag_1'].value_counts()
df['diag_2'].value_counts()
df['diag_3'].value_counts()
df['max_glu_serum'].value_counts()
df['A1Cresult'].value_counts()
df['metformin'].value_counts()
df['repaglinide'].value_counts()
df['nateglinide'].value_counts()
df['chlorpropamide'].value_counts()
df['glimepiride'].value_counts()
df['acetohexamide'].value_counts()
df['glipizide'].value_counts()
df['glyburide'].value_counts()
df['tolbutamide'].value_counts()
df['pioglitazone'].value_counts()
df['rosiglitazone'].value_counts()
df['acarbose'].value_counts()
df['miglitol'].value_counts()
df['troglitazone'].value_counts()
df['tolazamide'].value_counts()
df['examide'].value_counts()
df['citoglipton'].value_counts()
df['insulin'].value_counts()
df['glyburide-metformin'].value_counts()
df['glipizide-metformin'].value_counts()
df['glimepiride-pioglitazone'].value_counts()
df['metformin-rosiglitazone'].value_counts()
df['metformin-pioglitazone'].value_counts()
df['change'].value_counts()
df['diabetesMed'].value_counts()
df['num_procedures'].value_counts()
df['readmitted'].value_counts()


# ---- CODE TO PLOT HISTOGRAMS ----
# plt.figure(figsize=(8, 6))
# plt.hist(df['num_medications'], color='skyblue', edgecolor='black')
# plt.xlabel('Number of Medications')
# plt.ylabel('Frequency')
# plt.title('Distribution of Number of Medications')
# plt.grid(True)
# plt.show()


race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64

## Step 2 - Data Cleaning & Preparation

In [3]:
# ------ CREATE NEW COLUMNS ------

# Create a new column called "encounter_number".
# It counts the number of encounters that each unique "patient_nbr" did until that moment (ordinal encoding)
df['encounter_number'] = df.groupby('patient_nbr').cumcount() + 1

In [4]:
# ------ SIMPLIFICATION OF diag_1, diag_2 and diag_3 ------
# Source & credit: https://www.kaggle.com/code/iabhishekofficial/prediction-on-hospital-readmission?scriptVersionId=14883095&cellId=35

# Categorization of diagnoses: The dataset contained up to three diagnoses for a given patient (primary, secondary and additional).
# However, each of these had 700–900 unique ICD codes and it is extremely difficult to include them in the model and interpret meaningfully.
# Therefore, we collapsed these diagnosis codes into 9 disease categories in an almost similar fashion to that done in the original
# publication using this dataset.
# These 9 categories include Circulatory, Respiratory, Digestive, Diabetes, Injury, Musculoskeletal, Genitourinary, Neoplasms, and Others.

# Although we did this for primary, secondary and additional diagnoses, we eventually decided to use only the primary diagnosis in our model.
# Doing this in python was slightly cumbersome because, well, we are mapping the disease codes to certain category names.
# Below code should demonstrate this easily.


# Creating additional columns for diagnosis
df['level1_diag1'] = df['diag_1']
df['level2_diag1'] = df['diag_1']
df['level1_diag2'] = df['diag_2']
df['level2_diag2'] = df['diag_2']
df['level1_diag3'] = df['diag_3']
df['level2_diag3'] = df['diag_3']

df.loc[df['diag_1'].str.contains('V'), ['level1_diag1', 'level2_diag1']] = 0
df.loc[df['diag_1'].str.contains('E'), ['level1_diag1', 'level2_diag1']] = 0
df.loc[df['diag_2'].str.contains('V'), ['level1_diag2', 'level2_diag2']] = 0
df.loc[df['diag_2'].str.contains('E'), ['level1_diag2', 'level2_diag2']] = 0
df.loc[df['diag_3'].str.contains('V'), ['level1_diag3', 'level2_diag3']] = 0
df.loc[df['diag_3'].str.contains('E'), ['level1_diag3', 'level2_diag3']] = 0
df['level1_diag1'] = df['level1_diag1'].replace('?', -1)
df['level2_diag1'] = df['level2_diag1'].replace('?', -1)
df['level1_diag2'] = df['level1_diag2'].replace('?', -1)
df['level2_diag2'] = df['level2_diag2'].replace('?', -1)
df['level1_diag3'] = df['level1_diag3'].replace('?', -1)
df['level2_diag3'] = df['level2_diag3'].replace('?', -1)

df['level1_diag1'] = df['level1_diag1'].astype(float)
df['level2_diag1'] = df['level2_diag1'].astype(float)
df['level1_diag2'] = df['level1_diag2'].astype(float)
df['level2_diag2'] = df['level2_diag2'].astype(float)
df['level1_diag3'] = df['level1_diag3'].astype(float)
df['level2_diag3'] = df['level2_diag3'].astype(float)

for index, row in df.iterrows():
    if (row['level1_diag1'] >= 390 and row['level1_diag1'] < 460) or (np.floor(row['level1_diag1']) == 785):
        df.loc[index, 'level1_diag1'] = 1
    elif (row['level1_diag1'] >= 460 and row['level1_diag1'] < 520) or (np.floor(row['level1_diag1']) == 786):
        df.loc[index, 'level1_diag1'] = 2
    elif (row['level1_diag1'] >= 520 and row['level1_diag1'] < 580) or (np.floor(row['level1_diag1']) == 787):
        df.loc[index, 'level1_diag1'] = 3
    elif (np.floor(row['level1_diag1']) == 250):
        df.loc[index, 'level1_diag1'] = 4
    elif (row['level1_diag1'] >= 800 and row['level1_diag1'] < 1000):
        df.loc[index, 'level1_diag1'] = 5
    elif (row['level1_diag1'] >= 710 and row['level1_diag1'] < 740):
        df.loc[index, 'level1_diag1'] = 6
    elif (row['level1_diag1'] >= 580 and row['level1_diag1'] < 630) or (np.floor(row['level1_diag1']) == 788):
        df.loc[index, 'level1_diag1'] = 7
    elif (row['level1_diag1'] >= 140 and row['level1_diag1'] < 240):
        df.loc[index, 'level1_diag1'] = 8
    else:
        df.loc[index, 'level1_diag1'] = 0
        
    if (row['level1_diag2'] >= 390 and row['level1_diag2'] < 460) or (np.floor(row['level1_diag2']) == 785):
        df.loc[index, 'level1_diag2'] = 1
    elif (row['level1_diag2'] >= 460 and row['level1_diag2'] < 520) or (np.floor(row['level1_diag2']) == 786):
        df.loc[index, 'level1_diag2'] = 2
    elif (row['level1_diag2'] >= 520 and row['level1_diag2'] < 580) or (np.floor(row['level1_diag2']) == 787):
        df.loc[index, 'level1_diag2'] = 3
    elif (np.floor(row['level1_diag2']) == 250):
        df.loc[index, 'level1_diag2'] = 4
    elif (row['level1_diag2'] >= 800 and row['level1_diag2'] < 1000):
        df.loc[index, 'level1_diag2'] = 5
    elif (row['level1_diag2'] >= 710 and row['level1_diag2'] < 740):
        df.loc[index, 'level1_diag2'] = 6
    elif (row['level1_diag2'] >= 580 and row['level1_diag2'] < 630) or (np.floor(row['level1_diag2']) == 788):
        df.loc[index, 'level1_diag2'] = 7
    elif (row['level1_diag2'] >= 140 and row['level1_diag2'] < 240):
        df.loc[index, 'level1_diag2'] = 8
    else:
        df.loc[index, 'level1_diag2'] = 0
    
    if (row['level1_diag3'] >= 390 and row['level1_diag3'] < 460) or (np.floor(row['level1_diag3']) == 785):
        df.loc[index, 'level1_diag3'] = 1
    elif (row['level1_diag3'] >= 460 and row['level1_diag3'] < 520) or (np.floor(row['level1_diag3']) == 786):
        df.loc[index, 'level1_diag3'] = 2
    elif (row['level1_diag3'] >= 520 and row['level1_diag3'] < 580) or (np.floor(row['level1_diag3']) == 787):
        df.loc[index, 'level1_diag3'] = 3
    elif (np.floor(row['level1_diag3']) == 250):
        df.loc[index, 'level1_diag3'] = 4
    elif (row['level1_diag3'] >= 800 and row['level1_diag3'] < 1000):
        df.loc[index, 'level1_diag3'] = 5
    elif (row['level1_diag3'] >= 710 and row['level1_diag3'] < 740):
        df.loc[index, 'level1_diag3'] = 6
    elif (row['level1_diag3'] >= 580 and row['level1_diag3'] < 630) or (np.floor(row['level1_diag3']) == 788):
        df.loc[index, 'level1_diag3'] = 7
    elif (row['level1_diag3'] >= 140 and row['level1_diag3'] < 240):
        df.loc[index, 'level1_diag3'] = 8
    else:
        df.loc[index, 'level1_diag3'] = 0

for index, row in df.iterrows():
    if (row['level2_diag1'] >= 390 and row['level2_diag1'] < 399):
        df.loc[index, 'level2_diag1'] = 1
    elif (row['level2_diag1'] >= 401 and row['level2_diag1'] < 415):
        df.loc[index, 'level2_diag1'] = 2
    elif (row['level2_diag1'] >= 415 and row['level2_diag1'] < 460):
        df.loc[index, 'level2_diag1'] = 3
    elif (np.floor(row['level2_diag1']) == 785):
        df.loc[index, 'level2_diag1'] = 4
    elif (row['level2_diag1'] >= 460 and row['level2_diag1'] < 489):
        df.loc[index, 'level2_diag1'] = 5
    elif (row['level2_diag1'] >= 490 and row['level2_diag1'] < 497):
        df.loc[index, 'level2_diag1'] = 6
    elif (row['level2_diag1'] >= 500 and row['level2_diag1'] < 520):
        df.loc[index, 'level2_diag1'] = 7
    elif (np.floor(row['level2_diag1']) == 786):
        df.loc[index, 'level2_diag1'] = 8
    elif (row['level2_diag1'] >= 520 and row['level2_diag1'] < 530):
        df.loc[index, 'level2_diag1'] = 9
    elif (row['level2_diag1'] >= 530 and row['level2_diag1'] < 544):
        df.loc[index, 'level2_diag1'] = 10
    elif (row['level2_diag1'] >= 550 and row['level2_diag1'] < 554):
        df.loc[index, 'level2_diag1'] = 11
    elif (row['level2_diag1'] >= 555 and row['level2_diag1'] < 580):
        df.loc[index, 'level2_diag1'] = 12
    elif (np.floor(row['level2_diag1']) == 787):
        df.loc[index, 'level2_diag1'] = 13
    elif (np.floor(row['level2_diag1']) == 250):
        df.loc[index, 'level2_diag1'] = 14
    elif (row['level2_diag1'] >= 800 and row['level2_diag1'] < 1000):
        df.loc[index, 'level2_diag1'] = 15
    elif (row['level2_diag1'] >= 710 and row['level2_diag1'] < 740):
        df.loc[index, 'level2_diag1'] = 16
    elif (row['level2_diag1'] >= 580 and row['level2_diag1'] < 630):
        df.loc[index, 'level2_diag1'] = 17
    elif (np.floor(row['level2_diag1']) == 788):
        df.loc[index, 'level2_diag1'] = 18
    elif (row['level2_diag1'] >= 140 and row['level2_diag1'] < 240):
        df.loc[index, 'level2_diag1'] = 19
    elif row['level2_diag1'] >= 240 and row['level2_diag1'] < 280 and (np.floor(row['level2_diag1']) != 250):
        df.loc[index, 'level2_diag1'] = 20
    elif (row['level2_diag1'] >= 680 and row['level2_diag1'] < 710) or (np.floor(row['level2_diag1']) == 782):
        df.loc[index, 'level2_diag1'] = 21
    elif (row['level2_diag1'] >= 290 and row['level2_diag1'] < 320):
        df.loc[index, 'level2_diag1'] = 22
    else:
        df.loc[index, 'level2_diag1'] = 0
        
    if (row['level2_diag2'] >= 390 and row['level2_diag2'] < 399):
        df.loc[index, 'level2_diag2'] = 1
    elif (row['level2_diag2'] >= 401 and row['level2_diag2'] < 415):
        df.loc[index, 'level2_diag2'] = 2
    elif (row['level2_diag2'] >= 415 and row['level2_diag2'] < 460):
        df.loc[index, 'level2_diag2'] = 3
    elif (np.floor(row['level2_diag2']) == 785):
        df.loc[index, 'level2_diag2'] = 4
    elif (row['level2_diag2'] >= 460 and row['level2_diag2'] < 489):
        df.loc[index, 'level2_diag2'] = 5
    elif (row['level2_diag2'] >= 490 and row['level2_diag2'] < 497):
        df.loc[index, 'level2_diag2'] = 6
    elif (row['level2_diag2'] >= 500 and row['level2_diag2'] < 520):
        df.loc[index, 'level2_diag2'] = 7
    elif (np.floor(row['level2_diag2']) == 786):
        df.loc[index, 'level2_diag2'] = 8
    elif (row['level2_diag2'] >= 520 and row['level2_diag2'] < 530):
        df.loc[index, 'level2_diag2'] = 9
    elif (row['level2_diag2'] >= 530 and row['level2_diag2'] < 544):
        df.loc[index, 'level2_diag2'] = 10
    elif (row['level2_diag2'] >= 550 and row['level2_diag2'] < 554):
        df.loc[index, 'level2_diag2'] = 11
    elif (row['level2_diag2'] >= 555 and row['level2_diag2'] < 580):
        df.loc[index, 'level2_diag2'] = 12
    elif (np.floor(row['level2_diag2']) == 787):
        df.loc[index, 'level2_diag2'] = 13
    elif (np.floor(row['level2_diag2']) == 250):
        df.loc[index, 'level2_diag2'] = 14
    elif (row['level2_diag2'] >= 800 and row['level2_diag2'] < 1000):
        df.loc[index, 'level2_diag2'] = 15
    elif (row['level2_diag2'] >= 710 and row['level2_diag2'] < 740):
        df.loc[index, 'level2_diag2'] = 16
    elif (row['level2_diag2'] >= 580 and row['level2_diag2'] < 630):
        df.loc[index, 'level2_diag2'] = 17
    elif (np.floor(row['level2_diag2']) == 788):
        df.loc[index, 'level2_diag2'] = 18
    elif (row['level2_diag2'] >= 140 and row['level2_diag2'] < 240):
        df.loc[index, 'level2_diag2'] = 19
    elif row['level2_diag2'] >= 240 and row['level2_diag2'] < 280 and (np.floor(row['level2_diag2']) != 250):
        df.loc[index, 'level2_diag2'] = 20
    elif (row['level2_diag2'] >= 680 and row['level2_diag2'] < 710) or (np.floor(row['level2_diag2']) == 782):
        df.loc[index, 'level2_diag2'] = 21
    elif (row['level2_diag2'] >= 290 and row['level2_diag2'] < 320):
        df.loc[index, 'level2_diag2'] = 22
    else:
        df.loc[index, 'level2_diag2'] = 0
        
        
    if (row['level2_diag3'] >= 390 and row['level2_diag3'] < 399):
        df.loc[index, 'level2_diag3'] = 1
    elif (row['level2_diag3'] >= 401 and row['level2_diag3'] < 415):
        df.loc[index, 'level2_diag3'] = 2
    elif (row['level2_diag3'] >= 415 and row['level2_diag3'] < 460):
        df.loc[index, 'level2_diag3'] = 3
    elif (np.floor(row['level2_diag3']) == 785):
        df.loc[index, 'level2_diag3'] = 4
    elif (row['level2_diag3'] >= 460 and row['level2_diag3'] < 489):
        df.loc[index, 'level2_diag3'] = 5
    elif (row['level2_diag3'] >= 490 and row['level2_diag3'] < 497):
        df.loc[index, 'level2_diag3'] = 6
    elif (row['level2_diag3'] >= 500 and row['level2_diag3'] < 520):
        df.loc[index, 'level2_diag3'] = 7
    elif (np.floor(row['level2_diag3']) == 786):
        df.loc[index, 'level2_diag3'] = 8
    elif (row['level2_diag3'] >= 520 and row['level2_diag3'] < 530):
        df.loc[index, 'level2_diag3'] = 9
    elif (row['level2_diag3'] >= 530 and row['level2_diag3'] < 544):
        df.loc[index, 'level2_diag3'] = 10
    elif (row['level2_diag3'] >= 550 and row['level2_diag3'] < 554):
        df.loc[index, 'level2_diag3'] = 11
    elif (row['level2_diag3'] >= 555 and row['level2_diag3'] < 580):
        df.loc[index, 'level2_diag3'] = 12
    elif (np.floor(row['level2_diag3']) == 787):
        df.loc[index, 'level2_diag3'] = 13
    elif (np.floor(row['level2_diag3']) == 250):
        df.loc[index, 'level2_diag3'] = 14
    elif (row['level2_diag3'] >= 800 and row['level2_diag3'] < 1000):
        df.loc[index, 'level2_diag3'] = 15
    elif (row['level2_diag3'] >= 710 and row['level2_diag3'] < 740):
        df.loc[index, 'level2_diag3'] = 16
    elif (row['level2_diag3'] >= 580 and row['level2_diag3'] < 630):
        df.loc[index, 'level2_diag3'] = 17
    elif (np.floor(row['level2_diag3']) == 788):
        df.loc[index, 'level2_diag3'] = 18
    elif (row['level2_diag3'] >= 140 and row['level2_diag3'] < 240):
        df.loc[index, 'level2_diag3'] = 19
    elif row['level2_diag3'] >= 240 and row['level2_diag3'] < 280 and (np.floor(row['level2_diag3']) != 250):
        df.loc[index, 'level2_diag3'] = 20
    elif (row['level2_diag3'] >= 680 and row['level2_diag3'] < 710) or (np.floor(row['level2_diag3']) == 782):
        df.loc[index, 'level2_diag3'] = 21
    elif (row['level2_diag3'] >= 290 and row['level2_diag3'] < 320):
        df.loc[index, 'level2_diag3'] = 22
    else:
        df.loc[index, 'level2_diag3'] = 0


In [5]:
# ------ DROP USELESS COLUMNS ------
features_to_drop = ['encounter_id',
                    'patient_nbr',
                    'weight', 
                    'payer_code',
                    'medical_specialty',
                    'diag_1',
                    'diag_2',
                    'diag_3',
                    'repaglinide',
                    'nateglinide',
                    'chlorpropamide',
                    'acetohexamide',
                    'tolbutamide',
                    'acarbose',
                    'miglitol',
                    'troglitazone',
                    'tolazamide',
                    'examide',
                    'citoglipton',
                    'glyburide-metformin',
                    'glipizide-metformin',
                    'glimepiride-pioglitazone',
                    'metformin-rosiglitazone',
                    'metformin-pioglitazone']
df = df.drop(features_to_drop, axis=1)

In [6]:
# ------ DROP ROWS WITH MISSING VALUES AND/OR OUTLIERS (THAT ARE NOT CRITICAL FOR THE MODEL) ------

# Drop rows that have useless values in "discharge_disposition_id" column
# Values of 11, 13, 14, 19, 20, or 21 are related to death or hospice which mean these patients cannot be readmitted.
df = df[df['discharge_disposition_id'] != '11']
df = df[df['discharge_disposition_id'] != '13']
df = df[df['discharge_disposition_id'] != '14']
df = df[df['discharge_disposition_id'] != '19']
df = df[df['discharge_disposition_id'] != '20']
df = df[df['discharge_disposition_id'] != '21']

# Remove rows that have missing values in "race" column
df = df[df['race'] != '?']

# Drop rows that have outlier values in "gender" column
df = df[df['gender'] != 'Unknown/Invalid']

# Drop rows that have outlier values in "metformin" column
df = df[df['metformin'] != 'Up']
df = df[df['metformin'] != 'Down']

# Drop rows that have outlier values in "glimepiride" column
df = df[df['glimepiride'] != 'Up']
df = df[df['glimepiride'] != 'Down']

# Drop rows that have outlier values in "glipizide" column
df = df[df['glipizide'] != 'Up']
df = df[df['glipizide'] != 'Down']

# Drop rows that have outlier values in "glyburide" column
df = df[df['glyburide'] != 'Up']
df = df[df['glyburide'] != 'Down']

# Drop rows that have outlier values in "pioglitazone" column
df = df[df['pioglitazone'] != 'Up']
df = df[df['pioglitazone'] != 'Down']

# Drop rows that have outlier values in "rosiglitazone" column
df = df[df['rosiglitazone'] != 'Up']
df = df[df['rosiglitazone'] != 'Down']

In [7]:
# ------ DROP OUTLIERS ------

# In "number_outpatient" column, limit the outliers (that have values from 6 to 40) to a maximum value of 5
df['number_outpatient'] = df['number_outpatient'].apply(lambda x: x if x <= 5 else 5)

# In "number_emergency" column, limit the outliers to a maximum value of 5
df['number_emergency'] = df['number_emergency'].apply(lambda x: x if x <= 5 else 5)

# In "number_inpatient" column, limit the outliers to a maximum value of 5
df['number_inpatient'] = df['number_inpatient'].apply(lambda x: x if x <= 5 else 5)

# In "num_medications" column, limit the outliers to a maximum value of 40
df['num_medications'] = df['num_medications'].apply(lambda x: x if x <= 40 else 40)

# In "num_lab_procedures" column, limit the outliers to a maximum value of 90
df['num_lab_procedures'] = df['num_medications'].apply(lambda x: x if x <= 90 else 90)

# In "number_diagnoses" column, limit the outliers to a maximum value of 9
df['number_diagnoses'] = df['number_diagnoses'].apply(lambda x: x if x <= 9 else 9)

In [8]:
# ------ TURN FEATURES INTO BINARY ------

# Turn "gender" into binary (Female=0, Male=1)
df['gender'] = df['gender'].replace({'Female': 0, 'Male': 1})

# Turn "metformin" remaining values into binary (No=0, Steady=1)
df['metformin'] = df['metformin'].replace({'No': 0, 'Steady': 1})

# Turn "glimepiride" remaining values into binary (No=0, Steady=1)
df['glimepiride'] = df['glimepiride'].replace({'No': 0, 'Steady': 1})

# Turn "glipizide" remaining values into binary (No=0, Steady=1)
df['glipizide'] = df['glipizide'].replace({'No': 0, 'Steady': 1})

# Turn "glyburide" remaining values into binary (No=0, Steady=1)
df['glyburide'] = df['glyburide'].replace({'No': 0, 'Steady': 1})

# Turn "pioglitazone" remaining values into binary (No=0, Steady=1)
df['pioglitazone'] = df['pioglitazone'].replace({'No': 0, 'Steady': 1})

# Turn "rosiglitazone" remaining values into binary (No=0, Steady=1)
df['rosiglitazone'] = df['rosiglitazone'].replace({'No': 0, 'Steady': 1})

# Turn "change" into binary (No=0, Ch=1)
df['change'] = df['change'].replace({'No': 0, 'Ch': 1})

# Turn "diabetesMed" into binary (No=0, Yes=1)
df['diabetesMed'] = df['diabetesMed'].replace({'No': 0, 'Yes': 1})

In [9]:
# ------ TURN CATEGORICAL FEATURES INTO NUMERIC, WITH ORDINAL ENCODING ------
# Turn "age" into ordinal encoding
age_mapping = {'[0-10)': 0,
               '[10-20)': 1,
               '[20-30)': 2,
               '[30-40)': 3,
               '[40-50)': 4,
               '[50-60)': 5,
               '[60-70)': 6,
               '[70-80)': 7,
               '[80-90)': 8,
               '[90-100)': 9}
df['age'] = df['age'].replace(age_mapping)

# Turn "max_glu_serum" into ordinal encoding
df['max_glu_serum'].fillna(0, inplace=True)
df['max_glu_serum'] = df['max_glu_serum'].replace({'Norm': 0, '>200': 1, '>300': 2})

# Turn "A1Cresult" into ordinal encoding
df['A1Cresult'].fillna(0, inplace=True)
df['A1Cresult'] = df['A1Cresult'].replace({'Norm': 0, '>7': 1, '>8': 2})

In [10]:
# ------ TURN CATEGORICAL FEATURES INTO NUMERIC, WITH DUMMY ENCODING ------

# Turn "race" into dummy encoding (Caucasian, AfricanAmerican, Hispanic, Other, Asian)
df = pd.get_dummies(df, columns=['race'])

# Turn "insulin" into dummy encoding (No, Steady, Up, Down)
df = pd.get_dummies(df, columns=['insulin'])

# In "admission_type_id" column, merge/simplify values into fewer categories (from 1-8 to 1,3,4,5) and then turn it
# into dummy encoding (1: Emergency, 3: Elective, 4: Newborn, 5: Other)
df['admission_type_id'] = df['admission_type_id'].replace(2,1)
df['admission_type_id'] = df['admission_type_id'].replace(7,1)
df['admission_type_id'] = df['admission_type_id'].replace(6,5)
df['admission_type_id'] = df['admission_type_id'].replace(8,5)
df['admission_type_id'] = df['admission_type_id'].replace({'1': "Emergency", '3': "Elective", '4': "Newborn", '5': "Other"})
df = pd.get_dummies(df, columns=['admission_type_id'])

# In "admission_source_id" column, merge/simplify values into fewer categories and then turn it
# into dummy encoding (1: Physician Referral, 4: Transfer from a hospital, 9: Transfer from a Skilled Nursing Facility, 11: Other)
df['admission_source_id'] = df['admission_source_id'].replace(2,1)
df['admission_source_id'] = df['admission_source_id'].replace(3,1)
df['admission_source_id'] = df['admission_source_id'].replace(5,4)
df['admission_source_id'] = df['admission_source_id'].replace(6,4)
df['admission_source_id'] = df['admission_source_id'].replace(10,4)
df['admission_source_id'] = df['admission_source_id'].replace(22,4)
df['admission_source_id'] = df['admission_source_id'].replace(25,4)
df['admission_source_id'] = df['admission_source_id'].replace(15,9)
df['admission_source_id'] = df['admission_source_id'].replace(17,9)
df['admission_source_id'] = df['admission_source_id'].replace(20,9)
df['admission_source_id'] = df['admission_source_id'].replace(21,9)
df['admission_source_id'] = df['admission_source_id'].replace(8,11)
df['admission_source_id'] = df['admission_source_id'].replace(13,11)
df['admission_source_id'] = df['admission_source_id'].replace(14,11)
df['admission_source_id'] = df['admission_source_id'].replace({'1': "Physician_Referral", '4': "Transfer_Hospital", '7': "Emergency_Room", '9': "Transfer_Nursing", '11': "Other"})
df = pd.get_dummies(df, columns=['admission_source_id'])

In [11]:
# ------ CHECKING FINAL DATA SET PROPORTION ------
df['readmitted'].value_counts()

readmitted
NO     50640
>30    33246
<30    10605
Name: count, dtype: int64

In [12]:
# ------ RE BALANCE DATA SET  ------
from sklearn.utils import resample

# Put values for 'readmitted' == 1 on a separate dataframe
df_1 = df[df.readmitted=='<30']

# Delete the rows with 'readmitted' == 1 from the original dataframe
df = df[df.readmitted!='<30']

# Encode the 'readmitted' column to 1
df['readmitted'] = df['readmitted'].replace('>30', 1)
df['readmitted'] = df['readmitted'].replace('NO', 0)

# Separate majority and minority classes
df_majority = df[df.readmitted==0]
df_minority = df[df.readmitted==1]

# Upsample minority classes
df_majority_downsample = resample(df_majority, 
                                 replace=True,     # sample with replacement
                                 n_samples=35000,    # to match majority class
                                 random_state=123) # reproducible results


# Stack results to original dataframe
df = pd.concat([df_majority_downsample, df_minority])

# Change values of 1 back to '>30'
df['readmitted'] = df['readmitted'].replace(1,'>30')

# Stack back rows that were deleted
df = pd.concat([df, df_1])

# Change values of 0 back to 'NO'
df['readmitted'] = df['readmitted'].replace(0, 'NO')

# Display new class counts
df.readmitted.value_counts()


readmitted
NO     35000
>30    33246
<30    10605
Name: count, dtype: int64

## Step 3 - Splitting & normalizing the dataset

In [13]:
# ------ SPLIT PREPARED DATASET INTO: X and y  ------
X = df.drop('readmitted', axis=1)
y = df['readmitted']

In [14]:
# ------ NORMALIZE PREPARED DATASET ------
# Normalize the dataset using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [15]:
# ------ SPLIT DATASET INTO TRAIN AND TEST ------
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Step 4 - Training & evaluating multi-classification models

In [17]:
# Import multi-classification models
from sklearn.linear_model import LogisticRegression # Use OVR strategy
from sklearn.svm import SVC # Use OVR strategy
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Define the models
models = []
models.append(('Logistic Regression', LogisticRegression(multi_class='ovr', solver='liblinear')))
#models.append(('SVM', OneVsRestClassifier(SVC())))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('Random Forest', RandomForestClassifier()))
#models.append(('XGBoost', XGBClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Naive Bayes', GaussianNB()))


# Train and evaluate each model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
results = []
names = []
for name, model in models:
    # kfold = StratifiedKFold(n_splits=2, random_state=1, shuffle=True)
    # cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    cv_results = model.fit(X_train, y_train).score(X_test, y_test)
    results.append(cv_results)
    names.append(name)
    print('%s: %f' % (name, cv_results)) 

Logistic Regression: 0.531955
LDA: 0.527137
Random Forest: 0.629090
Decision Tree: 0.545270


AttributeError: 'Flags' object has no attribute 'c_contiguous'