In [87]:
# Import dependencies 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import collections
import researchpy as rp
import scipy as sp
from sklearn.preprocessing import LabelEncoder
import re
import os
import matplotlib.pyplot as plt

# For database
import sqlalchemy
from sqlalchemy import create_engine
import psycopg2
from config import db_password

## Add code here to connect to DB to read raw data

In [88]:
# Database Credentials
DB_HOST = "127.0.0.1"
DB_PORT = "5433"
DB_NAME = "DiabeticDB"
DB_USER = "postgres"
DB_PASS = db_password
CSV_FILE_PATH = "/Users/monaeem/Final_Project/dataviz-final-project-group4_/database/diabetic_data_initial.csv"

In [89]:
CSV_FILE_PATH

'/Users/monaeem/Final_Project/dataviz-final-project-group4_/database/diabetic_data_initial.csv'

In [90]:
conn = psycopg2.connect(database=DB_NAME, user=DB_USER, password=db_password, host=DB_HOST, port=DB_PORT)
print("Database opened successfully")
# Create a cursor
cur = conn.cursor()
# Drop the diabetes_raw_data if exists before creating one
cur.execute("DROP TABLE IF EXISTS medicines_info");
cur.execute("DROP TABLE IF EXISTS admission_info");
cur.execute("DROP TABLE IF EXISTS diagnosis_info");
cur.execute("DROP TABLE IF EXISTS patient_info");
cur.execute("DROP TABLE IF EXISTS diabetes_clean_data");
cur.execute("DROP TABLE IF EXISTS diabetes_raw_data");
conn.commit()

Database opened successfully


In [91]:
# Create diabetes_raw_data table
cur = conn.cursor()
cur.execute('''CREATE TABLE diabetes_raw_data (
    encounter_id int NOT NULL,
    patient_nbr	int NOT NULL,
    race varchar(20),
    gender varchar(20) NOT NULL,
    age varchar(10) NOT NULL,
    weight varchar(10),
    admission_type_id int NOT NULL,
    discharge_disposition_id int NOT NULL,
    admission_source_id int NOT NULL,
    time_in_hospital int NOT NULL,
    payer_code varchar(10),
    medical_specialty varchar(40),
    num_lab_procedures  int NOT NULL,
    num_procedures  int NOT NULL,
    num_medications  int NOT NULL,
    number_outpatient  int NOT NULL,
    number_emergency  int NOT NULL,
    number_inpatient  int NOT NULL,
    diag_1  varchar(10),
    diag_2 varchar(10),
    diag_3 varchar(10),
    number_diagnoses  int NOT NULL,
    max_glu_serum  varchar(10),
    A1Cresult  varchar(10),
    metformin  varchar(10),
    repaglinide  varchar(10),
    nateglinide  varchar(10),
    chlorpropamide  varchar(10),
    glimepiride  varchar(10),
    acetohexamide  varchar(10),
    glipizide  varchar(10),
    glyburide  varchar(10),
    tolbutamide  varchar(10),
    pioglitazone  varchar(10),
    rosiglitazone  varchar(10),
    acarbose  varchar(10),
    miglitol  varchar(10),
    troglitazone  varchar(10),
    tolazamide  varchar(10),
    examide  varchar(10),
    citoglipton  varchar(10),
    insulin  varchar(10),
    "glyburide-metformin"  varchar(10),
    "glipizide-metformin"  varchar(10),
    "glimepiride-pioglitazone"  varchar(10),
    "metformin-rosiglitazone"  varchar(10),
    "metformin-pioglitazone"  varchar(10),
    change  varchar(10),
    diabetesMed  varchar(10),
    readmitted  varchar(10),
    PRIMARY KEY (encounter_id),
    UNIQUE (encounter_id));''')
print("Table 'diabetes_raw_data' successfully created")

Table 'diabetes_raw_data' successfully created


In [92]:
# Copy the contents from the diabetic_data_initial.csv and write it to 'diabetes_raw_data' table
cur = conn.cursor()
cur.execute('''COPY diabetes_raw_data
    FROM '/Users/monaeem/Final_Project/dataviz-final-project-group4_/database/diabetic_data_initial.csv'
    CSV HEADER DELIMITER ',';''')

In [93]:
# Create the connection to the PostgreSQL database
db_string = f"postgres://{DB_USER}:{db_password}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(db_string)
conn.commit()

In [94]:
# Read the raw data from the postgres into dataframe
df = pd.read_sql_table("diabetes_raw_data", engine)
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,a1cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesmed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,MC,?,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,MC,?,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,MC,?,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,MC,Surgery-General,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO


In [95]:
pd.set_option('display.max_columns', None)
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,a1cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesmed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,MC,?,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,MC,?,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,MC,?,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,MC,Surgery-General,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO


In [96]:
# Show columns with missing greater than 20%
for column in df:
    if df[df[column]=='?'].shape[0]/df.shape[0]*100 > 20:
        print(column,":",str(round(df[df[column]=='?'].shape[0]/df.shape[0]*100)) + "%")              

weight : 97%
payer_code : 40%
medical_specialty : 49%


In [97]:
# variables weight and payer_code were excluded due to quantity of missing data
# medical specialty was recoded to add "missing" for the missing values.

In [98]:
# drop weight and payer_code columns 
df_copy = df.copy()
df_copy.drop(columns=['weight', 'payer_code'], axis=1, inplace=True)

In [99]:
# In order to keep observations independent, only one the first encounter is included
# Dedupe based on first encounter
df_deduped = df_copy.drop_duplicates(subset=['patient_nbr'], keep='first')

In [100]:
# Remove encounters that resulted in either discharge to 
# a hospice or patient death to avoid biasing analysis 
discharge_disposition_excluded=[11, 13, 14, 19, 20, 23]

df_cleaned = df_deduped[~df_deduped.discharge_disposition_id.isin(discharge_disposition_excluded)]

In [101]:
df_cleaned['gender'].value_counts()

Female             37084
Male               32626
Unknown/Invalid        3
Name: gender, dtype: int64

In [102]:
df_cleaned.drop(df_cleaned[(df_cleaned.loc[:,'gender'] == "Unknown/Invalid")].index, inplace = True)
df_cleaned['gender'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Female    37084
Male      32626
Name: gender, dtype: int64

In [103]:
# recode readmitted to be binary 
# def recode_readmit(x):
#     if x == '<30':
#         return '<30'
#     else:
#         return 'NO'
    
# df_cleaned['readmitted_recoded'] = df_cleaned[df_cleaned.loc[:,'readmitted']].apply(recode_readmit)
df_cleaned['readmitted_recoded'] = df_cleaned.loc[:,['readmitted']].replace({'>30': 'NO'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [104]:
# recode medical_specialty to add missing 
df_cleaned['medical_specialty_recoded'] = df_cleaned.loc[:,['medical_specialty']].replace("?",'missing', inplace=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [105]:
# readmitted recoded is imbalanced with 91% of cases 
# not having a 30 day readmission
print(collections.Counter(df_cleaned['readmitted_recoded']))

Counter({'NO': 63441, '<30': 6269})


In [106]:
df_cleaned['medical_specialty_recoded'].value_counts()

missing                             33507
InternalMedicine                    10582
Family/GeneralPractice               4955
Emergency/Trauma                     4390
Cardiology                           4198
                                    ...  
Perinatology                            1
Psychiatry-Addictive                    1
Neurophysiology                         1
Resident                                1
Surgery-PlasticwithinHeadandNeck        1
Name: medical_specialty_recoded, Length: 71, dtype: int64

In [107]:
df_cleaned['medical_specialty_recoded'] = df_cleaned.loc[:,['medical_specialty_recoded']].replace({'Family/GeneralPractice': 'InternalMedicine'})

def values_to_other(col_name, value_unchanged):
    v=[]
    for value in col_name:
        if (value != value_unchanged):
            v.append("Other")
        else:
            v.append(value)
    return v
    

df_cleaned['medical_specialty_recoded'] = values_to_other(df_cleaned['medical_specialty_recoded'].values,"InternalMedicine")    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [108]:
df_cleaned['medical_specialty_recoded'].value_counts()

Other               54173
InternalMedicine    15537
Name: medical_specialty_recoded, dtype: int64

In [109]:
# readmitted recoded is imbalanced with 91% of cases 
# not having a 30 day readmission
print(collections.Counter(df_cleaned['readmitted_recoded']))

Counter({'NO': 63441, '<30': 6269})


In [110]:
# function to clean 'age' column
def parse_age_range(age_col):
    c=[]
    for values in age_col:
        s = re.sub('[[)]','', values)
        c.append(s)
    return c

# replace 'age' values with cleaned values
df_cleaned['age'] = parse_age_range(df_cleaned['age'].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [111]:
df_cleaned = df_cleaned.drop(columns=['readmitted', 'medical_specialty'])

In [112]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [113]:
df_cleaned.dtypes

encounter_id                  int64
patient_nbr                   int64
race                         object
gender                       object
age                          object
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                       object
diag_2                       object
diag_3                       object
number_diagnoses              int64
max_glu_serum                object
a1cresult                    object
metformin                    object
repaglinide                  object
nateglinide                  object
chlorpropamide               object
glimepiride                  object
acetohexamide                object
glipizide                   

# DB code goes here

In [114]:
# DB code to export cleaned dataset to DB
# Create a cursor
cur = conn.cursor()
# Drop the diabetes_cleaned_data if exists before creating one
cur.execute("DROP TABLE IF EXISTS diabetes_clean_data");
conn.commit()

In [115]:
# Write the cleaned data fro diabetes_clean_data_df into postgres diabetes_clean_data table 
df_cleaned.to_sql(name='diabetes_clean_data', con=engine, if_exists='replace', index=False,
            dtype={'encounter_id': sqlalchemy.types.INTEGER(),
                   'patient_nbr' : sqlalchemy.types.INTEGER(),
                   'race' : sqlalchemy.types.VARCHAR(length=20),
                   'gender' : sqlalchemy.types.VARCHAR(length=20),
                   'age' : sqlalchemy.types.VARCHAR(length=10),
    'admission_type_id' : sqlalchemy.types.INTEGER(),
    'discharge_disposition_id' : sqlalchemy.types.INTEGER(),
    'admission_source_id' : sqlalchemy.types.INTEGER(),
    'time_in_hospital' : sqlalchemy.types.INTEGER(),
    'num_lab_procedures' :  sqlalchemy.types.INTEGER(),
    'num_procedures' :  sqlalchemy.types.INTEGER(),
    'num_medications' :  sqlalchemy.types.INTEGER(),
    'number_outpatient' :  sqlalchemy.types.INTEGER(),
    'number_emergency' :  sqlalchemy.types.INTEGER(),
    'number_inpatient' :  sqlalchemy.types.INTEGER(),
    'diag_1' :  sqlalchemy.types.VARCHAR(length=10),
    'diag_2' : sqlalchemy.types.VARCHAR(length=10),
    'diag_3' : sqlalchemy.types.VARCHAR(length=10),
    'number_diagnoses' :  sqlalchemy.types.INTEGER(),
    'max_glu_serum' :  sqlalchemy.types.VARCHAR(length=10),
    'A1Cresult' :  sqlalchemy.types.VARCHAR(length=10),
    'metformin' :  sqlalchemy.types.VARCHAR(length=10),
    'repaglinide' :  sqlalchemy.types.VARCHAR(length=10),
    'nateglinide' :  sqlalchemy.types.VARCHAR(length=10),
    'chlorpropamide' :  sqlalchemy.types.VARCHAR(length=10),
    'glimepiride' :  sqlalchemy.types.VARCHAR(length=10),
    'acetohexamide' :  sqlalchemy.types.VARCHAR(length=10),
    'glipizide' :  sqlalchemy.types.VARCHAR(length=10),
    'glyburide' :  sqlalchemy.types.VARCHAR(length=10),
    'tolbutamide' :  sqlalchemy.types.VARCHAR(length=10),
    'pioglitazone' :  sqlalchemy.types.VARCHAR(length=10),
    'rosiglitazone' :  sqlalchemy.types.VARCHAR(length=10),
    'acarbose' :  sqlalchemy.types.VARCHAR(length=10),
    'miglitol' :  sqlalchemy.types.VARCHAR(length=10),
    'troglitazone' :  sqlalchemy.types.VARCHAR(length=10),
    'tolazamide' :  sqlalchemy.types.VARCHAR(length=10),
    'examide' :  sqlalchemy.types.VARCHAR(length=10),
    'citoglipton' :  sqlalchemy.types.VARCHAR(length=10),
    'insulin' :  sqlalchemy.types.VARCHAR(length=10),
    'glyburide-metformin' :  sqlalchemy.types.VARCHAR(length=10),
    'glipizide-metformin' :  sqlalchemy.types.VARCHAR(length=10),
    'glimepiride-pioglitazone' : sqlalchemy.types.VARCHAR(length=10),
    'metformin-rosiglitazone' :  sqlalchemy.types.VARCHAR(length=10),
    'metformin-pioglitazone' :  sqlalchemy.types.VARCHAR(length=10),
    'change' : sqlalchemy.types.VARCHAR(length=10),
    'diabetesMed' :  sqlalchemy.types.VARCHAR(length=10),
    'readmitted_recoded' :  sqlalchemy.types.VARCHAR(length=10),
    'medical_specialty_recoded' : sqlalchemy.types.VARCHAR(length=40)})
print("Table 'diabetes_clean_data' successfully created")

Table 'diabetes_clean_data' successfully created


# Data Pre-Processing

# DB code goes here

In [156]:
# Read the raw data from the postgres into dataframe
df_cleaned = pd.read_sql_table("diabetes_clean_data", engine)
df_cleaned

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,a1cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesmed,readmitted_recoded,medical_specialty_recoded
0,2278392,8222157,Caucasian,Female,0-10,6,25,1,1,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Other
1,149190,55629189,Caucasian,Female,10-20,1,1,7,3,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Other
2,64410,86047875,AfricanAmerican,Female,20-30,1,1,7,2,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Other
3,500364,82442376,Caucasian,Male,30-40,1,1,7,2,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Other
4,16680,42519267,Caucasian,Male,40-50,1,1,7,1,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69705,443842016,183087545,Caucasian,Female,70-80,1,1,7,9,50,2,33,0,0,0,574,574,250.02,9,,>7,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Other
69706,443842022,188574944,Other,Female,40-50,1,1,7,14,73,6,26,0,1,0,592,599,518,9,,>8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Other
69707,443842070,140199494,Other,Female,60-70,1,1,7,2,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,Other
69708,443842340,120975314,Caucasian,Female,80-90,1,1,7,5,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Other


In [157]:
# Create a list of columns to encode for each variable if variable type is object
columns_to_encode = [column for column in df_cleaned.columns if df_cleaned[column].dtypes == 'O']


In [158]:
print(columns_to_encode)

['race', 'gender', 'age', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded', 'medical_specialty_recoded']


In [159]:
# exclude diag 1, 2, and 3 from recode list 
for i in range(3):
    to_remove = 'diag_' + str(i+1)
    columns_to_encode.remove(to_remove)
print(columns_to_encode)
print("\n", len(columns_to_encode), "columns")

['race', 'gender', 'age', 'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded', 'medical_specialty_recoded']

 32 columns


In [160]:
encoded_df = df_cleaned.copy()

In [161]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [162]:
# function to apply label encoding
def apply_encoder(cols):
    le = LabelEncoder()
    for c in cols:
        new_column_name = c + "_le"
        le.fit(df_cleaned[c])
        encoded_df[new_column_name] = le.transform(encoded_df[c])
        print(c, ":", le.classes_)

In [163]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [164]:
# Apply Encoding
apply_encoder(columns_to_encode)

race : ['?' 'AfricanAmerican' 'Asian' 'Caucasian' 'Hispanic' 'Other']
gender : ['Female' 'Male']
age : ['0-10' '10-20' '20-30' '30-40' '40-50' '50-60' '60-70' '70-80' '80-90'
 '90-100']
max_glu_serum : ['>200' '>300' 'None' 'Norm']
a1cresult : ['>7' '>8' 'None' 'Norm']
metformin : ['Down' 'No' 'Steady' 'Up']
repaglinide : ['Down' 'No' 'Steady' 'Up']
nateglinide : ['Down' 'No' 'Steady' 'Up']
chlorpropamide : ['Down' 'No' 'Steady' 'Up']
glimepiride : ['Down' 'No' 'Steady' 'Up']
acetohexamide : ['No' 'Steady']
glipizide : ['Down' 'No' 'Steady' 'Up']
glyburide : ['Down' 'No' 'Steady' 'Up']
tolbutamide : ['No' 'Steady']
pioglitazone : ['Down' 'No' 'Steady' 'Up']
rosiglitazone : ['Down' 'No' 'Steady' 'Up']
acarbose : ['No' 'Steady' 'Up']
miglitol : ['Down' 'No' 'Steady' 'Up']
troglitazone : ['No' 'Steady']
tolazamide : ['No' 'Steady']
examide : ['No']
citoglipton : ['No']
insulin : ['Down' 'No' 'Steady' 'Up']
glyburide-metformin : ['Down' 'No' 'Steady' 'Up']
glipizide-metformin : ['No' 'Stea

In [165]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [166]:
encoded_df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded',
       'medical_specialty_recoded', 'race_le', 'gender_le', 'age_le',
       'max_glu_serum_le', 'a1cresult_le', 'metformin_le', 'repagl

In [167]:
cols_to_drop = ['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'a1cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesmed', 'readmitted_recoded',
       'medical_specialty_recoded']
encoded_df = encoded_df.drop(columns=cols_to_drop)

In [168]:
print(len(encoded_df.columns),"columns")

32 columns


In [169]:
# Set ? to NaN
encoded_df.replace('?', np.nan, inplace=True)

In [170]:
# Show missing in dataset 
for column in df_cleaned.columns:
    missing_count = df_cleaned[column].isnull().sum()
    if missing_count>0:
        print(column,":",df_cleaned)        

# TO BE INCORPORATED LATER

In [None]:
# # check multivariate outliers 
# # credit to https://www.machinelearningplus.com/statistics/mahalanobis-distance/
# def mahalanobis(x=None, data=None, cov=None):
#     """Compute the Mahalanobis Distance between each row of x and the data  
#     x    : vector or matrix of data with, say, p columns.
#     data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.
#     cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
#     """
#     x_minus_mu = x - np.mean(data)
#     if not cov:
#         cov = np.cov(data.values.T)# covarience of data.values transposed 
#     inv_covmat = sp.linalg.inv(cov)
#     left_term = np.dot(x_minus_mu, inv_covmat)
#     mahal = np.dot(left_term, x_minus_mu.T)
#     return mahal.diagonal()

In [None]:
# #create new column in dataframe that contains Mahalanobis distance for each row
# df_x = df_cleaned[['time_in_hospital', 'num_lab_procedures', 'num_procedures',
#        'num_medications', 'number_outpatient', 'number_emergency','number_diagnoses']]
# df_x.head()


In [None]:
# # df_x = df[['carat', 'depth', 'price']].head(500)
# df_cleaned['mahala'] = mahalanobis(x=df_x, data=df_cleaned[['time_in_hospital', 'num_lab_procedures', 'num_procedures',
#        'num_medications', 'number_outpatient', 'number_emergency','number_diagnoses']])
# df_cleaned.head()

In [None]:
# a=df_cleaned[df_cleaned['mahala']<10].count()

In [None]:
# b=df_cleaned[df_cleaned['mahala']>10].count()

In [None]:
# df_cleaned[df_cleaned['mahala']<10]

In [None]:
# # Check multicollinearity 
# # credit https://www.analyticsvidhya.com/blog/2020/03/what-is-multicollinearity/
# # Import library for VIF - varience inflation factor 
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# def calc_vif(X):

#     # Calculating VIF
#     vif = pd.DataFrame()
#     vif["variables"] = X.columns
#     vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

#     return(vif)

In [None]:
# # variables with high VIF might be removed one at a time in descending order
# calc_vif(df_cleaned.iloc[:,:-1])

# Summary Statistics: EDA

In [None]:
# 69,710 records in the dataset 
display(df_cleaned.describe())


In [None]:
# 69,710 records in the dataset 
display(df_cleaned.describe())


In [None]:
# 53.25% of admissions came from Trauma Center followed by ED
display(rp.summary_cat(df_cleaned[['admission_source_id']]))


In [None]:
# ~75% of cases were caucasian followed by 18% for African American 
display(rp.summary_cat(df_cleaned[['race']]))


In [None]:
# 53% cases were female, 47% male 
display(rp.summary_cat(df_cleaned[['gender']]))
df_cleaned.gender.value_counts().plot(kind='bar', title='gender')
plt.show()


In [None]:
#~81% of cases above age 50
df_cleaned.age.value_counts().plot(kind='bar')
display(rp.summary_cat(df_cleaned[['age']]))


In [None]:
# Average time in hospital 4.26 days 
df_cleaned.agg(
    {
    'num_medications':["min", "max", "mean","median", "skew"], 
    'num_lab_procedures':["min", "max", "mean","median", "skew"],
        'time_in_hospital':["min", "max", "mean","median", "skew"], 
        'num_lab_procedures':["min", "max", "mean","median", "skew"], 
                  'num_procedures':["min", "max", "mean","median", "skew"], 
        'num_medications':["min", "max", "mean","median", "skew"], 
                  'number_outpatient':["min", "max", "mean","median", "skew"], 
        'number_emergency':["min", "max", "mean","median", "skew"], 
        'number_inpatient':["min", "max", "mean","median", "skew"], 
        'number_diagnoses':["min", "max", "mean","median", "skew"]
    }
)

In [None]:
# average time in hospital slightly longer (4.78 days) for those readmitted less than 30 days vs those not readmitted (4.21 days)
df_cleaned.loc[:,['time_in_hospital', 'num_lab_procedures', 
                  'num_procedures', 'num_medications', 
                  'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']].groupby(df_cleaned['readmitted_recoded']).mean()

In [None]:
fig, ax = plt.subplots()

sum_cols = ['time_in_hospital', 'num_lab_procedures', 
                  'num_procedures', 'num_medications', 
                  'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
ax.boxplot(df_cleaned[sum_cols].values)
plt.xticks([1, 2, 3,4,5,6,7,8], sum_cols, rotation='vertical')
plt.show()

In [None]:
df_cleaned.time_in_hospital.hist()

# Machine Learning Model: LogisticRegression

In [171]:
y = encoded_df["readmitted_recoded_le"]
X = encoded_df.drop(columns=['readmitted_recoded_le'])

In [172]:
from sklearn.model_selection import train_test_split

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [None]:
# # rebalance data 
# from imblearn.under_sampling import RandomUnderSampler
# from collections import Counter
# ros = RandomUnderSampler(random_state=1)
# X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# Counter(y_resampled)

In [173]:
# Define the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=500, random_state=1)

In [174]:
# Train the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=1)

In [175]:
# Evaluate the model
y_pred = classifier.predict(X_test)

In [176]:
d = {'Predicted': y_pred, 'Actual': y_test}
check_df = pd.DataFrame(data=d)
check_df.head()

Unnamed: 0,Predicted,Actual
4783,1,1
66950,1,1
18662,1,1
24444,1,1
13630,1,1


In [177]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9100872159742942


In [178]:
# Preprocess
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [179]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=500)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.910


In [180]:
# Determine the shape of our training and testing sets.
print(X_train_scaled.shape)
print(X_test_scaled.shape)
print(y_train.shape)
print(y_test.shape)

(52282, 31)
(17428, 31)
(52282,)
(17428,)


In [182]:
# Get Feature Importance

# Create a random forest classifier.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=128, random_state=78) 

# Fitting the model
model = model.fit(X_train_scaled, y_train)

# Calculate feature importance in the Random Forest model.
importances = model.feature_importances_

# We can sort the features by their importance.
feature_imp = sorted(zip(importances, X.columns), reverse=True)
for features in feature_imp:
    print(f"{features}")

(0.20653578488715751, 'age_le')
(0.12508202653060385, 'race_le')
(0.1075796940127439, 'a1cresult_le')
(0.08560480814510207, 'insulin_le')
(0.0693498026125794, 'gender_le')
(0.055524376899438684, 'glipizide_le')
(0.04867919246233754, 'max_glu_serum_le')
(0.04731520068109499, 'glyburide_le')
(0.04360419914347161, 'metformin_le')
(0.042301423843886035, 'medical_specialty_recoded_le')
(0.04084765902682219, 'pioglitazone_le')
(0.03827241341724251, 'rosiglitazone_le')
(0.0332839467435013, 'glimepiride_le')
(0.01617581504794587, 'repaglinide_le')
(0.01425873196251125, 'change_le')
(0.007524092472914288, 'nateglinide_le')
(0.00697869304743679, 'glyburide-metformin_le')
(0.004467362734591162, 'diabetesmed_le')
(0.004304111433777287, 'acarbose_le')
(0.0007960345723092245, 'chlorpropamide_le')
(0.0005559251567729106, 'tolbutamide_le')
(0.00043858922088415404, 'tolazamide_le')
(0.00043029130712290113, 'miglitol_le')
(3.415748427151087e-05, 'glipizide-metformin_le')
(2.9906984456839376e-05, 'trogli

In [183]:
X = encoded_df.drop(columns=['readmitted_recoded_le', 'pioglitazone_le', 'rosiglitazone_le', 'glimepiride_le', 'repaglinide_le',
                            'change_le', 'nateglinide_le', 'glyburide-metformin_le', 'diabetesmed_le', 'acarbose_le', 'chlorpropamide_le',
                            'tolbutamide_le', 'tolazamide_le', 'miglitol_le', 'glipizide-metformin_le', 'troglitazone_le', 'metformin-rosiglitazone_le',
                            'acetohexamide_le', 'metformin-pioglitazone_le', 'glimepiride-pioglitazone_le', 'examide_le', 'citoglipton_le'])

In [184]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [185]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [186]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=500)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.910


In [187]:
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[    0  1567]
 [    0 15861]]


In [188]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1567
           1       0.91      1.00      0.95     15861

    accuracy                           0.91     17428
   macro avg       0.46      0.50      0.48     17428
weighted avg       0.83      0.91      0.87     17428



  _warn_prf(average, modifier, msg_start, len(result))
