In [77]:
# Import dependencies 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import collections
import researchpy as rp
from sklearn.preprocessing import LabelEncoder

In [92]:
pd.set_option('display.max_columns', None)

df = pd.read_csv("diabetic_data_initial.csv")

In [27]:
# Show columns with missing greater than 20%
for column in df:
    if df[df[column]=='?'].shape[0]/df.shape[0]*100 > 20:
        print(column,":",str(round(df[df[column]=='?'].shape[0]/df.shape[0]*100)) + "%")              

weight : 97%
payer_code : 40%
medical_specialty : 49%


In [28]:
# variables weight and payer_code were excluded due to quantity of missing data
# medical specialty was recoded to add "missing" for the missing values.

In [29]:
# drop weight and payer_code columns 
df_copy = df.copy()
df_copy.drop(columns=['weight', 'payer_code'], axis=1, inplace=True)

In [30]:
# In order to keep observations independent, only one the first encounter is included
# Dedupe based on first encounter
df_deduped = df_copy.drop_duplicates(subset=['patient_nbr'], keep='first')

In [31]:
# Remove encounters that resulted in either discharge to 
# a hospice or patient death to avoid biasing analysis 
discharge_disposition_excluded=[11, 13, 14, 19, 20, 23]

df_cleaned = df_deduped[~df_deduped.discharge_disposition_id.isin(discharge_disposition_excluded)]

In [32]:
# recode readmitted to be binary 
def recode_readmit(x):
    if x == '<30':
        return '<30'
    else:
        return 'NO'
    
df_cleaned['readmitted_recoded'] = df_cleaned['readmitted'].apply(recode_readmit)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
# recode medical_specialty to add missing 
df_cleaned['medical_specialty_recoded'] = df_cleaned.loc[:,['medical_specialty']].replace("?",'missing', inplace=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [34]:
# readmitted recoded is imbalanced with 91% of cases 
# not having a 30 day readmission
print(collections.Counter(df_cleaned['readmitted_recoded']))

Counter({'NO': 63444, '<30': 6269})


In [35]:
# Create a list of columns to encode for each variable if variable type is object
columns_to_encode = [column for column in df_cleaned.columns if df_cleaned[column].dtypes == 'O']

In [36]:
print(columns_to_encode)

['race', 'gender', 'age', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'readmitted_recoded', 'medical_specialty_recoded']


In [37]:
# exclude diag 1, 2, and 3 from recode list 
for i in range(3):
    to_remove = 'diag_' + str(i+1)
    columns_to_encode.remove(to_remove)
print(columns_to_encode)
print("\n", len(columns_to_encode), "columns")

['race', 'gender', 'age', 'medical_specialty', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted', 'readmitted_recoded', 'medical_specialty_recoded']

 34 columns


In [38]:
# fit transform for label encoder
columns_to_encode
le = LabelEncoder()

for column in columns_to_encode:
    new_column_name = column + "_le"
    df_cleaned[new_column_name] = le.fit_transform(df_cleaned[column])
    print(column, ":", le.classes_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


race : ['?' 'AfricanAmerican' 'Asian' 'Caucasian' 'Hispanic' 'Other']
gender : ['Female' 'Male' 'Unknown/Invalid']
age : ['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']
medical_specialty : ['?' 'AllergyandImmunology' 'Anesthesiology' 'Anesthesiology-Pediatric'
 'Cardiology' 'Cardiology-Pediatric' 'DCPTEAM' 'Dentistry' 'Dermatology'
 'Emergency/Trauma' 'Endocrinology' 'Endocrinology-Metabolism'
 'Family/GeneralPractice' 'Gastroenterology' 'Gynecology' 'Hematology'
 'Hematology/Oncology' 'Hospitalist' 'InfectiousDiseases'
 'InternalMedicine' 'Nephrology' 'Neurology' 'Neurophysiology'
 'Obsterics&Gynecology-GynecologicOnco' 'Obstetrics'
 'ObstetricsandGynecology' 'Oncology' 'Ophthalmology' 'Orthopedics'
 'Orthopedics-Reconstructive' 'Osteopath' 'Otolaryngology'
 'OutreachServices' 'Pathology' 'Pediatrics' 'Pediatrics-CriticalCare'
 'Pediatrics-EmergencyMedicine' 'Pediatrics-Endocrinology'
 'Pediatrics-Hematology-Oncology' 'Pediatrics-

In [46]:
print(len(df_cleaned.columns),"columns")

84 columns


In [40]:
# Set ? to NaN
df_cleaned.replace('?', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [41]:
# Show missing in dataset 
for column in df_cleaned.columns:
    missing_count = df_cleaned[column].isnull().sum()
    if missing_count>0:
        print(column,":",missing_count)        

race : 1914
medical_specialty : 33509
diag_1 : 10
diag_2 : 292
diag_3 : 1221


In [18]:
df_cleaned.to_csv('diabetes_dataset_cleaned.csv')

# Summary Statistics: EDA

In [93]:
display(df_cleaned.describe())

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_le,gender_le,age_le,medical_specialty_le,max_glu_serum_le,A1Cresult_le,metformin_le,repaglinide_le,nateglinide_le,chlorpropamide_le,glimepiride_le,acetohexamide_le,glipizide_le,glyburide_le,tolbutamide_le,pioglitazone_le,rosiglitazone_le,acarbose_le,miglitol_le,troglitazone_le,tolazamide_le,examide_le,citoglipton_le,insulin_le,glyburide-metformin_le,glipizide-metformin_le,glimepiride-pioglitazone_le,metformin-rosiglitazone_le,metformin-pioglitazone_le,change_le,diabetesMed_le,readmitted_le,readmitted_recoded_le,medical_specialty_recoded_le
count,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0,69713.0
mean,156561700.0,54942530.0,2.107742,3.339535,5.638217,4.261802,42.859108,1.424311,15.634573,0.279489,0.103811,0.176352,7.220891,2.604937,0.468091,6.041857,11.998121,1.987319,1.882475,1.212643,1.013355,1.006986,1.001047,1.052171,1.4e-05,1.126261,1.107957,0.000244,1.075538,1.066444,0.003012,1.000244,4.3e-05,0.00043,0.0,0.0,1.396899,1.007101,0.0001,0.0,2.9e-05,1.4e-05,0.5506,0.761293,1.502517,0.910074,45.125744
std,100420600.0,39496260.0,1.509579,5.049905,4.166744,2.925165,19.904249,1.756569,8.254492,1.063122,0.511988,0.602262,2.001597,0.95149,0.499071,1.597789,17.39209,0.296893,0.540851,0.451291,0.126558,0.087325,0.034489,0.24473,0.003787,0.370447,0.354678,0.015614,0.277964,0.260653,0.05736,0.017355,0.00656,0.02074,0.0,0.0,0.800623,0.085824,0.01002,0.0,0.005356,0.003787,0.497437,0.426297,0.655631,0.286078,27.238263
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81067700.0,23334260.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0,3.0,0.0,5.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,18.0
50%,143727700.0,48006490.0,1.0,1.0,7.0,3.0,44.0,1.0,14.0,0.0,0.0,0.0,8.0,3.0,0.0,6.0,4.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,62.0
75%,215386100.0,87514980.0,3.0,3.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,0.0,9.0,3.0,1.0,7.0,19.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,70.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,42.0,12.0,16.0,5.0,2.0,9.0,70.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,3.0,3.0,1.0,3.0,3.0,2.0,3.0,1.0,1.0,0.0,0.0,3.0,3.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,70.0


In [94]:
df_cleaned.loc[:,['time_in_hospital', 'num_lab_procedures', 
                  'num_procedures', 'num_medications', 
                  'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']].groupby(df_cleaned['readmitted_recoded_le']).mean()

Unnamed: 0_level_0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
readmitted_recoded_le,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,4.794226,44.918488,1.424629,16.61174,0.308024,0.149944,0.369118,7.511884
1,4.209192,42.655618,1.42428,15.538018,0.276669,0.099253,0.157304,7.192138


In [96]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'readmitted_recoded', 'medical_specialty_recoded', 'race_le',
       'gender_le', 'age_le', 'medical_specialt

In [140]:
# 63.57% of patients discharged were discharged to home
# the next highest discharge disposition category was discharged/transferred to SNF at a distant 12.8%
display(rp.summary_cat(df_cleaned[['discharge_disposition_id']]))

Unnamed: 0,Variable,Outcome,Count,Percent
0,discharge_disposition_id,1,44317,63.57
1,,3,8784,12.6
2,,6,8289,11.89
3,,18,2474,3.55
4,,2,1539,2.21
5,,22,1410,2.02
6,,5,913,1.31
7,,25,778,1.12
8,,4,541,0.78
9,,7,409,0.59


In [141]:
# 53.25% of admissions came from Trauma Center followed by ED
display(rp.summary_cat(df_cleaned[['admission_source_id']]))


Unnamed: 0,Variable,Outcome,Count,Percent
0,admission_source_id,7,37124,53.25
1,,1,21654,31.06
2,,17,4808,6.9
3,,4,2524,3.62
4,,6,1783,2.56
5,,2,902,1.29
6,,5,506,0.73
7,,20,153,0.22
8,,3,136,0.2
9,,9,95,0.14


In [None]:
display(rp.summary_cat(df_cleaned[['race']]))


In [None]:
display(rp.summary_cat(df_cleaned[['gender']]))


In [1]:
display(rp.summary_cat(df_cleaned[['age']]))

NameError: name 'rp' is not defined

In [139]:
# Suggest combing physician specialties 
df_cleaned.loc[:,['time_in_hospital']].groupby(df_cleaned['medical_specialty'])\
.count().sort_values(by='time_in_hospital', ascending=False)


Unnamed: 0_level_0,time_in_hospital
medical_specialty,Unnamed: 1_level_1
InternalMedicine,10582
Family/GeneralPractice,4955
Emergency/Trauma,4390
Cardiology,4199
Surgery-General,2202
...,...
Proctology,1
Perinatology,1
Neurophysiology,1
Surgery-PlasticwithinHeadandNeck,1


In [91]:
df_cleaned.agg(
    {
    'num_medications':["min", "max", "median", "skew"], 
    'num_lab_procedures':["min", "max", "median", "skew"]
    }
)

Unnamed: 0,num_medications,num_lab_procedures
min,1.0,1.0
max,81.0,132.0
median,14.0,44.0
skew,1.424745,-0.218674
