In [713]:
# Import dependencies 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import collections
import researchpy as rp
from sklearn.preprocessing import LabelEncoder
import re
import os
import matplotlib.pyplot as plt

## Add code here to connect to DB to read raw data

In [714]:
# DB code goes here

In [715]:
pd.set_option('display.max_columns', None)

# df = pd.read_csv("diabetic_data_initial.csv")

dataset = '../database/diabetic_data_initial.csv'
df = pd.read_csv(dataset)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [716]:
# Show columns with missing greater than 20%
for column in df:
    if df[df[column]=='?'].shape[0]/df.shape[0]*100 > 20:
        print(column,":",str(round(df[df[column]=='?'].shape[0]/df.shape[0]*100)) + "%")              

weight : 97%
payer_code : 40%
medical_specialty : 49%


In [717]:
# variables weight and payer_code were excluded due to quantity of missing data
# medical specialty was recoded to add "missing" for the missing values.

In [718]:
# drop weight and payer_code columns 
df_copy = df.copy()
df_copy.drop(columns=['weight', 'payer_code'], axis=1, inplace=True)

In [719]:
# In order to keep observations independent, only one the first encounter is included
# Dedupe based on first encounter
df_deduped = df_copy.drop_duplicates(subset=['patient_nbr'], keep='first')

In [720]:
# Remove encounters that resulted in either discharge to 
# a hospice or patient death to avoid biasing analysis 
discharge_disposition_excluded=[11, 13, 14, 19, 20, 23]

df_cleaned = df_deduped[~df_deduped.discharge_disposition_id.isin(discharge_disposition_excluded)]

In [721]:
df_cleaned['gender'].value_counts()

Female             37084
Male               32626
Unknown/Invalid        3
Name: gender, dtype: int64

In [722]:
df_cleaned.drop(df_cleaned[(df_cleaned.loc[:,'gender'] == "Unknown/Invalid")].index, inplace = True)
df_cleaned['gender'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Female    37084
Male      32626
Name: gender, dtype: int64

In [723]:
# recode readmitted to be binary 
# def recode_readmit(x):
#     if x == '<30':
#         return '<30'
#     else:
#         return 'NO'
    
# df_cleaned['readmitted_recoded'] = df_cleaned[df_cleaned.loc[:,'readmitted']].apply(recode_readmit)
df_cleaned['readmitted_recoded'] = df_cleaned.loc[:,['readmitted']].replace({'>30': 'NO'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [724]:
# recode medical_specialty to add missing 
df_cleaned['medical_specialty_recoded'] = df_cleaned.loc[:,['medical_specialty']].replace("?",'missing', inplace=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [725]:
# readmitted recoded is imbalanced with 91% of cases 
# not having a 30 day readmission
print(collections.Counter(df_cleaned['readmitted_recoded']))

Counter({'NO': 63441, '<30': 6269})


In [726]:
df_cleaned['medical_specialty_recoded'].value_counts()

missing                             33507
InternalMedicine                    10582
Family/GeneralPractice               4955
Emergency/Trauma                     4390
Cardiology                           4198
                                    ...  
Resident                                1
SportsMedicine                          1
Psychiatry-Addictive                    1
Dermatology                             1
Surgery-PlasticwithinHeadandNeck        1
Name: medical_specialty_recoded, Length: 71, dtype: int64

In [727]:
df_cleaned['medical_specialty_recoded'] = df_cleaned.loc[:,['medical_specialty_recoded']].replace({'Family/GeneralPractice': 'InternalMedicine'})

def values_to_other(col_name, value_unchanged):
    v=[]
    for value in col_name:
        if (value != value_unchanged):
            v.append("Other")
        else:
            v.append(value)
    return v
    

df_cleaned['medical_specialty_recoded'] = values_to_other(df_cleaned['medical_specialty_recoded'].values,"InternalMedicine")    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [728]:
df_cleaned['medical_specialty_recoded'].value_counts()

Other               54173
InternalMedicine    15537
Name: medical_specialty_recoded, dtype: int64

In [729]:
# readmitted recoded is imbalanced with 91% of cases 
# not having a 30 day readmission
print(collections.Counter(df_cleaned['readmitted_recoded']))

Counter({'NO': 63441, '<30': 6269})


In [730]:
# function to clean 'age' column
def parse_age_range(age_col):
    c=[]
    for values in age_col:
        s = re.sub('[[)]','', values)
        c.append(s)
    return c

# replace 'age' values with cleaned values
df_cleaned['age'] = parse_age_range(df_cleaned['age'].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [731]:
df_cleaned = df_cleaned.drop(columns=['readmitted', 'medical_specialty'])

In [732]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [733]:
df_cleaned.dtypes

encounter_id                  int64
patient_nbr                   int64
race                         object
gender                       object
age                          object
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                       object
diag_2                       object
diag_3                       object
number_diagnoses              int64
max_glu_serum                object
A1Cresult                    object
metformin                    object
repaglinide                  object
nateglinide                  object
chlorpropamide               object
glimepiride                  object
acetohexamide                object
glipizide                   

# DB code goes here

In [734]:
# DB code to export cleaned dataset to DB

In [735]:
df_cleaned.to_csv('../database/diabetes_dataset_cleaned.csv')

# Data Pre-Processing

# DB code goes here

In [736]:
# DB code to read cleaned dataset from DB

In [737]:
# Create a list of columns to encode for each variable if variable type is object
columns_to_encode = [column for column in df_cleaned.columns if df_cleaned[column].dtypes == 'O']


In [738]:
print(columns_to_encode)

['race', 'gender', 'age', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded', 'medical_specialty_recoded']


In [739]:
# exclude diag 1, 2, and 3 from recode list 
for i in range(3):
    to_remove = 'diag_' + str(i+1)
    columns_to_encode.remove(to_remove)
print(columns_to_encode)
print("\n", len(columns_to_encode), "columns")

['race', 'gender', 'age', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded', 'medical_specialty_recoded']

 32 columns


In [740]:
encoded_df = df_cleaned.copy()

In [741]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [742]:
# function to apply label encoding
def apply_encoder(cols):
    le = LabelEncoder()
    for c in cols:
        new_column_name = c + "_le"
        le.fit(df_cleaned[c])
        encoded_df[new_column_name] = le.transform(encoded_df[c])
        print(c, ":", le.classes_)

In [743]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [744]:
# Apply Encoding
apply_encoder(columns_to_encode)

race : ['?' 'AfricanAmerican' 'Asian' 'Caucasian' 'Hispanic' 'Other']
gender : ['Female' 'Male']
age : ['0-10' '10-20' '20-30' '30-40' '40-50' '50-60' '60-70' '70-80' '80-90'
 '90-100']
max_glu_serum : ['>200' '>300' 'None' 'Norm']
A1Cresult : ['>7' '>8' 'None' 'Norm']
metformin : ['Down' 'No' 'Steady' 'Up']
repaglinide : ['Down' 'No' 'Steady' 'Up']
nateglinide : ['Down' 'No' 'Steady' 'Up']
chlorpropamide : ['Down' 'No' 'Steady' 'Up']
glimepiride : ['Down' 'No' 'Steady' 'Up']
acetohexamide : ['No' 'Steady']
glipizide : ['Down' 'No' 'Steady' 'Up']
glyburide : ['Down' 'No' 'Steady' 'Up']
tolbutamide : ['No' 'Steady']
pioglitazone : ['Down' 'No' 'Steady' 'Up']
rosiglitazone : ['Down' 'No' 'Steady' 'Up']
acarbose : ['No' 'Steady' 'Up']
miglitol : ['Down' 'No' 'Steady' 'Up']
troglitazone : ['No' 'Steady']
tolazamide : ['No' 'Steady']
examide : ['No']
citoglipton : ['No']
insulin : ['Down' 'No' 'Steady' 'Up']
glyburide-metformin : ['Down' 'No' 'Steady' 'Up']
glipizide-metformin : ['No' 'Stea

In [745]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [746]:
encoded_df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded', 'race_le', 'gender_le', 'age_le',
       'max_glu_serum_le', 'A1Cresult_le', 'metformin_le', 'repagl

In [747]:
cols_to_drop = ['race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded']
encoded_df = encoded_df.drop(columns=cols_to_drop)

In [748]:
print(len(encoded_df.columns),"columns")

34 columns


In [683]:
# Set ? to NaN
encoded_df.replace('?', np.nan, inplace=True)

In [684]:
# Show missing in dataset 
for column in df_cleaned.columns:
    missing_count = df_cleaned[column].isnull().sum()
    if missing_count>0:
        print(column,":",df_cleaned)        

# Machine Learning Model: LogisticRegression

In [685]:
y = encoded_df["readmitted_recoded_le"]
X = encoded_df.drop(columns=['readmitted_recoded_le'])

In [686]:
from sklearn.model_selection import train_test_split

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [687]:
# Define the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=500, random_state=1)

In [688]:
# Train the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=1)

In [689]:
# Evaluate the model
y_pred = classifier.predict(X_test)

In [690]:
d = {'Predicted': y_pred, 'Actual': y_test}
check_df = pd.DataFrame(data=d)
check_df.head()

Unnamed: 0,Predicted,Actual
5752,1,1
96945,1,1
25492,1,1
33015,1,1
17955,1,1


In [691]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9100872159742942


# Summary Statistics: EDA

In [631]:
display(df_cleaned.describe())

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0,69710.0
mean,156560100.0,54941460.0,2.107761,3.339334,5.638244,4.261842,42.859489,1.424258,15.634543,0.279501,0.103816,0.176359,7.220944
std,100421500.0,39496630.0,1.509596,5.049503,4.16679,2.925141,19.904059,1.756553,8.254529,1.063143,0.511998,0.602274,2.001574
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,81066990.0,23333570.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,143726800.0,48001960.0,1.0,1.0,7.0,3.0,44.0,1.0,14.0,0.0,0.0,0.0,8.0
75%,215385900.0,87514310.0,3.0,3.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,0.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,42.0,12.0,16.0


In [632]:
df_cleaned.loc[:,['time_in_hospital', 'num_lab_procedures', 
                  'num_procedures', 'num_medications', 
                  'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']].groupby(df_cleaned['readmitted_recoded']).mean()

Unnamed: 0_level_0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
readmitted_recoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
<30,4.794226,44.918488,1.424629,16.61174,0.308024,0.149944,0.369118,7.511884
NO,4.209234,42.656027,1.424221,15.53798,0.276682,0.099258,0.157312,7.192194


In [633]:
df_cleaned.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted_recoded',
       'medical_specialty_recoded'],
      dtype='object')

In [635]:
# 63.57% of patients discharged were discharged to home
# the next highest discharge disposition category was discharged/transferred to SNF at a distant 12.8%
display(rp.summary_cat(df_cleaned[['discharge_disposition_id']]))

Unnamed: 0,Variable,Outcome,Count,Percent
0,discharge_disposition_id,1,44315,63.57
1,,3,8784,12.6
2,,6,8289,11.89
3,,18,2474,3.55
4,,2,1539,2.21
5,,22,1409,2.02
6,,5,913,1.31
7,,25,778,1.12
8,,4,541,0.78
9,,7,409,0.59


In [637]:
# 53.25% of admissions came from Trauma Center followed by ED
display(rp.summary_cat(df_cleaned[['admission_source_id']]))


Unnamed: 0,Variable,Outcome,Count,Percent
0,admission_source_id,7,37122,53.25
1,,1,21653,31.06
2,,17,4808,6.9
3,,4,2524,3.62
4,,6,1783,2.56
5,,2,902,1.29
6,,5,506,0.73
7,,20,153,0.22
8,,3,136,0.2
9,,9,95,0.14


In [638]:
display(rp.summary_cat(df_cleaned[['race']]))


Unnamed: 0,Variable,Outcome,Count,Percent
0,race,Caucasian,52117,74.76
1,,AfricanAmerican,12550,18.0
2,,?,1912,2.74
3,,Hispanic,1499,2.15
4,,Other,1144,1.64
5,,Asian,488,0.7


In [639]:
display(rp.summary_cat(df_cleaned[['gender']]))


Unnamed: 0,Variable,Outcome,Count,Percent
0,gender,Female,37084,53.2
1,,Male,32626,46.8


In [640]:
display(rp.summary_cat(df_cleaned[['age']]))

Unnamed: 0,Variable,Outcome,Count,Percent
0,age,70-80,17673,25.35
1,,60-70,15621,22.41
2,,50-60,12320,17.67
3,,80-90,11046,15.85
4,,40-50,6809,9.77
5,,30-40,2692,3.86
6,,90-100,1743,2.5
7,,20-30,1119,1.61
8,,10-20,534,0.77
9,,0-10,153,0.22


In [235]:
# Suggest combing physician specialties 
# df_cleaned.loc[:,['time_in_hospital']].groupby(df_cleaned['medical_specialty'])\
# .count().sort_values(by='time_in_hospital', ascending=False)


In [641]:
df_cleaned.agg(
    {
    'num_medications':["min", "max", "median", "skew"], 
    'num_lab_procedures':["min", "max", "median", "skew"]
    }
)

Unnamed: 0,num_medications,num_lab_procedures
min,1.0,1.0
max,81.0,132.0
median,14.0,44.0
skew,1.42481,-0.218656
