In [None]:
# Hospital Readmission Prediction - Exploratory Data Analysis
# Goal: Identify high-risk patient segments for Case Management team to priotize before discharge
# Author: Nora Le

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load cleaned data from phase 1
df = pd.read_csv('diabetic_data_cleaned.csv')

Matplotlib is building the font cache; this may take a moment.


In [8]:
# Step 1: Check if the data is clean
print(df.shape)
print("\n") 
print(df.head())
print("\n")
print(df.info())
print("\n")

(101763, 48)


   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No      

In [9]:
print(df.describe())

       encounter_id   patient_nbr  admission_type_id  \
count  1.017630e+05  1.017630e+05      101763.000000   
mean   1.652008e+08  5.432965e+07           2.024017   
std    1.026410e+08  3.869658e+07           1.445414   
min    1.252200e+04  1.350000e+02           1.000000   
25%    8.495975e+07  2.341296e+07           1.000000   
50%    1.523883e+08  4.550049e+07           1.000000   
75%    2.302698e+08  8.754571e+07           3.000000   
max    4.438672e+08  1.895026e+08           8.000000   

       discharge_disposition_id  admission_source_id  time_in_hospital  \
count             101763.000000        101763.000000     101763.000000   
mean                   3.715515             5.754459          4.396018   
std                    5.279919             4.064110          2.985092   
min                    1.000000             1.000000          1.000000   
25%                    1.000000             1.000000          2.000000   
50%                    1.000000             7.00000

In [11]:
# List the column names
print(df.columns)


Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'readmitted'],
      dtype='object')


In [None]:
# Step 2: Understand the Target Variable: Readmitted

In [15]:
# Calculate the readmission distribution + percentage
readmission_count = df['readmitted'].value_counts()
print(readmission_count)

# normalize = True : shortcut to calculate the percentage
print(df['readmitted'].value_counts(normalize=True))
print("\n" + "="*50 + "\n")

readmitted
NO     54861
>30    35545
<30    11357
Name: count, dtype: int64
readmitted
NO     0.539106
>30    0.349292
<30    0.111602
Name: proportion, dtype: float64




In [16]:
# Baseline : 30-day readmission rate (North Star Metric)
baseline_rate = (df['readmitted'] == '<30').sum() / len(df)
print(f"Baseline 30-day readmission rate: {baseline_rate:.2%}")


Baseline 30-day readmission rate: 11.16%


In [17]:
print(df['admission_source_id'])

0         1
1         7
2         7
3         7
4         7
         ..
101758    7
101759    5
101760    7
101761    7
101762    7
Name: admission_source_id, Length: 101763, dtype: int64


In [18]:
print(df['discharge_disposition_id'])

0         25
1          1
2          1
3          1
4          1
          ..
101758     3
101759     4
101760     1
101761     3
101762     1
Name: discharge_disposition_id, Length: 101763, dtype: int64


In [None]:
# Key variables: time_in_hospital, num_medication, num_procedures, and admission_source_id

In [20]:
# Profile time_in_hospital
# describe() : provide statistical summary of data
print("="*50)
print(df['time_in_hospital'].describe())
print(f"Missing values: {df['time_in_hospital'].isnull().sum()}")
print("\n")

count    101763.000000
mean          4.396018
std           2.985092
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64
Missing vaues: 0


In [23]:
# Profile num_medication
print("="*50)
print(df['num_medications'].describe())
print(f"Missing values: {df['num_medications'].isnull().sum()}")
print("\n")

count    101763.000000
mean         16.021835
std           8.127589
min           1.000000
25%          10.000000
50%          15.000000
75%          20.000000
max          81.000000
Name: num_medications, dtype: float64
Missing vaues: 0




In [24]:
# Profile num_procedures
print("="*50)
print(df['num_procedures'].describe())
print(f"Missing values: {df['num_procedures'].isnull().sum()}")
print("\n")

count    101763.000000
mean          1.339691
std           1.705792
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: num_procedures, dtype: float64
Missing values: 0




In [None]:
# Key takeaway:
# hypothesis: longer stay -> higher readmisison risk
# time_in_hospital: mean - 4.4 days, 50% stays around 4 days 

# hypothesis: high num_medication -> higher readmission risk
# num_medication: mean - 16

# hypothesis: more procedures -> higher risk
# procedures: avg: 1.3

In [None]:
# Calculate all baseline metrics


In [26]:
avg_time_in_hospital = df['time_in_hospital'].mean()
avg_meds = df['num_medications'].mean()
avg_procedures = df['num_procedures'].mean()
print(f"30-day readmission rate: {baseline_rate:.2%}")
print(f"Average time in hospital: {avg_time_in_hospital:.1f}")
print(f"Average medications: {avg_meds:.1f}")
print(f"Average procedures: {avg_procedures:.1f}")

#Note: 
# .2% : formatting for numbers
# .1f : floating point with 1 number


30-day readmission rate: 11.16%
Average time in hospital: 4.4
Average medications: 16.0
Average procedures: 1.3


In [None]:
## Test the hypothesis

In [28]:
# Step 1: Create categories
# df.cut() takes a continuous variable and convert it into bins
# value x : 0 < x <= 3 : go to bin 1 (S (1-3 days))
# 3 < x <= 7 : go to bin 2 (M (4-7 days))
# 7 < x <= 14: go to bin 3 (L (8-14 days))
df['time_in_hospital_category'] = pd.cut(df['time_in_hospital'], 
                                         bins=[0,3,7,14],
                                         labels=['S(1-3 days)', 'M (4-7 days)', 'L (8-14 days)'])

# Calculate readmission rate < 30-day for each category
                                         

In [29]:
df['time_in_hospital_category']

0           S(1-3 days)
1           S(1-3 days)
2           S(1-3 days)
3           S(1-3 days)
4           S(1-3 days)
              ...      
101758      S(1-3 days)
101759     M (4-7 days)
101760      S(1-3 days)
101761    L (8-14 days)
101762     M (4-7 days)
Name: time_in_hospital_category, Length: 101763, dtype: category
Categories (3, object): ['S(1-3 days)' < 'M (4-7 days)' < 'L (8-14 days)']

In [40]:
# Create a groupby object
grouped = df.groupby('time_in_hospital_category', observed=False)

# Count total patients in each group
total_patients = grouped.size()

# Count 30-day readmission in each group
readmission_30day = grouped['readmitted'].apply(lambda x: (x=='<30').sum())

# Calculate the readmission rate for each group
readmission_rate = readmission_30day / total_patients

# Combine to 1 table
time_in_hospital_df = pd.DataFrame({
    'Total Patients': total_patients,
    '30-day Readmissions': readmission_30day,
    'Readmission Rate': readmission_rate
})

print("="*70)
print("TIME IN HOSPITAL ANALYSIS")
print(time_in_hospital_df)

TIME IN HOSPITAL ANALYSIS
                           Total Patients  30-day Readmissions  \
time_in_hospital_category                                        
S(1-3 days)                         49186                 4768   
M (4-7 days)                        37288                 4544   
L (8-14 days)                       15289                 2045   

                           Readmission Rate  
time_in_hospital_category                    
S(1-3 days)                        0.096938  
M (4-7 days)                       0.121862  
L (8-14 days)                      0.133756  


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1397f6270>


In [None]:
# Takeaway from the time_in_hospital 
# Hypothesis: longer stay -> higher readmisison risk ==> TRUE
# Patients with extended hospital stays (8-14 days) show 20% higher readmission risk than baseline.

In [44]:
# Test 2nd Hypothesis: Number of Medications
df['med_category'] = pd.cut(df['num_medications'],
                            bins=[0,10,15,20,100],
                            labels=['Low (1-10)','Medium (11-15)','High (16-20)','Very High (21-100)'])
# Group and Analyze
grouped = df.groupby('med_category',observed=False)
total_patients = grouped.size()
readmissions_30day = grouped['readmitted'].apply(lambda x: (x == '<30').sum())
readmissions_rate = readmissions_30day / total_patients

med_analysis_df = pd.DataFrame({
     'Total Patients': total_patients,
    '30-day Readmissions': readmissions_30day,
    'Readmission Rate': readmissions_rate
})
print("="*70)
print("NUMBER OF MEDICATIONS ANALYSIS")
print(med_analysis_df)

NUMBER OF MEDICATIONS ANALYSIS
                    Total Patients  30-day Readmissions  Readmission Rate
med_category                                                             
Low (1-10)                   25860                 2344          0.090642
Medium (11-15)               29384                 3206          0.109107
High (16-20)                 22641                 2756          0.121726
Very High (21-100)           23878                 3051          0.127775


In [None]:
# Hypothesis: Higher number of medications -> Higher readmisison risk : TRUE
# However, the increase is steady, not dramatic between groups: 9 > 10 > 12.1 > 12.7
# Num of medications count but other factors maybe more important

In [45]:
# Test 3rd Hypothesis: Num of Procedures
df['procedure_category'] = pd.cut(df['num_procedures'], 
                                 bins=[-1,0,1,2,10],
                                 labels=['None (0)','Low(1)','Medium (2)', 'High (3+)'])
proc_grouped = df.groupby('procedure_category', observed=False)
total_patients = proc_grouped.size()
readmissions_30day = proc_grouped['readmitted'].apply(lambda x: (x == '<30').sum())
readmissions_rate = readmissions_30day / total_patients

proc_analysis_df = pd.DataFrame({
    'Total Patients': total_patients,
    '30-day Readmissions': readmissions_30day,
    'Readmission Rate': readmissions_rate
})
print("="*70)
print("NUMBER OF PROCEDURES ANALYSIS")
print(proc_analysis_df)

NUMBER OF PROCEDURES ANALYSIS
                    Total Patients  30-day Readmissions  Readmission Rate
procedure_category                                                       
None (0)                     46652                 5168          0.110778
Low(1)                       20741                 2532          0.122077
Medium (2)                   12716                 1422          0.111828
High (3+)                    21654                 2235          0.103214


In [None]:
# Hypothesis: more procedures -> higher risk : WRONG
# From the analysis, patients with exactly 1 procedure are highest risk (12.21%) 
# while patients with many procedures (3+) are lower risk (10.32%)

In [48]:
# Test how patients enter the system : admission_source_id
# There are many id but we focus on the big areas:
# admission_source_id : 7 : emergency room
# admission_source_id : 1 : physician transfer
# admission_source_id : 17: unknown

In [46]:
print(df['admission_source_id'].value_counts())

admission_source_id
7     57492
1     29564
17     6781
4      3187
6      2264
2      1104
5       855
3       187
20      161
9       125
8        16
22       12
10        8
14        2
11        2
25        2
13        1
Name: count, dtype: int64


In [50]:
grouped = df.groupby('admission_source_id')
total_patients = grouped.size()
readmission_30day = grouped['readmitted'].apply(lambda x: (x == '<30').sum())
readmission_rate = readmission_30day / total_patients

admission_analysis = pd.DataFrame({
     'Total Patients': total_patients,
    '30-day Readmissions': readmission_30day,
    'Readmission Rate': readmission_rate
})

print("="*70)
print("Admission Source ANALYSIS")
print(admission_analysis)

Admission Source ANALYSIS
                     Total Patients  30-day Readmissions  Readmission Rate
admission_source_id                                                       
1                             29564                 3130          0.105872
2                              1104                  111          0.100543
3                               187                   29          0.155080
4                              3187                  309          0.096956
5                               855                  101          0.118129
6                              2264                  212          0.093640
7                             57492                 6720          0.116886
8                                16                    2          0.125000
9                               125                   13          0.104000
10                                8                    0          0.000000
11                                2                    0          0.000000

In [52]:
# Takeaway: 
# Admission_source_id 7 (emergency) are slighly riskier than 
# planned admissions (admission_source_id : 1)
# Source 3 is a hidden high-risk group (15.51% readmission rate)
# Admission Source has minor effect overall

In [None]:
# Test discharge_disposition_id
# 1: home; 3: nursing facility; 6: home with health service

In [53]:
print(df['discharge_disposition_id'].value_counts())

discharge_disposition_id
1     60232
3     13954
6     12902
18     3691
2      2128
22     1992
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: count, dtype: int64


In [55]:
grouped = df.groupby('discharge_disposition_id')
total_patients = grouped.size()
readmission_30day = grouped['readmitted'].apply(lambda x: (x == '<30').sum())
readmission_rate = readmission_30day / total_patients

discharge_analysis = pd.DataFrame({
    'Total Patients': total_patients,
    '30-day Readmissions': readmission_30day,
    'Readmission Rate': readmission_rate
})

print("="*70)
print("DISCHARGE ANALYSIS")
print(discharge_analysis)

DISCHARGE ANALYSIS
                          Total Patients  30-day Readmissions  \
discharge_disposition_id                                        
1                                  60232                 5602   
2                                   2128                  342   
3                                  13954                 2046   
4                                    815                  104   
5                                   1184                  247   
6                                  12902                 1638   
7                                    623                   90   
8                                    108                   15   
9                                     21                    9   
10                                     6                    0   
11                                  1642                    0   
12                                     3                    2   
13                                   399                   19   
14    

In [56]:
# Filter for just the big 3
big_3 = df[df['discharge_disposition_id'].isin([1, 3, 6])]

# Analyze each
grouped = big_3.groupby('discharge_disposition_id')
total_patients = grouped.size()
readmissions_30day = grouped['readmitted'].apply(lambda x: (x == '<30').sum())
readmission_rate = readmissions_30day / total_patients

big3_analysis = pd.DataFrame({
    'Total Patients': total_patients,
    '30-day Readmissions': readmissions_30day,
    'Readmission Rate': readmission_rate
})

print("="*70)
print("BIG 3 DISCHARGE DESTINATIONS")
print("="*70)
print("1 = Home (no services)")
print("3 = Skilled Nursing Facility")
print("6 = Home with Health Services")
print("="*70)
print(big3_analysis)
print("\n")
print(f"Baseline rate: {baseline_rate:.2%}")
print("="*70)

# Calculate differences
for disp_id in [1, 3, 6]:
    rate = big3_analysis.loc[disp_id, 'Readmission Rate']
    vs_baseline = ((rate / baseline_rate) - 1) * 100
    print(f"Disposition {disp_id}: {rate:.2%} ({vs_baseline:+.1f}% vs baseline)")

BIG 3 DISCHARGE DESTINATIONS
1 = Home (no services)
3 = Skilled Nursing Facility
6 = Home with Health Services
                          Total Patients  30-day Readmissions  \
discharge_disposition_id                                        
1                                  60232                 5602   
3                                  13954                 2046   
6                                  12902                 1638   

                          Readmission Rate  
discharge_disposition_id                    
1                                 0.093007  
3                                 0.146625  
6                                 0.126957  


Baseline rate: 11.16%
Disposition 1: 9.30% (-16.7% vs baseline)
Disposition 3: 14.66% (+31.4% vs baseline)
Disposition 6: 12.70% (+13.8% vs baseline)


In [None]:
# Takeaway:
# The counterintuitive finding: Patients sent to skilled nursing facilities (code: 3) have higher 
# readmission rates than those sent home alone (code 1,6)
# This affects 13954 patients 

In [57]:
### Best Finding: 1 Procedure Paradox
# Most people assume more procedures = more risk.
# The analysis showed that people with exact 1 procedure have higher chance of readmission
# than those who don't have any or have more than 1

In [66]:
### Validate The Finding
print("="*70)
print("VALIDATION: 1 PROCEDURE PARADOX")
print("="*70)

# Break the segment down
none_proc = df[df['num_procedures'] == 0]
one_proc =  df[df['num_procedures'] == 1]
two_proc =  df[df['num_procedures'] == 2]
three_plus_proc = df[df['num_procedures'] > 3]
# name each segment
segments = {
    '0 proc': none_proc,
    '1 proc': one_proc,
    '2 proc': two_proc,
    '3+ proc': three_plus_proc
}
# calculate the readmision and readmission rate
for name, segment in segments.items():
    size = len(segment)
    readmission = (segment['readmitted'] == '<30').sum()
    rate = readmission / size
    total_pct = (size / len(df)) * 100
    seg_readmission_pct = (readmission) / (df['readmitted'] == '<30').sum() * 100

    print(f"\n{name}:")
    print(f" Patients : {size:,}({total_pct:.1f}% of total)")
    print(f" Readmissions: {readmission:,}({seg_readmission_pct:.1f}% of all readmissions)")
    print(f" Readmission rate: {rate:.2%}")
    print(f" vs Baseline: {((rate/baseline_rate - 1) * 100):+.1f}%")


VALIDATION: 1 PROCEDURE PARADOX

0 proc:
 Patients : 46,652(45.8% of total)
 Readmissions: 5,168(45.5% of all readmissions)
 Readmission rate: 11.08%
 vs Baseline: -0.7%

1 proc:
 Patients : 20,741(20.4% of total)
 Readmissions: 2,532(22.3% of all readmissions)
 Readmission rate: 12.21%
 vs Baseline: +9.4%

2 proc:
 Patients : 12,716(12.5% of total)
 Readmissions: 1,422(12.5% of all readmissions)
 Readmission rate: 11.18%
 vs Baseline: +0.2%

3+ proc:
 Patients : 12,211(12.0% of total)
 Readmissions: 1,226(10.8% of all readmissions)
 Readmission rate: 10.04%
 vs Baseline: -10.0%


In [72]:
# Key insight
print("\n" + "="*70)
print("KEY INSIGHT:")
print("="*70)
one_proc_rate = (one_proc['readmitted']== '<30').sum() / len(one_proc)
three_plus_proc_rate = (three_plus_proc['readmitted']== '<30').sum() / len(three_plus_proc)
difference = ((one_proc_rate - three_plus_proc_rate) / three_plus_proc_rate )*100

print(f"Patients with 1 procedure have {difference:.1f}% HIGHER readmission risk")
print(f"than patients with 3+ procedures ({one_proc_rate:.2%} vs {three_plus_proc_rate:.2%})")
print(f"This affects {len(one_proc):,} patients")


KEY INSIGHT:
Patients with 1 procedure have 21.6% HIGHER readmission risk
than patients with 3+ procedures (12.21% vs 10.04%)
This affects 20,741 patients


In [76]:
### EDA Summary
print("="*70)
print("EDA PHASE COMPLETE - SUMMARY OF FINDINGS")
print("="*70)
print(f"\nBaseline 30-day readmission rate: {baseline_rate:.2%}")
print(f"Total patients analyzed: {len(df):,}")
print(f"Total 30-day readmissions: {(df['readmitted'] == '<30').sum():,}")

print("\n" + "="*70)
print("HYPOTHESIS TESTING RESULTS")
print("="*70)

print("\n HYPOTHESIS 1: Length of Stay")
print("   Long stays (8-14 days): 13.38% readmission rate")
print("   Result: CONFIRMED - 20% higher than baseline")

print("\n HYPOTHESIS 2: Number of Medications")  
print("   High medications (21+): 12.78% readmission rate")
print("   Result: CONFIRMED - Gradual increase with med count")

print("\n HYPOTHESIS 3: Number of Procedures")
print("   Result: INVERSE PATTERN DISCOVERED")
print("   1 procedure: 12.21% (HIGHEST risk)")
print("   3+ procedures: 10.04% (LOWEST risk)")

print("\n ADDITIONAL: Discharge Disposition")
print("   SNF discharge: 14.66% (31% above baseline)")
print("   Home alone: 9.30% (17% below baseline)")

print("\n" + "="*70)
print("PRIMARY FINDING")
print("\n THE 1 PROCEDURE PARADOX")
print("   Single-procedure patients show 21.6% higher readmission risk")
print("   than multi-procedure patients (12.21% vs 10.04%)")
print("   Segment: 20,741 patients (20% of dataset)")
print("   Classification: Directional insight requiring investigation")
print("   Recommendation: Enhanced discharge planning for moderate-complexity patients")
print("="*70)

EDA PHASE COMPLETE - SUMMARY OF FINDINGS

Baseline 30-day readmission rate: 11.16%
Total patients analyzed: 101,763
Total 30-day readmissions: 11,357

HYPOTHESIS TESTING RESULTS

 HYPOTHESIS 1: Length of Stay
   Long stays (8-14 days): 13.38% readmission rate
   Result: CONFIRMED - 20% higher than baseline

 HYPOTHESIS 2: Number of Medications
   High medications (21+): 12.78% readmission rate
   Result: CONFIRMED - Gradual increase with med count

 HYPOTHESIS 3: Number of Procedures
   Result: INVERSE PATTERN DISCOVERED
   1 procedure: 12.21% (HIGHEST risk)
   3+ procedures: 10.04% (LOWEST risk)

 ADDITIONAL: Discharge Disposition
   SNF discharge: 14.66% (31% above baseline)
   Home alone: 9.30% (17% below baseline)

PRIMARY FINDING

 THE 1 PROCEDURE PARADOX
   Single-procedure patients show 21.6% higher readmission risk
   than multi-procedure patients (12.21% vs 10.04%)
   Segment: 20,741 patients (20% of dataset)
   Classification: Directional insight requiring investigation
   Re