# DSCI 633 Project
### Predicting Readmittance within 30 Days for Hospitalized Diabetes Patients  
*Michael Eaton, Rochester Institute of Technology — Fall 2025*


## Feature Exploration
Take a look at different columns, explore feature importance, check missing data from columns

In [None]:
from ucimlrepo import fetch_ucirepo 
import re
import numpy as np
import pandas as pd

In [2]:
# fetch dataset 
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 
  
# data (as pandas dataframes) 
X: pd.DataFrame = diabetes_130_us_hospitals_for_years_1999_2008.data.features  # type: ignore
y :pd.DataFrame = diabetes_130_us_hospitals_for_years_1999_2008.data.targets  # type: ignore


  df = pd.read_csv(data_url)


In [3]:
# drop weight and payer_code missing and un informative (Strack et al. (2014))


X_0 = X.drop(columns=['weight', 'payer_code'])



# fix diag column types
X_0['diag_1'] = X_0['diag_1'].astype('string')
X_0['diag_2'] = X_0['diag_2'].astype('string')
X_0['diag_3'] = X_0['diag_3'].astype('string')

# fix specialty column type
X_0['medical_specialty'] = X_0['medical_specialty'].astype('string')

# fix race
X_0['race'] = X_0['race'].astype('string')

# fix age
X_0['age'] = X_0['age'].astype('string')


I'll start by copying the binning strategy used in Strack et al. to limit feature explosion

In [None]:
# bin diagnoses into groups 
_icd_num_re = re.compile(r'^(\d{3})(?:\.\d+)?$')   # e.g., '250.13' -> '250'

def _parse_icd9(code):
    """
    Returns (prefix, num) where:
      - prefix is 'E', 'V', or '' for numeric codes
      - num is an integer 3-digit number if numeric; else None
    """
    if code is None or (isinstance(code, float) and np.isnan(code)):
        return '', None
    s = str(code).strip()
    if not s:
        return '', None
    first = s[0].upper()
    if first in ('E', 'V'):
        return first, None
    m = _icd_num_re.match(s)
    if m:
        return '', int(m.group(1))
    # Try float -> int of floor integer part
    try:
        return '', int(float(s))
    except Exception:
        return '', None

def icd9_to_group(code):
    """
    Map a single ICD-9 code to the study's diagnosis group.
    """
    prefix, num = _parse_icd9(code)

    # External causes (E or V) 
    if prefix in ('E', 'V'):
        return 'Other'

    if num is None:
        return 'Other'

    # Special case: Diabetes 250.xx
    if 250 <= num <= 250:
        return 'Diabetes'

    # Primary named groups
    if (390 <= num <= 459) or (num == 785):
        return 'Circulatory'
    if (460 <= num <= 519) or (num == 786):
        return 'Respiratory'
    if (520 <= num <= 579) or (num == 787):
        return 'Digestive'
    if 800 <= num <= 999:
        return 'Injury'
    if 710 <= num <= 739:
        return 'Musculoskeletal'
    if (580 <= num <= 629) or (num == 788):
        return 'Genitourinary'
    if 140 <= num <= 239:
        return 'Neoplasms'

    return 'Other'

In [5]:
# Admission source -> 3 bins
def bin_admit_source(id: int) -> str:
    if id == 7:
        return 'emergency'
    if id == 1 or id == 2:
        return 'refer'
    return 'other'


In [6]:
# Medical Specialty of admitting physician -> 6 bins including missing
def bin_medical_specialty(value: str) -> str:
    if pd.isna(value) or value in ("Missing", "Unknown", "PhysicianNotFound", "OutreachServices", "DCPTEAM"):
        return "Missing"
    
    # normalize casing and spacing just in case
    val = str(value).strip().lower()
    
    # Internal Medicine
    if "internal" in val:
        return "Internal Medicine"
    
    # Cardiology
    if "cardio" in val:
        return "Cardiology"
    
    # Surgery (catch-all for surgical specialties)
    if "surg" in val or "orthopedic" in val or "urology" in val or "gyneco" in val or "neuro" in val or "vascular" in val or "thoracic" in val:
        return "Surgery"
    
    # Family / General Practice
    if "family" in val or "general" in val or "gp" in val or "obstetric" in val or "pediatr" in val:
        return "Family/GP"
    
    # Everything else
    return "Other"


In [7]:
# Discharge Dispostion -> 2 bins (home, other)
def bin_discharge(id: int) -> str:
    return 'home' if id == 1 else 'other'

In [8]:
# bin race -> Hispanic/Asian -> other since they have few values
def bin_race(race: str) -> str:
    if (pd.isna(race)):
        return "Other"
    race = race.strip().lower()
    if race == "caucasian":
        return "Caucasian"
    if race == "africanamerican":
        return "AfricanAmerican"
    return "Other"

In [9]:
# bin admission type id 
def bin_admit_type(id: int) -> str:
    if id in [5, 6, 8]: # Not Available, NULL, Not mapped
        return 'NA'
    if id in [1,7]: # Emergency, Trauma Center
        return 'Emergency'
    if id == 2:
        return 'Urgent'
    if id == 3:
        return 'Elective'
    return 'Other'


In [10]:
# bin age -> 3 groups
def bin_age(age: str) -> str:
    # grabs [0-10], [10-20], [20-30]
    if ('10' in age or '20' in age):
        return '<30'
    # grabs (30-40), (40-50), (50-60)
    if ('40' in age or '50' in age):
        return '30-60'
    return '>60'

In [11]:
# bin a1c results
def bin_a1c(val: str) -> str:
    if pd.isna(val):
        return 'No_test'
    if val in ('>7', '>8'):
        return 'High'
    if val.lower().startswith('norm'):
        return 'Normal'
    return 'No_test'

In [12]:
def bin_glucose(val: str) -> str:
    if pd.isna(val):
        return 'No_test'
    if val in ('>200', '>300'):
        return 'High'
    if val.lower().startswith('norm'):
        return 'Normal'
    return 'No_test'

Apply the categorical transformations

In [13]:
X_0['diag1_group'] = X_0['diag_1'].apply(icd9_to_group)
X_0['diag2_group'] = X_0['diag_2'].apply(icd9_to_group)
X_0['diag3_group'] = X_0['diag_3'].apply(icd9_to_group)
X_0['admission_source'] = X_0['admission_source_id'].apply(bin_admit_source)
X_0['discharge_loc'] = X_0['discharge_disposition_id'].apply(bin_discharge)
X_0['specialty_cat'] = X_0['medical_specialty'].apply(bin_medical_specialty)
X_0['race_cat'] = X_0['race'].apply(bin_race)
X_0['age_group'] = X_0['age'].apply(bin_age)
X_0['a1c_group'] = X_0['A1Cresult'].apply(bin_a1c)
X_0['glucose_group'] = X_0['max_glu_serum'].apply(bin_glucose)
X_0['admit_type_group'] = X_0['admission_type_id'].apply(bin_admit_type)

Drop the old columns

In [14]:

# Drop rows with invalid gender
valid_gender = (X_0['gender'] == 'Male') | (X_0['gender'] == 'Female')

X_1 = X_0.loc[valid_gender].reset_index(drop=True)
y   = y.loc[valid_gender].reset_index(drop=True)


# drop previous columns in favor of new groups
X_1 = X_1.drop(
    columns=['diag_1', 
             'diag_2', 
             'diag_3', 
             'admission_source_id',
             'discharge_disposition_id',
             'medical_specialty',
             'race',
             'age',
             'A1Cresult',
             'max_glu_serum',
             'admission_type_id'
             ])

One hot encode the new categories

In [15]:
# one hot categorical columns
X_2 = pd.get_dummies(X_1, 
                     columns=[  'diag1_group', 
                                'diag2_group', 
                                'diag3_group', 
                                'admission_source',
                                'discharge_loc',
                                'specialty_cat',
                                'race_cat',
                                'age_group',
                                'gender',
                                'a1c_group',
                                'glucose_group',
                                'admit_type_group'
                              ])

print(f'num features before: {len(X_1.columns)}')
print(f'num features after: {len(X_2.columns)}')




num features before: 45
num features after: 90


Handle drug columns by swapping them to binary flags

In [17]:
# swap to binary flags
X_2['diabetesMed_flag'] = (X_2['diabetesMed'].str.lower() == 'yes').astype(int)
X_2['change_flag'] = (X_2['change'].str.lower() == 'ch').astype(int)


# dump frame before we start dropping columns, for potential deep models if used
X_2.to_csv('../data/full_frame.csv')


cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone'
]


for col in cols:
    X_2[f'{col}_flag'] = (X_2[col].str.lower() != 'no').astype(int)


In [18]:
# sum across medicine for amount of medicine prescribed
X_2['num_drugs'] = X_2[[f'{c}_flag' for c in cols]].sum(axis=1)

Drop one base category from each one hot group to avoid linear depedent columns

In [None]:
# for regression models, drop change and diabetesMed since this info is encoded into the drug flags
X_3 = X_2.drop(
    columns=[
        'diag1_group_Circulatory', # Circulatory primary diagnosis
        'specialty_cat_Internal Medicine', # internal medicine physician specialty
        'age_group_30-60', # mid age group
        'race_cat_Caucasian',
        'gender_Male',
        'discharge_loc_home',
        'admission_source_other',
        'a1c_group_No_test',
        'glucose_group_No_test',
        'change',
        'diabetesMed'
    ]
)

# Create the list of flag columns
flag_cols = [f"{c}_flag" for c in cols]
# Drop them from the DataFrame
X_3 = X_3.drop(columns=flag_cols)

# drop drug columns
X_3 = X_3.drop(
    columns=cols
)
print(f'num features after drops: {len(X_3.columns)}')
X_3.to_csv('../data/frame_no_interactions.csv')


num features after drops: 59


['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'diag1_group_Diabetes',
 'diag1_group_Digestive',
 'diag1_group_Genitourinary',
 'diag1_group_Injury',
 'diag1_group_Musculoskeletal',
 'diag1_group_Neoplasms',
 'diag1_group_Other',
 'diag1_group_Respiratory',
 'diag2_group_Circulatory',
 'diag2_group_Diabetes',
 'diag2_group_Digestive',
 'diag2_group_Genitourinary',
 'diag2_group_Injury',
 'diag2_group_Musculoskeletal',
 'diag2_group_Neoplasms',
 'diag2_group_Other',
 'diag2_group_Respiratory',
 'diag3_group_Circulatory',
 'diag3_group_Diabetes',
 'diag3_group_Digestive',
 'diag3_group_Genitourinary',
 'diag3_group_Injury',
 'diag3_group_Musculoskeletal',
 'diag3_group_Neoplasms',
 'diag3_group_Other',
 'diag3_group_Respiratory',
 'admission_source_emergency',
 'admission_source_refer',
 'discharge_loc_other',
 'specialty_cat_Cardiology',
 'specialty_cat_Family/GP',
 

### Create interaction terms
Baseline will be the interaction terms that were found statistically signifigant in Strack et al.

That is (by P):
- Discharge disposition × Time in hospital (p < 0.001)
- Medical specialty of admitting physician × Age (p < 0.001)
- Primary diagnosis × Time in hospital (p < 0.001)
- Discharge disposition × Race (p < 0.001)
- Discharge disposition × Medical specialty of admitting physician (p = 0.001)
- Medical Specialty x Time in hospitcal (p = 0.001)

I'll leave out for now:
- Discharge disposition x Primary diagnosis (p = 0.005)
- Primary diagnosis x HbA1c (p = 0.004)


Categorical x numerical interactions

In [20]:
# Primary Diagnosis X Time in hospital interaction
X_4 = X_3.copy()
for col in X_4.filter(like='diag1_group_'):
    X_4[f'hosp_time_{col}'] = X_4[col] * X_4['time_in_hospital']

# medical specialty of admitting physician x time in hospital
for col in X_4.filter(like='specialty_cat_'):
    X_4[f'hosp_time_{col}'] = X_4[col] * X_4['time_in_hospital']

# Discharge Disposition x time in hospital
for col in X_4.filter(regex=r'^discharge_loc_'):
    X_4[f'discharge_loc_{col}'] = X_4[col] * X_4['time_in_hospital']



print(f'num features after adding numerical interaction cols: {len(X_4.columns)}')

num features after adding numerical interaction cols: 73


Categorical x Categorical interactions from one hot encoded columns

In [21]:
def add_dummy_interactions(df, left_cols, right_cols, prefix=''):
    out = df.copy()
    L = [c for c in left_cols  if c in out.columns]
    R = [c for c in right_cols if c in out.columns]
    for lc in L:
        for rc in R:
            out[f'{prefix}_{lc}__{rc}'] = out[lc] * out[rc]

    return out

In [22]:
# setup column lists
specialty_cols = X_4.filter(regex=r'^specialty_cat_').columns.to_list()
age_cols = X_4.filter(regex=r'^age_group_').columns.to_list()
diag_cols = X_4.filter(regex=r'^diag1_group_').columns.to_list()
race_cols = X_4.filter(regex=r'^race_cat_').columns.to_list()
discharge_cols = X_4.filter(regex=r'discharge_loc_').columns.to_list()

In [24]:
# medical specialty of admitting physician x age bin

X_4 = add_dummy_interactions(
    X_4,
    left_cols=specialty_cols,
    right_cols=age_cols
)

# discharge dispo x race
X_4 = add_dummy_interactions(
    X_4,
    left_cols= discharge_cols,
    right_cols=race_cols
)

# discharge x specialty

X_4 = add_dummy_interactions(
    X_4,
    left_cols=discharge_cols,
    right_cols=specialty_cols
)

print(f'num features after adding categorical interaction cols: {len(X_4.columns)}')


num features after adding categorical interaction cols: 97
