In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for any figures drawn later in this session.
sns.set_style("whitegrid")

In [2]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 891 in the UCI ML Repository.
diabetes_health_indicators = fetch_ucirepo(id=891)

X = diabetes_health_indicators.data.features
y = diabetes_health_indicators.data.targets

# The targets may arrive as a one-column DataFrame; reduce them to a Series.
if isinstance(y, pd.DataFrame):
    y = y.iloc[:, 0]
# rename() returns a new Series, so the object owned by the fetch result is
# left untouched (assigning to y.name would rename it in place).
y = y.rename('Diabetes_binary')

# Combine features and target in a single frame for easier handling.
df = X.copy()
df['Diabetes_binary'] = y

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nTarget distribution:")
# Convert to percent before rounding so the printed values carry exactly two
# decimals; rounding first and then multiplying by 100 can reintroduce
# floating-point noise (e.g. 13.930000000000001).
print(df['Diabetes_binary'].value_counts(normalize=True).mul(100).round(2))

Dataset shape: (253680, 22)
Columns: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income', 'Diabetes_binary']

Target distribution:
Diabetes_binary
0    86.07
1    13.93
Name: proportion, dtype: float64


In [3]:
target = 'Diabetes_binary'
# Every column except the target is a model input.
features = [col for col in df.columns if col != target]

# Columns treated as binary flags. They are not named in the preprocessing
# step of this notebook; they reach the model through its passthrough remainder.
binary_features = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex'
]

# Columns treated as ordered categories.
ordinal_features = ['GenHlth', 'Age', 'Education', 'Income']

# Columns treated as continuous measurements.
continuous_features = ['BMI', 'MentHlth', 'PhysHlth']

# Guard against typos and omissions: the three hand-written groups must cover
# every input column exactly once. Without this check a misspelled or
# forgotten column would silently be handled as a passthrough column.
grouped_features = binary_features + ordinal_features + continuous_features
assert len(grouped_features) == len(set(grouped_features)), (
    "A column is listed in more than one feature group"
)
assert set(grouped_features) == set(features), (
    "Feature groups do not match the input columns: "
    f"{sorted(set(grouped_features) ^ set(features))}"
)

print(f"\nBinary features:     {len(binary_features)}")
print(f"Ordinal features:    {len(ordinal_features)}")
print(f"Continuous features: {len(continuous_features)}")
print(f"Total input features: {len(features)}")


Binary features:     14
Ordinal features:    4
Continuous features: 3
Total input features: 21


In [4]:
# Separate the model inputs from the target column.
X, y = df[features], df[target]

# Hold out 20% of the rows. Stratifying on y keeps the class ratio the same in
# both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_shape, test_shape = X_train.shape, X_test.shape
print(f"\nTraining set shape: {train_shape}")
print(f"Test set shape:     {test_shape}")

# Class shares of the training partition, expressed in percent.
train_class_share = y_train.value_counts(normalize=True).round(4) * 100
print("\nTrain class distribution:")
print(train_class_share)


Training set shape: (202944, 21)
Test set shape:     (50736, 21)

Train class distribution:
Diabetes_binary
0    86.07
1    13.93
Name: proportion, dtype: float64


In [5]:
# One transformer per column group; the order of this list is also the column
# order of the processed arrays (remainder columns come last).
column_steps = [
    ('num', StandardScaler(), continuous_features),  # standardize the continuous columns
    ('ord', OrdinalEncoder(), ordinal_features),     # encode ordinal levels as integer codes
]
preprocessor = ColumnTransformer(
    transformers=column_steps,
    remainder='passthrough',         # columns not listed above (the binary ones) stay unchanged
    verbose_feature_names_out=True,  # output names carry prefixes: num__BMI, ord__GenHlth, etc.
)

# Learn the transformation parameters on the training split only, then apply
# the already-fitted transformer to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

train_processed_shape = X_train_processed.shape
test_processed_shape = X_test_processed.shape
print(f"\nProcessed train shape: {train_processed_shape}")
print(f"Processed test shape:  {test_processed_shape}")


Processed train shape: (202944, 21)
Processed test shape:  (50736, 21)


In [6]:
# Names of the processed columns, in the order the transformer emits them.
feature_names = preprocessor.get_feature_names_out()
print("\nProcessed feature names:")
print(feature_names.tolist())

# Optional: cleaner names without prefixes (useful for plots).
# Split on the FIRST '__' only: the transformer prefix is everything before
# it, so an original column name that itself contains '__' is kept intact
# (taking the piece after the last '__' would truncate such a name).
clean_names = [name.split('__', 1)[-1] for name in feature_names]
print("\nClean feature names (without prefixes):")
print(clean_names)


Processed feature names:
['num__BMI', 'num__MentHlth', 'num__PhysHlth', 'ord__GenHlth', 'ord__Age', 'ord__Education', 'ord__Income', 'remainder__HighBP', 'remainder__HighChol', 'remainder__CholCheck', 'remainder__Smoker', 'remainder__Stroke', 'remainder__HeartDiseaseorAttack', 'remainder__PhysActivity', 'remainder__Fruits', 'remainder__Veggies', 'remainder__HvyAlcoholConsump', 'remainder__AnyHealthcare', 'remainder__NoDocbcCost', 'remainder__DiffWalk', 'remainder__Sex']

Clean feature names (without prefixes):
['BMI', 'MentHlth', 'PhysHlth', 'GenHlth', 'Age', 'Education', 'Income', 'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']


In [7]:
# Make sure both output folders exist before anything is written.
for output_dir in ('../data', '../models'):
    os.makedirs(output_dir, exist_ok=True)

# Persist the fitted preprocessor so later steps can reuse the same transformation.
joblib.dump(preprocessor, '../models/diabetes_preprocessor.joblib')
print("Preprocessor saved → ../models/diabetes_preprocessor.joblib")

# Processed feature matrices and target vectors, keyed by destination file.
arrays_to_save = {
    '../data/X_train_processed.npy': X_train_processed,
    '../data/X_test_processed.npy': X_test_processed,
    '../data/y_train.npy': y_train.values,
    '../data/y_test.npy': y_test.values,
}
for npy_path, array in arrays_to_save.items():
    np.save(npy_path, array)

# Feature names, prefixed and clean (very useful for notebooks 03 & 04).
# Each entry is (destination file, column header, values).
name_tables = (
    ('../data/processed_feature_names.csv', 'feature', feature_names),
    ('../data/processed_feature_names_clean.csv', 'clean_feature', clean_names),
)
for csv_path, column_label, names in name_tables:
    pd.DataFrame({column_label: names}).to_csv(csv_path, index=False)

print("\nAll files saved successfully.")
print("→ You can now proceed to model training (notebook 03).")

Preprocessor saved → ../models/diabetes_preprocessor.joblib

All files saved successfully.
→ You can now proceed to model training (notebook 03).
