In [None]:
# 🔧 Repo Setup (Colab)
# This cell ensures a fresh copy of the repo is available in Colab every time.
# It removes any old copy, clones the latest repo, and sets up the environment.

import os, sys

REPO_URL = "https://github.com/mkennedy85/diabetes-crispdm.git"
REPO_DIR = "diabetes-crispdm"

# Always start fresh
!rm -rf $REPO_DIR
print("Cloning repo from:", REPO_URL)
!git clone $REPO_URL

# Change working directory to repo root
%cd $REPO_DIR

# Ensure Python can import the local package
import sys, os
sys.path.append(os.getcwd())

print("✅ Repo ready at:", os.getcwd())
print("Contents:", os.listdir('.'))

In [None]:
# 📤 Upload Dataset into Colab
# Opens Colab's upload dialog, then moves every uploaded file from the current
# working directory into the "data/" folder (created if missing).
# Example input: diabetes_binary_health_indicators_BRFSS2015.csv

import os

from google.colab import files

uploaded = files.upload()
os.makedirs("data", exist_ok=True)

# `uploaded` is keyed by filename; each name is used as a path relative to the
# current working directory when moving the file.
for filename in uploaded:
    destination = f"data/{filename}"
    os.rename(filename, destination)

print("Available files in data/:", os.listdir("data"))

In [None]:
import pandas as pd

# Location of the uploaded dataset inside the "data/" folder.
# To work with the multiclass dataset instead (if it was uploaded), switch to:
# DATA_PATH = "data/diabetes_012_health_indicators_BRFSS2015.csv"
DATA_PATH = "data/diabetes_binary_health_indicators_BRFSS2015.csv"

# Load the CSV, report its (rows, columns) shape, and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.shape)
df.head(n=5)


**Author:** Michael Kennedy  
**Run date:** 2025-09-14T04:04:01.008526Z

> This notebook follows CRISP-DM: Business Understanding → Data Understanding → Data Preparation → Modeling → Evaluation → Deployment.

**Constraints:** 8 GB RAM; pure NumPy/pandas/scikit-learn, plus a compact PyTorch MLP for deep learning.


## Phase 2: Data Understanding

### Class distribution

![Class Distribution](../reports/figures/class_distribution.png)

### Top-variance correlation heatmap

![Correlation Heatmap](../reports/figures/corr_heatmap_top20.png)

### Univariate distributions of top features

![Univariate Top 6](../reports/figures/univariate_top6.png)

### Simple importance proxy (|corr| or variance ratio)

![Feature proxy](../reports/figures/feature_importance_proxy.png)

## Phase 3: Data Preparation & Splits

In [None]:

# This section is handled by the training scripts; refer to btds/train.py for deterministic splits and scaling.


## Phase 4: Baseline Modeling (scikit-learn)

![Confusion Matrix](../reports/figures/cm_logreg_test.png)

![ROC Curve](../reports/figures/roc_logreg_test.png)

![PR Curve](../reports/figures/pr_logreg_test.png)

![Calibration](../reports/figures/calibration_logreg_test.png)

![Permutation Importance](../reports/figures/permutation_importance_val.png)

## Phase 4: Deep Learning Baseline (PyTorch MLP)

In [None]:

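# Training runs outside this notebook. From a terminal at the repo root
# (or from a Colab cell, by prefixing the command with `!`), run:
#   python -m btds.train --data_path data/your_file.csv --out_dir reports
# The notebook focuses on EDA visuals while the training script saves metrics and model weights.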


## Phase 5: Evaluation

In [None]:

import json, os, pprint

# Metrics file for the logistic-regression baseline. It is expected under
# "reports/", which matches the --out_dir used when running the training script
# outside the notebook (presumably btds.train writes it — confirm against the script).
METRICS_PATH = "reports/baseline_logreg_metrics.json"

# On a fresh run the training step may not have been executed yet. Report that
# clearly instead of failing the cell with a bare FileNotFoundError.
if os.path.isfile(METRICS_PATH):
    with open(METRICS_PATH, encoding="utf-8") as f:
        metrics = json.load(f)
    pprint.pp(metrics)
else:
    metrics = None
    print(
        f"Metrics file not found: {METRICS_PATH}\n"
        "Run the training step first, e.g.:\n"
        "  python -m btds.train --data_path data/your_file.csv --out_dir reports"
    )
