# 04 — Hyperparameter Optimization with Ray Tune
*Generated: 2025-10-14 18:31*

This notebook shows two ways to do HPO with **Ray**:
1) **Native Ray Tune**: define a trainable function; works with any model (recommended).
2) **tune-sklearn**: sklearn-style `TuneSearchCV` if available.

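It integrates with the **FlexibleModel** wrapper from the notebook `03_model_controller.ipynb`; if that notebook has not been run, a minimal fallback wrapper is defined below so this notebook stays runnable on its own.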


In [None]:
# Optional installs (uncomment if needed)
# !pip -q install 'ray[tune]>=2.9.0' tune-sklearn imbalanced-learn


In [None]:
import os, sys, warnings, json
import numpy as np
import pandas as pd
from typing import Dict, Any

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

try:
    import ray
    from ray import tune
    from ray.tune.schedulers import ASHAScheduler
    from ray.tune.search.hyperopt import HyperOptSearch
    HAS_RAY = True
except Exception as e:
    HAS_RAY = False
    warnings.warn(f'Ray not available: {e}')

try:
    from tune_sklearn import TuneSearchCV
    HAS_TUNE_SKLEARN = True
except Exception:
    HAS_TUNE_SKLEARN = False

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE = None


In [None]:
try:
    # If you placed the class in the same dir as a .py, you could do:
    # from model_controller import FlexibleModel
    FlexibleModel  # noqa: just to check name exists if you ran 03 notebook already
except NameError:
    # Minimal fallback wrapper (subset of 03) to keep this notebook runnable standalone
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression, Perceptron
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.cluster import KMeans
    try:
        from imblearn.pipeline import Pipeline as ImbPipeline
    except Exception:
        ImbPipeline = None

    def _ensure_2d(X):
        """Promote a pandas Series to a one-column DataFrame.

        Any other input is handed back untouched.
        """
        return X.to_frame() if isinstance(X, pd.Series) else X

    class FlexibleModel:
        """Minimal stand-in for the FlexibleModel wrapper of notebook 03.

        Chains optional numeric preprocessing (median imputation followed by
        the given scaler), an optional sampler and a classifier into a single
        pipeline. The pipeline is rebuilt on every ``fit`` call.

        Args:
            scaler: Transformer applied to the numeric columns after median
                imputation, or ``None`` to skip preprocessing entirely.
            sampler: Resampling step placed right before the classifier, or
                ``None``. Requires ``imblearn`` to be importable.
            clf: Final estimator. ``LogisticRegression(max_iter=1000)`` is
                used when ``None``.
        """

        def __init__(self, scaler=None, sampler=None, clf=None):
            self.scaler = scaler
            self.sampler = sampler
            # Compare against None explicitly. ``clf or default`` truth-tests
            # the estimator, which calls ``__len__`` when the object defines
            # one; scikit-learn ensembles do, and it fails before fitting.
            self.clf = clf if clf is not None else LogisticRegression(max_iter=1000)
            self.pipeline = None

        def _pre(self, X):
            """Return the numeric preprocessing transformer, or None without a scaler."""
            if self.scaler is None:
                return None
            if isinstance(X, pd.DataFrame):
                # Only numeric columns are imputed/scaled; the rest pass through.
                cols = X.select_dtypes(include=[np.number]).columns.tolist()
            else:
                # Non-DataFrame input: every column index (needs a 2-D ``shape``).
                cols = list(range(X.shape[1]))
            num = Pipeline([
                ('imp', SimpleImputer(strategy='median')),
                ('sc', self.scaler)
            ])
            return ColumnTransformer([('num', num, cols)], remainder='passthrough')

        def _make(self, X):
            """Assemble ``self.pipeline`` from the configured components."""
            pre = self._pre(X)
            steps = [('pre', pre)] if pre is not None else []
            if self.sampler is not None:
                if ImbPipeline is None:
                    raise RuntimeError('imblearn missing for sampler usage')
                self.pipeline = ImbPipeline(steps + [('sampler', self.sampler), ('clf', self.clf)])
            else:
                self.pipeline = Pipeline(steps + [('clf', self.clf)])

        def fit(self, X, y):
            """Build a fresh pipeline for ``X`` and fit it; returns ``self``."""
            X = _ensure_2d(X)
            self._make(X)
            self.pipeline.fit(X, y)
            return self

        def predict(self, X, y=None):
            """Return the pipeline's predictions for ``X`` (``y`` is ignored)."""
            X = _ensure_2d(X)
            return self.pipeline.predict(X)

        def predict_proba(self, X):
            """Return one score per sample.

            Uses column 1 of the classifier's ``predict_proba`` when
            available, otherwise a min-max rescaled ``decision_function``,
            otherwise the plain predictions.
            """
            X = _ensure_2d(X)
            clf = self.pipeline.named_steps['clf']
            if hasattr(clf, 'predict_proba'):
                return self.pipeline.predict_proba(X)[:, 1]
            if hasattr(clf, 'decision_function'):
                s = self.pipeline.decision_function(X)
                # Rescale to [0, 1]; the epsilon avoids division by zero
                # when all scores are equal.
                s = (s - s.min()) / (s.max() - s.min() + 1e-12)
                return s
            return self.pipeline.predict(X)


## A) Native Ray Tune (recommended)
Define a trainable that builds a `FlexibleModel` from a **config** dict, runs Stratified K-Fold, and reports the **ROC-AUC**.

In [None]:
def make_model_from_config(config: Dict[str, Any]):
    """Build an unfitted ``FlexibleModel`` from a flat hyperparameter dict.

    Recognised keys (all optional):
        scaler: ``'standard'`` enables ``StandardScaler``; any other value
            or a missing key disables scaling.
        sampler: ``'smote'`` enables SMOTE; anything else disables sampling.
        smote_k: ``k_neighbors`` for SMOTE (default 5).
        clf: ``'rf'`` (default) or ``'svm'``.
        n_estimators, max_depth, min_samples_leaf: read only for ``'rf'``.
        C, kernel, gamma: read only for ``'svm'``.

    Numeric values are cast with ``int``/``float`` before being passed on.

    Raises:
        ValueError: If ``clf`` is neither ``'rf'`` nor ``'svm'``.
    """
    import warnings

    scaler = None
    if config.get('scaler') == 'standard':
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

    sampler = None
    if config.get('sampler') == 'smote':
        if SMOTE is None:
            # Make the downgrade visible: otherwise results would be
            # attributed to a SMOTE configuration that was never applied.
            warnings.warn(
                "sampler='smote' requested but imbalanced-learn is not "
                "available; continuing without resampling"
            )
        else:
            sampler = SMOTE(random_state=RANDOM_STATE, k_neighbors=int(config.get('smote_k', 5)))

    clf_type = config.get('clf', 'rf')
    if clf_type == 'rf':
        max_depth = config.get('max_depth')
        clf = RandomForestClassifier(
            n_estimators=int(config.get('n_estimators', 200)),
            max_depth=None if max_depth is None else int(max_depth),
            min_samples_leaf=int(config.get('min_samples_leaf', 1)),
            n_jobs=-1,
            random_state=RANDOM_STATE
        )
    elif clf_type == 'svm':
        clf = SVC(
            C=float(config.get('C', 1.0)),
            kernel=config.get('kernel', 'rbf'),
            gamma=config.get('gamma', 'scale'),
            probability=True,
            random_state=RANDOM_STATE
        )
    else:
        raise ValueError(f'Unsupported clf: {clf_type}')

    return FlexibleModel(scaler=scaler, sampler=sampler, clf=clf)


In [None]:
def tune_objective(config: Dict[str, Any], X=None, y=None, n_splits=3):
    """Ray Tune trainable: cross-validated ROC-AUC for one configuration.

    Builds a model from ``config``, evaluates it with shuffled stratified
    K-fold cross-validation and returns the mean ROC-AUC. The metrics are
    returned as a dict rather than sent through the keyword-style
    ``tune.report(roc_auc=...)`` call, so the function does not depend on a
    particular reporting signature and can also be called directly.

    Args:
        config: Hyperparameters understood by ``make_model_from_config``.
        X: Feature table (pandas object or array-like); normally bound with
            ``tune.with_parameters``.
        y: Target labels aligned with ``X``.
        n_splits: Number of cross-validation folds.

    Returns:
        ``{'roc_auc': <mean score over folds>}``.

    Raises:
        ValueError: If ``X`` or ``y`` was not supplied.
    """
    if X is None or y is None:
        raise ValueError(
            'tune_objective needs X and y; bind them with tune.with_parameters'
        )

    def _rows(data, idx):
        # Positional row selection for pandas objects and plain array-likes.
        return data.iloc[idx] if hasattr(data, 'iloc') else np.asarray(data)[idx]

    model = make_model_from_config(config)
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    for tr, te in cv.split(X, y):
        model.fit(_rows(X, tr), _rows(y, tr))
        yprob = np.asarray(model.predict_proba(_rows(X, te)))
        if yprob.ndim == 2:
            # A wrapper may return the full probability matrix; keep the last
            # column (assumed to be the positive class -- verify for the
            # wrapper in use) so there is exactly one score per sample.
            yprob = yprob[:, -1]
        scores.append(roc_auc_score(_rows(y, te), yprob.reshape(-1)))
    return {'roc_auc': float(np.mean(scores))}


### Example search spaces
Choose between RandomForest and SVM configurations, with optional scaling and SMOTE.

In [None]:
# Options shared by every trial: classifier family plus preprocessing toggles.
search_space = dict(
    clf=tune.choice(['rf', 'svm']),
    scaler=tune.choice(['standard', 'none']),
    sampler=tune.choice(['smote', 'none']),
    smote_k=tune.choice([3, 5, 7]),
)

# RandomForest parameters (only read by the model builder when clf='rf').
rf_space = dict(
    n_estimators=tune.qrandint(100, 500, 50),
    max_depth=tune.choice([None, 5, 10, 20]),
    min_samples_leaf=tune.choice([1, 2, 4]),
)

# SVM parameters (only read by the model builder when clf='svm').
svm_space = dict(
    C=tune.loguniform(1e-2, 1e2),
    kernel=tune.choice(['rbf', 'linear']),
    gamma=tune.choice(['scale', 'auto']),
)

def merged_space():
    """Return one flat search space: shared options plus all RF and SVM keys.

    The result is a superset; every trial samples all keys and the model
    builder reads only those relevant to the chosen classifier. A new dict
    is created on each call, so the module-level spaces are never mutated.
    """
    return {**search_space, **rf_space, **svm_space}


In [None]:
def run_ray_tune(X, y, num_samples=20, max_trials=20):
    """Run the native Ray Tune search and return its analysis object.

    Uses an ASHA scheduler and a HyperOpt searcher to maximise ``roc_auc``
    over the merged search space, with one CPU per trial. Results are stored
    under ``./ray_results`` (resolved to an absolute path).

    Args:
        X: Training features passed to every trial.
        y: Training labels passed to every trial.
        num_samples: Number of configurations to sample.
        max_trials: Currently unused; kept so existing calls keep working.

    Returns:
        The object returned by ``tune.run``, or ``None`` when Ray is not
        installed.
    """
    if not HAS_RAY:
        print('Ray not available. Please install ray[tune].')
        return None
    ray.init(ignore_reinit_error=True, include_dashboard=False)
    # Metric and mode are given once, to tune.run: that is what makes
    # ``analysis.best_config`` / ``analysis.best_result`` usable afterwards.
    # The scheduler and searcher pick them up from there; Tune rejects
    # specifying them in both places.
    scheduler = ASHAScheduler(grace_period=1, reduction_factor=2)
    algo = HyperOptSearch()
    analysis = tune.run(
        tune.with_parameters(tune_objective, X=X, y=y, n_splits=3),
        config=merged_space(),
        metric='roc_auc',
        mode='max',
        num_samples=num_samples,
        scheduler=scheduler,
        search_alg=algo,
        resources_per_trial={'cpu': 1},
        # ``storage_path`` replaces the deprecated ``local_dir``.
        storage_path=os.path.abspath('./ray_results'),
        name='hpo_flexible_model'
    )
    print('Best config:', analysis.best_config)
    print('Best roc_auc:', analysis.best_result['roc_auc'])
    return analysis


## B) tune-sklearn (sklearn-compatible)
If installed, you can keep an sklearn-style flow using `TuneSearchCV`.

In [None]:
def run_tune_sklearn(X, y):
    """Tune a StandardScaler + RandomForest pipeline with ``TuneSearchCV``.

    Runs 20 HyperOpt-guided trials scored by ROC-AUC on a shuffled 3-fold
    stratified split, prints the best parameters and score, and returns the
    fitted search object.

    Args:
        X: Training features.
        y: Training labels.

    Returns:
        The fitted ``TuneSearchCV`` instance, or ``None`` when tune-sklearn
        is not installed.
    """
    if not HAS_TUNE_SKLEARN:
        print('tune-sklearn not available. Please install tune-sklearn.')
        return None
    # Build a Pipeline estimator
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    est = Pipeline([
        ('sc', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))
    ])
    # Search space uses ray.tune distributions where possible
    param_distributions = {
        'clf__n_estimators': tune.qrandint(100, 500, 50),
        'clf__max_depth': tune.choice([None, 5, 10, 20]),
        'clf__min_samples_leaf': tune.choice([1, 2, 4])
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    tuner = TuneSearchCV(
        est, param_distributions=param_distributions,
        n_trials=20,
        scoring='roc_auc',
        cv=cv,
        search_optimization='hyperopt',
        # Early stopping is off on purpose: for warm-start ensembles
        # tune-sklearn grows ``n_estimators`` itself, which conflicts with
        # tuning ``clf__n_estimators``. ``max_iters`` is therefore omitted.
        early_stopping=False,
        verbose=1
    )
    tuner.fit(X, y)
    print('Best params:', tuner.best_params_)
    print('Best score:', tuner.best_score_)
    return tuner


## Finalize best model
After getting `best_config` from Ray Tune (native), rebuild a final model and `fit` on the full training set.

In [None]:
def build_best_model(best_config: Dict[str, Any]):
    """Rebuild an unfitted model from the winning trial's configuration."""
    final_model = make_model_from_config(best_config)
    return final_model

# Example:
# analysis = run_ray_tune(X_train, y_train, num_samples=30)
# best = analysis.best_config
# final_model = build_best_model(best)
# final_model.fit(X_train, y_train)
# y_pred = final_model.predict(X_test)
# y_prob = final_model.predict_proba(X_test)


In [None]:
# What’s inside:

# Native Ray Tune trainable that builds your FlexibleModel from a config, runs Stratified K-Fold, and reports ROC-AUC (with ASHA scheduler + HyperOpt search).
# Optional tune-sklearn (TuneSearchCV) example for a scikit-learn-style flow.
# Search spaces for RF and SVM, merged into one flat space (the model builder reads only the keys relevant to the chosen classifier), plus toggles for scaler and SMOTE.
# Helper to rebuild the best model from best_config and fit on your full train set.
# Usage (native Ray Tune):

# # Assume X_train, y_train are ready
# # Optional: !pip install "ray[tune]" tune-sklearn imbalanced-learn

# Run the cells above to define run_ray_tune and build_best_model (a plain `import` cannot load a module whose name starts with a digit, such as 04_hpo_ray).
# analysis = run_ray_tune(X_train, y_train, num_samples=30)
# best = analysis.best_config
# final_model = build_best_model(best)
# final_model.fit(X_train, y_train)
# y_pred  = final_model.predict(X_test)
# y_prob  = final_model.predict_proba(X_test)