In [None]:
import numpy as np
import pandas as pd
import warnings
import json
import joblib
import time
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import optuna

# Suppress every Python warning and keep Optuna's logger at WARNING level so
# the console shows only this script's own progress output.
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed NumPy's global random generator.
np.random.seed(42)

# =============================================================================
# CONFIGURATION
# =============================================================================
# Title banner: rule, heading, rule -- one item per line.
print("=" * 80, "SVR HYPERPARAMETER OPTIMIZATION", "=" * 80, sep="\n")

# =============================================================================
# LOAD PREPROCESSED DATA
# =============================================================================
print("\n" + "=" * 80, "LOADING PREPROCESSED DATA", "=" * 80, sep="\n")

# NOTE: joblib.load unpickles the file contents, so these artifacts must come
# from a trusted preprocessing run.
# Training features.
X_train = joblib.load('../../data/preprocessed/X_train.pkl')
# Training targets.
y_train = joblib.load('../../data/preprocessed/y_train.pkl')
# Preprocessing metadata; loaded here but not read anywhere in this cell.
metadata = joblib.load('../../data/preprocessed/metadata.pkl')

# Report the shapes as a quick sanity check that features and targets line up.
print(f"✓ Train shape: {X_train.shape}")
print(f"✓ Target shape: {y_train.shape}")

# =============================================================================
# OPTIMIZATION SETTINGS
# =============================================================================
# FAST_MODE trades search depth for run time; set it to False for the more
# thorough search.
FAST_MODE = True
N_FOLDS = 5
if FAST_MODE:
    N_TRIALS = 5
else:
    N_TRIALS = 20

# Echo the effective settings so the run log is self-describing.
print("\n" + "=" * 60, "OPTIMIZATION SETTINGS", "=" * 60, sep="\n")
print(f"Mode: {'FAST' if FAST_MODE else 'THOROUGH'}")
print(f"Trials: {N_TRIALS}")
print(f"Folds: {N_FOLDS}")

# =============================================================================
# OBJECTIVE FUNCTION
# =============================================================================
def objective_svr(trial, X, y, *, max_train_samples=50000,
                  clip_range=(0, 100), seed=42):
    """Score one Optuna trial: mean K-fold validation RMSE of an RBF-kernel SVR.

    Hyperparameters sampled from ``trial``:
        C       -- log-uniform in [1, 100]
        epsilon -- uniform in [0.01, 1.0]
        gamma   -- one of 'scale' / 'auto'

    For each of the ``N_FOLDS`` shuffled folds, the features are standardized
    with statistics taken from the training part only, the SVR is fitted on at
    most ``max_train_samples`` training rows, and the RMSE is measured on the
    whole validation part after clipping the predictions to ``clip_range``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix (DataFrame or 2-D array-like), one row per sample.
        y: Targets aligned row-for-row with ``X`` (Series or 1-D array-like).
        max_train_samples: Cap on the rows used to fit the SVR in each fold;
            ``None`` disables subsampling.
        clip_range: ``(low, high)`` bounds applied to predictions before
            scoring. NOTE(review): the 0-100 default presumably mirrors the
            valid range of the target -- confirm.
        seed: Integer seed for both the fold shuffle and the per-fold
            subsample.

    Returns:
        float: Mean validation RMSE across the folds (lower is better).
    """
    C = trial.suggest_float('C', 1, 100, log=True)
    epsilon = trial.suggest_float('epsilon', 0.01, 1.0)
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    # Work on plain NumPy arrays so every index below is positional. Indexing
    # a pandas Series with `[]` is label-based, which breaks as soon as the
    # fold-filtered targets are indexed again with subsample positions.
    X_values = np.asarray(X)
    y_values = np.asarray(y)
    clip_low, clip_high = clip_range

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_values)):
        # Fit the scaler on the full training part of the fold only, so no
        # validation statistics leak into the model.
        scaler = StandardScaler()
        X_tr_scaled = scaler.fit_transform(X_values[train_idx])
        X_val_scaled = scaler.transform(X_values[val_idx])
        y_tr, y_val = y_values[train_idx], y_values[val_idx]

        # Subsample for speed (SVR is slow on large datasets). The generator
        # is local and seeded per fold: every trial then trains on the same
        # rows, so score differences reflect the hyperparameters rather than
        # sampling noise, and the draw does not depend on the shared global
        # NumPy RNG state when trials run in parallel threads.
        if max_train_samples is not None and len(train_idx) > max_train_samples:
            rng = np.random.default_rng([seed, fold_idx])
            keep = rng.choice(len(train_idx), size=max_train_samples, replace=False)
            X_tr_scaled, y_tr = X_tr_scaled[keep], y_tr[keep]

        model = SVR(C=C, epsilon=epsilon, gamma=gamma, kernel='rbf')
        model.fit(X_tr_scaled, y_tr)

        preds = np.clip(model.predict(X_val_scaled), clip_low, clip_high)
        scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    return float(np.mean(scores))

# =============================================================================
# RUN OPTIMIZATION
# =============================================================================
print("\n" + "=" * 80, "STARTING BAYESIAN OPTIMIZATION", "=" * 80, sep="\n")
print("Note: Using data sampling for speed (SVR is computationally intensive)")

# Wall-clock timer covering study creation and the whole search.
start_time = time.time()

# Minimize the objective (mean cross-validated RMSE) with a seeded TPE sampler.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    direction='minimize',
)

# NOTE(review): with n_jobs=2 the trials run concurrently, so the seeded
# sampler alone does not make the search exactly repeatable from run to run;
# use n_jobs=1 if bit-for-bit reproducibility matters.
study.optimize(
    lambda trial: objective_svr(trial, X_train, y_train),
    n_trials=N_TRIALS,
    n_jobs=2,
    show_progress_bar=True,
)

optimization_time = time.time() - start_time

# =============================================================================
# RESULTS
# =============================================================================
print("\n" + "=" * 80, "OPTIMIZATION RESULTS", "=" * 80, sep="\n")

# Headline numbers: best objective value found and how long the search took.
print(f"Best RMSE: {study.best_value:.6f}")
print(f"Optimization time: {optimization_time:.1f}s")

# One line per tuned hyperparameter, names padded to 20 columns for alignment.
print("\nBest parameters:")
for param, value in study.best_params.items():
    print(f"  {param:20s}: {value}")

# =============================================================================
# SAVE RESULTS
# =============================================================================

# Save best parameters
# The kernel is fixed to 'rbf' inside the objective rather than tuned, so it is
# added here to make the saved dict a complete SVR configuration.
best_params = {**study.best_params, 'kernel': 'rbf'}

joblib.dump(best_params, './svr_params.pkl')
print("\n✓ Parameters saved to: ./svr_params.pkl")

# Save optimization history
# One row per trial: its number, objective value, and the sampled parameters
# rendered as a string.
trial_log = study.trials
history = pd.DataFrame({
    'trial': [t.number for t in trial_log],
    'value': [t.value for t in trial_log],
    'params': [str(t.params) for t in trial_log],
})
history.to_csv('./svr_history.csv', index=False)
print("✓ History saved to: ./svr_history.csv")

# Save study object
joblib.dump(study, './svr_study.pkl')
print("✓ Study saved to: ./svr_study.pkl")

# Save summary
# Compact record of the run: settings, timing, and the winning configuration.
summary = {
    'model': 'SVR',
    'best_rmse': study.best_value,
    'n_trials': N_TRIALS,
    'n_folds': N_FOLDS,
    'optimization_time': optimization_time,
    'best_params': best_params,
}
joblib.dump(summary, './svr_summary.pkl')
print("✓ Summary saved to: ./svr_summary.pkl")

# Closing banner with the headline result.
print("\n" + "=" * 80, "SVR OPTIMIZATION COMPLETE!", "=" * 80, sep="\n")
print(f"✓ Best CV RMSE: {study.best_value:.6f}")
print("✓ All results saved in './'")

SVR HYPERPARAMETER OPTIMIZATION

LOADING PREPROCESSED DATA
✓ Train shape: (630000, 9)
✓ Target shape: (630000,)

OPTIMIZATION SETTINGS
Mode: FAST
Trials: 5
Folds: 5

STARTING BAYESIAN OPTIMIZATION
Note: Using data sampling for speed (SVR is computationally intensive)


  0%|          | 0/5 [00:00<?, ?it/s]