In [None]:
import numpy as np
import pandas as pd
import warnings
import json
import joblib
import time
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import optuna
import torch

# One seed reused below by the K-fold splitter, the Optuna sampler and CatBoost;
# NumPy's global generator is seeded with it as well.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Keep the cell output compact: Optuna only logs WARNING and above, and all
# Python warnings are suppressed for the rest of the run.
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore")

# =============================================================================
# GPU CONFIGURATION
# =============================================================================
print("=" * 80, "CATBOOST HYPERPARAMETER OPTIMIZATION", "=" * 80, sep="\n")

# USE_GPU is read later when the CatBoost parameter dictionaries are built.
# NOTE(review): CUDA visibility is checked through PyTorch, presumably as a
# proxy for CatBoost GPU support -- confirm the two agree in this environment.
USE_GPU = torch.cuda.is_available()
if not USE_GPU:
    print("⚠ No GPU detected, using CPU")
else:
    print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")

# =============================================================================
# LOAD PREPROCESSED DATA
# =============================================================================
print("\n" + "=" * 80, "LOADING PREPROCESSED DATA", "=" * 80, sep="\n")

# Restore the joblib-pickled artifacts from the preprocessed data directory.
X_train = joblib.load('../../data/preprocessed/X_train.pkl')
y_train = joblib.load('../../data/preprocessed/y_train.pkl')
X_test = joblib.load('../../data/preprocessed/X_test.pkl')
metadata = joblib.load('../../data/preprocessed/metadata.pkl')

# Quick sanity report of what was loaded and which device will be used.
print(
    f"✓ Train shape: {X_train.shape}",
    f"✓ Target shape: {y_train.shape}",
    f"✓ Test shape: {X_test.shape}",
    f"✓ GPU available: {USE_GPU}",
    sep="\n",
)

# =============================================================================
# OPTIMIZATION SETTINGS
# =============================================================================
# FAST_MODE runs a single trial; switching it off runs the 400-trial search.
FAST_MODE = True  # Set to False for more thorough search
if FAST_MODE:
    N_TRIALS = 1
else:
    N_TRIALS = 400
N_FOLDS = 5  # number of K-fold splits used to score each trial

print(f"\n{'=' * 60}", "OPTIMIZATION SETTINGS", f"{'=' * 60}", sep="\n")
print(
    f"Mode: {'FAST' if FAST_MODE else 'THOROUGH'}",
    f"Trials: {N_TRIALS}",
    f"Folds: {N_FOLDS}",
    sep="\n",
)

# =============================================================================
# OBJECTIVE FUNCTION
# =============================================================================
def _take_rows(data, positions):
    """Return the rows of *data* at integer *positions*, ignoring index labels.

    pandas objects are sliced with ``.iloc`` so that a non-default index
    (for example after filtering or shuffling) cannot misalign the rows;
    anything else (e.g. a NumPy array) is indexed directly.
    """
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


def objective_cat(trial, X, y):
    """Return the mean K-fold validation RMSE of one sampled CatBoost config.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table (pandas DataFrame or NumPy array).
        y: Targets aligned row-for-row with ``X`` (pandas Series or NumPy array).

    Returns:
        Mean RMSE over the ``N_FOLDS`` validation folds, computed on
        predictions clipped to the [0, 100] range.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 2),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'task_type': 'GPU' if USE_GPU else 'CPU',
        'devices': '0' if USE_GPU else None,
        'verbose': False,
        'random_seed': RANDOM_SEED
    }

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    scores = []

    for train_idx, val_idx in kf.split(X):
        # KFold yields positional indices, so both X and y must be sliced by
        # position; plain y[train_idx] on a Series would look up labels.
        X_tr, X_val = _take_rows(X, train_idx), _take_rows(X, val_idx)
        y_tr, y_val = _take_rows(y, train_idx), _take_rows(y, val_idx)

        model = CatBoostRegressor(**params)
        # The validation fold doubles as the early-stopping set for this fit.
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)

        preds = np.clip(model.predict(X_val), 0, 100)
        scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    return np.mean(scores)

# =============================================================================
# RUN OPTIMIZATION
# =============================================================================
print("\n" + "=" * 80, "STARTING BAYESIAN OPTIMIZATION", "=" * 80, sep="\n")

start_time = time.time()

# Minimisation study driven by a TPE sampler seeded with RANDOM_SEED.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    direction='minimize',
)


def _score_trial(trial):
    """Evaluate one trial on the training data loaded above."""
    return objective_cat(trial, X_train, y_train)


# Trials run sequentially (n_jobs=1) with a progress bar.
study.optimize(
    _score_trial,
    n_trials=N_TRIALS,
    n_jobs=1,
    show_progress_bar=True,
)

# Wall-clock seconds spent creating the study and running the search.
optimization_time = time.time() - start_time

# =============================================================================
# RESULTS
# =============================================================================
print("\n" + "=" * 80, "OPTIMIZATION RESULTS", "=" * 80, sep="\n")
print(
    f"Best RMSE: {study.best_value:.6f}",
    f"Optimization time: {optimization_time:.1f}s",
    "\nBest parameters:",
    sep="\n",
)
# One line per tuned hyperparameter, names left-aligned in a 20-character column.
for param, value in study.best_params.items():
    print(f"  {param:20s}: {value}")

# =============================================================================
# SAVE RESULTS
# =============================================================================
# Save best parameters
best_params = {
    **study.best_params,
    # Fixed settings that were not part of the search space.
    'task_type': 'GPU' if USE_GPU else 'CPU',
    'devices': '0' if USE_GPU else None,
    'verbose': False,
    'random_seed': RANDOM_SEED,
}
# Entries left as None ('devices' when running on CPU) are dropped before saving.
best_params = {key: val for key, val in best_params.items() if val is not None}

joblib.dump(best_params, 'catboost_params.pkl')
print("\n✓ Parameters saved to: catboost_params.pkl")

# Per-trial history: trial number, objective value and stringified parameters.
all_trials = study.trials
history = pd.DataFrame({
    'trial': [t.number for t in all_trials],
    'value': [t.value for t in all_trials],
    'params': [str(t.params) for t in all_trials]
})
history.to_csv('catboost_history.csv', index=False)
print("✓ History saved to: catboost_history.csv")

# The full Optuna study object is pickled as well.
joblib.dump(study, 'catboost_study.pkl')
print("✓ Study saved to: catboost_study.pkl")

# Compact summary of the run.
summary = dict(
    model='CatBoost',
    best_rmse=study.best_value,
    n_trials=N_TRIALS,
    n_folds=N_FOLDS,
    optimization_time=optimization_time,
    use_gpu=USE_GPU,
    best_params=best_params,
)
joblib.dump(summary, 'catboost_summary.pkl')
print("✓ Summary saved to: catboost_summary.pkl")

# =============================================================================
# TRAIN FINAL MODEL ON FULL TRAINING DATA
# =============================================================================
print("\n" + "=" * 80, "TRAINING FINAL MODEL ON FULL DATASET", "=" * 80, sep="\n")

# NOTE(review): the cross-validation fits in objective_cat use an eval_set with
# early_stopping_rounds=50, while this fit has no eval_set and runs every tuned
# iteration -- confirm the reported CV RMSE is still representative of this model.
final_model = CatBoostRegressor(**best_params)
# Replace the saved verbose=False so progress is logged every 100 iterations.
final_model.set_params(verbose=100)

print("Training final model...")
final_model.fit(X_train, y_train)

# Persist the fitted estimator with joblib.
joblib.dump(final_model, 'catboost_final_model.pkl')
print("\n✓ Final model saved to: catboost_final_model.pkl")

# =============================================================================
# GENERATE PREDICTIONS AND SUBMISSION FILE
# =============================================================================
print("\n" + "=" * 80, "GENERATING PREDICTIONS", "=" * 80, sep="\n")

# Predict on the test features and clamp the result into [0, 100] in one step.
test_predictions = np.clip(final_model.predict(X_test), 0, 100)

# NOTE(review): 'id' is a 0..n-1 positional counter; the loaded `metadata` is
# not consulted here -- confirm these ids match what the submission expects.
submission = pd.DataFrame({
    'id': range(len(test_predictions)),
    'exam_score': test_predictions
})

submission.to_csv('submission.csv', index=False)
print("✓ Submission file saved to: submission.csv")

# Summary statistics of the clipped predictions.
print(
    "\nSubmission statistics:",
    f"  Min prediction: {test_predictions.min():.2f}",
    f"  Max prediction: {test_predictions.max():.2f}",
    f"  Mean prediction: {test_predictions.mean():.2f}",
    f"  Median prediction: {np.median(test_predictions):.2f}",
    f"  Shape: {submission.shape}",
    sep="\n",
)

# =============================================================================
# FINAL SUMMARY
# =============================================================================
print("\n" + "=" * 80, "CATBOOST OPTIMIZATION COMPLETE!", "=" * 80, sep="\n")

# Recap of the headline numbers from this run.
print(
    f"✓ Best CV RMSE: {study.best_value:.6f}",
    f"✓ Final model trained on {len(X_train)} samples",
    f"✓ Predictions generated for {len(X_test)} test samples",
    "✓ All results saved",
    sep="\n",
)

# Artifacts written to the working directory by the steps above.
print(
    "\nFiles created:",
    "  • catboost_params.pkl",
    "  • catboost_history.csv",
    "  • catboost_study.pkl",
    "  • catboost_summary.pkl",
    "  • catboost_final_model.pkl",
    "  • submission.csv",
    "=" * 80,
    sep="\n",
)

CATBOOST HYPERPARAMETER OPTIMIZATION
⚠ No GPU detected, using CPU

LOADING PREPROCESSED DATA
✓ Train shape: (100, 8)
✓ Target shape: (100,)
✓ Test shape: (10, 8)
✓ GPU available: False

OPTIMIZATION SETTINGS
Mode: FAST
Trials: 1
Folds: 5

STARTING BAYESIAN OPTIMIZATION


  0%|          | 0/1 [00:00<?, ?it/s]


OPTIMIZATION RESULTS
Best RMSE: 9.860726
Optimization time: 1.5s

Best parameters:
  iterations          : 2098
  learning_rate       : 0.07436704297351776
  depth               : 8
  l2_leaf_reg         : 5.903948646972072
  bagging_temperature : 0.4236547993389047
  random_strength     : 1.2917882261333122
  border_count        : 130

✓ Parameters saved to: catboost_params.pkl
✓ History saved to: catboost_history.csv
✓ Study saved to: catboost_study.pkl
✓ Summary saved to: catboost_summary.pkl

TRAINING FINAL MODEL ON FULL DATASET
Training final model...
0:	learn: 19.3185983	total: 1.32ms	remaining: 2.76s
100:	learn: 4.9385701	total: 123ms	remaining: 2.43s
200:	learn: 3.1253466	total: 257ms	remaining: 2.43s
300:	learn: 1.8445661	total: 411ms	remaining: 2.45s
400:	learn: 1.0950255	total: 597ms	remaining: 2.52s
500:	learn: 0.6493956	total: 716ms	remaining: 2.28s
600:	learn: 0.3951759	total: 869ms	remaining: 2.16s
700:	learn: 0.2446856	total: 1000ms	remaining: 1.99s
800:	learn: 0.14822