In [None]:
import numpy as np
import pandas as pd
import warnings
import joblib
import time
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import optuna
import torch

# One seed reused for NumPy here and for the CV splitter, booster and sampler below.
RANDOM_SEED = 0

np.random.seed(RANDOM_SEED)
# Keep the console quiet: only Optuna warnings and above, no Python warnings.
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore")

# =============================================================================
# GPU CONFIGURATION
# =============================================================================
print(
    "=" * 80,
    "XGBOOST HYPERPARAMETER OPTIMIZATION (Native API)",
    "=" * 80,
    sep="\n",
)

# Derive both the flag and the XGBoost device string from a single CUDA check.
USE_GPU = torch.cuda.is_available()
DEVICE = 'cuda:0' if USE_GPU else 'cpu'

if USE_GPU:
    print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
else:
    print("⚠ No GPU detected, using CPU")

# =============================================================================
# LOAD PREPROCESSED DATA
# =============================================================================
print("\n" + "=" * 80, "LOADING PREPROCESSED DATA", "=" * 80, sep="\n")

# NOTE: joblib.load unpickles its input, so these files must come from a
# trusted preprocessing step.
X_train, y_train, X_test, metadata = (
    joblib.load(f'../../data/preprocessed/{stem}.pkl')
    for stem in ('X_train', 'y_train', 'X_test', 'metadata')
)

# Quick sanity report of what was loaded and where training will run.
print(
    f"✓ Train shape: {X_train.shape}",
    f"✓ Target shape: {y_train.shape}",
    f"✓ Test shape: {X_test.shape}",
    f"✓ Device: {DEVICE}",
    sep="\n",
)

# =============================================================================
# OPTIMIZATION SETTINGS
# =============================================================================
# FAST_MODE=True runs a single trial; False runs the full 400-trial search.
FAST_MODE = False
N_FOLDS = 5
N_TRIALS = 400 if not FAST_MODE else 1

print("\n" + "=" * 60, "OPTIMIZATION SETTINGS", "=" * 60, sep="\n")
print(
    f"Mode: {'THOROUGH' if not FAST_MODE else 'FAST'}",
    f"Trials: {N_TRIALS}",
    f"Folds: {N_FOLDS}",
    sep="\n",
)

# =============================================================================
# OBJECTIVE FUNCTION
# =============================================================================
def objective_xgb(trial, X, y):
    """Score one Optuna trial: mean K-fold RMSE of a native-API XGBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; must support positional ``.iloc`` row selection.
        y: Targets aligned row-for-row with ``X``. pandas objects are sliced
            with ``.iloc``; anything else (e.g. a NumPy array) is indexed
            directly.

    Returns:
        Mean validation RMSE over ``N_FOLDS`` shuffled folds. Predictions are
        taken from each fold's best early-stopping iteration and clipped to
        ``[0, 100]`` before scoring.
    """
    # Fixed booster settings. `device` alone selects CPU/GPU execution; the
    # separate `predictor` switch is deprecated in XGBoost releases that
    # accept `device`, so it is not passed.
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': DEVICE,
        'seed': RANDOM_SEED,
    }

    # Tuned booster parameters. Only keys understood by xgb.train belong here:
    # `n_estimators` is a scikit-learn-wrapper name and would be ignored.
    params.update({
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 50),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1, 50, log=True),
        # A log-scaled range needs a strictly positive lower bound; with
        # low=0 Optuna raises ValueError and the trial never runs.
        'alpha': trial.suggest_float('alpha', 1e-3, 20, log=True),
    })

    # The boosting budget is an argument of xgb.train, not a booster
    # parameter, so it is sampled exactly once and kept out of `params`.
    # Early stopping below usually ends training before this cap is reached.
    num_boost_round = trial.suggest_int('num_boost_round', 3000, 15000)

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    y_is_pandas = hasattr(y, 'iloc')
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]

        # KFold yields positional indices, so pandas targets need .iloc.
        if y_is_pandas:
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        else:
            y_tr, y_val = y[train_idx], y[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dval, 'eval')],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        # xgb.train returns the booster from the last round, not the best
        # one, so restrict prediction to the trees up to the best iteration.
        raw_preds = model.predict(
            dval, iteration_range=(0, model.best_iteration + 1)
        )
        preds = np.clip(raw_preds, 0, 100)
        scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    return float(np.mean(scores))

# =============================================================================
# RUN OPTIMIZATION
# =============================================================================
print("\n" + "=" * 80, "STARTING BAYESIAN OPTIMIZATION", "=" * 80, sep="\n")

start_time = time.time()

# Seeded TPE sampler; the study minimizes the value returned by the objective.
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(sampler=tpe_sampler, direction='minimize')

# Trials run sequentially (n_jobs=1) against the full training set.
study.optimize(
    lambda t: objective_xgb(t, X_train, y_train),
    n_jobs=1,
    show_progress_bar=True,
    n_trials=N_TRIALS,
)

# Wall-clock duration of the whole search, in seconds.
optimization_time = time.time() - start_time

# =============================================================================
# RESULTS
# =============================================================================
print("\n" + "=" * 80, "OPTIMIZATION RESULTS", "=" * 80, sep="\n")
print(
    f"Best RMSE: {study.best_value:.6f}",
    f"Optimization time: {optimization_time:.1f}s",
    "\nBest parameters:",
    sep="\n",
)
# List the winning hyperparameters alphabetically by name.
for param, value in sorted(study.best_params.items(), key=lambda item: item[0]):
    print(f"  {param:20s}: {value}")

# =============================================================================
# SAVE RESULTS
# =============================================================================
# Final native-API booster parameters: fixed settings plus the tuned values.
# `device` alone selects CPU/GPU execution; the separate `predictor` switch is
# deprecated in XGBoost releases that accept `device`, so it is not stored.
best_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': DEVICE,
    'seed': RANDOM_SEED,
    **study.best_params,
}

# num_boost_round is an argument of xgb.train, not a booster parameter.
num_boost_round = best_params.pop('num_boost_round')

# n_estimators is a scikit-learn-wrapper name that xgb.train does not use;
# drop it (if the search sampled it) so the saved dict holds only real
# booster parameters.
best_params.pop('n_estimators', None)

joblib.dump(best_params, 'xgboost_params.pkl')
joblib.dump(num_boost_round, 'xgboost_num_boost_round.pkl')
print("\n✓ Parameters saved to: xgboost_params.pkl")
print(f"✓ num_boost_round: {num_boost_round}")

# Save optimization history: one row per trial with its score and the
# sampled parameters rendered as a string.
history = pd.DataFrame({
    'trial': [t.number for t in study.trials],
    'value': [t.value for t in study.trials],
    'params': [str(t.params) for t in study.trials],
})
history.to_csv('xgboost_history.csv', index=False)
print("✓ History saved to: xgboost_history.csv")

# Save the study object itself so the search can be inspected later.
joblib.dump(study, 'xgboost_study.pkl')
print("✓ Study saved to: xgboost_study.pkl")

# Save a compact summary of the run.
summary = {
    'model': 'XGBoost (Native API)',
    'best_rmse': study.best_value,
    'n_trials': N_TRIALS,
    'n_folds': N_FOLDS,
    'optimization_time': optimization_time,
    'device': DEVICE,
    'use_gpu': USE_GPU,
    'best_params': best_params,
    'num_boost_round': num_boost_round,
}
joblib.dump(summary, 'xgboost_summary.pkl')
print("✓ Summary saved to: xgboost_summary.pkl")

# =============================================================================
# TRAIN FINAL MODEL ON FULL TRAINING DATA
# =============================================================================
print("\n" + "="*80)
print("TRAINING FINAL MODEL ON FULL DATASET")
print("="*80)

# Create DMatrix for full training data
dtrain_full = xgb.DMatrix(X_train, label=y_train)

# NOTE(review): this trains for the full num_boost_round with no early
# stopping, whereas the cross-validation objective stops after 50 rounds
# without improvement — confirm the round count is appropriate.
print(f"Training final model with {num_boost_round} rounds...")
final_model = xgb.train(
    best_params,
    dtrain_full,
    num_boost_round=num_boost_round,
    # verbose_eval only prints when at least one evaluation set is supplied,
    # so the training data is monitored to get a progress line every 100
    # rounds. Without early stopping this does not affect the fitted model.
    evals=[(dtrain_full, 'train')],
    verbose_eval=100,
)

# Save final model in multiple formats: XGBoost's JSON format and a joblib
# pickle of the Booster object.
final_model.save_model('xgboost_final_model.json')
joblib.dump(final_model, 'xgboost_final_model.pkl')
print("\n✓ Final model saved to: xgboost_final_model.json")
print("✓ Final model saved to: xgboost_final_model.pkl")

# =============================================================================
# GENERATE PREDICTIONS AND SUBMISSION FILE
# =============================================================================
print("\n" + "=" * 80, "GENERATING PREDICTIONS", "=" * 80, sep="\n")

# Score the test set and clamp the output to the valid [0, 100] range.
dtest = xgb.DMatrix(X_test)
test_predictions = np.clip(final_model.predict(dtest), 0, 100)

# Submission ids continue right after the training rows.
# NOTE(review): presumably the test ids follow the train ids contiguously —
# verify against the raw data.
submission = pd.DataFrame({
    'id': range(len(X_train), len(X_train) + len(test_predictions)),
    'exam_score': test_predictions,
})
submission.to_csv('submission.csv', index=False)
print("✓ Submission file saved to: submission.csv")

# Quick distribution check of the submitted predictions.
print(
    "\nSubmission statistics:",
    f"  Min prediction: {test_predictions.min():.2f}",
    f"  Max prediction: {test_predictions.max():.2f}",
    f"  Mean prediction: {test_predictions.mean():.2f}",
    f"  Median prediction: {np.median(test_predictions):.2f}",
    f"  Std prediction: {test_predictions.std():.2f}",
    f"  Shape: {submission.shape}",
    sep="\n",
)

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================
print("\n" + "=" * 80, "FEATURE IMPORTANCE", "=" * 80, sep="\n")

# Gain-based importance as reported by the trained booster.
importance_dict = final_model.get_score(importance_type='gain')

if not importance_dict:
    print("⚠ No feature importance available (model might have no splits)")
else:
    # One row per reported feature, highest gain first.
    feature_importance = pd.DataFrame(
        list(importance_dict.items()),
        columns=['feature', 'importance'],
    ).sort_values(by='importance', ascending=False)

    print("\nTop 10 most important features:")
    print(feature_importance.head(10).to_string(index=False))

    feature_importance.to_csv('xgboost_feature_importance.csv', index=False)
    print("\n✓ Feature importance saved to: xgboost_feature_importance.csv")

# =============================================================================
# FINAL SUMMARY
# =============================================================================
print("\n" + "=" * 80, "XGBOOST OPTIMIZATION COMPLETE!", "=" * 80, sep="\n")
print(
    f"✓ Best CV RMSE: {study.best_value:.6f}",
    f"✓ Final model trained on {len(X_train)} samples",
    f"✓ Model has {num_boost_round} trees",
    f"✓ Predictions generated for {len(X_test)} test samples",
    "✓ All results saved",
    sep="\n",
)

# Artifacts written by the steps above, in the order they were produced.
print("\nFiles created:")
print(
    *(
        f"  • {artifact}"
        for artifact in (
            "xgboost_params.pkl",
            "xgboost_num_boost_round.pkl",
            "xgboost_history.csv",
            "xgboost_study.pkl",
            "xgboost_summary.pkl",
            "xgboost_final_model.json",
            "xgboost_final_model.pkl",
            "xgboost_feature_importance.csv",
            "submission.csv",
        )
    ),
    sep="\n",
)
print("=" * 80)