In [None]:
import numpy as np
import pandas as pd
import warnings
import joblib
import time
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import optuna
import torch

RANDOM_SEED = 0

# Silence library warnings and Optuna's per-trial INFO logs, and seed NumPy's
# global generator.
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)
np.random.seed(RANDOM_SEED)

# =============================================================================
# GPU CONFIGURATION
# =============================================================================
print("="*80)
print("LIGHTGBM HYPERPARAMETER OPTIMIZATION")
print("="*80)


def _lightgbm_gpu_usable():
    """Return True if LightGBM can actually train with ``device='gpu'``.

    PyTorch seeing a CUDA device does not prove that the installed LightGBM
    can use a GPU: LightGBM's ``gpu`` device is a separate backend. A
    one-round fit on a tiny synthetic dataset is the cheapest reliable check.
    A private RandomState is used so the global NumPy seed set above is not
    disturbed.
    """
    rng = np.random.RandomState(RANDOM_SEED)
    probe_features = rng.rand(64, 2)
    probe_target = probe_features[:, 0]
    try:
        lgb.train(
            {'objective': 'regression', 'device': 'gpu', 'verbosity': -1},
            lgb.Dataset(probe_features, label=probe_target),
            num_boost_round=1,
        )
    except lgb.basic.LightGBMError:
        return False
    return True


# USE_GPU is True only when both PyTorch reports a CUDA device and LightGBM
# itself manages to train on the GPU; otherwise every trial would fail.
if not torch.cuda.is_available():
    print("⚠ No GPU detected, using CPU")
    USE_GPU = False
elif _lightgbm_gpu_usable():
    print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
    USE_GPU = True
else:
    print("⚠ GPU detected by PyTorch but LightGBM could not train with device='gpu', using CPU")
    USE_GPU = False

# =============================================================================
# LOAD PREPROCESSED DATA
# =============================================================================
print("\n" + "=" * 80)
print("LOADING PREPROCESSED DATA")
print("=" * 80)

# The four preprocessed artefacts are joblib pickles; load them in one pass.
# `metadata` is loaded alongside the matrices but is not used in this section.
X_train, y_train, X_test, metadata = (
    joblib.load(_artefact_path)
    for _artefact_path in (
        '../../data/preprocessed/X_train.pkl',
        '../../data/preprocessed/y_train.pkl',
        '../../data/preprocessed/X_test.pkl',
        '../../data/preprocessed/metadata.pkl',
    )
)

# Quick sanity report of what was loaded.
for _label, _shape in (
    ("Train", X_train.shape),
    ("Target", y_train.shape),
    ("Test", X_test.shape),
):
    print(f"✓ {_label} shape: {_shape}")
print(f"✓ GPU available: {USE_GPU}")

# =============================================================================
# OPTIMIZATION SETTINGS
# =============================================================================
# Search budget. FAST_MODE=True runs a short 15-trial search; False (the
# current setting) runs the thorough 400-trial search.
FAST_MODE = False
if FAST_MODE:
    N_TRIALS = 15
else:
    N_TRIALS = 400
N_FOLDS = 5        # Cross-validation splits
N_JOBS_OPTUNA = 4  # Parallel trials

_rule = "=" * 60
print("\n" + _rule)
print("OPTIMIZATION SETTINGS")
print(_rule)
print("Mode: " + ("FAST" if FAST_MODE else "THOROUGH"))
print("Trials: {}".format(N_TRIALS))
print("Folds: {}".format(N_FOLDS))
print("Parallel jobs: {}".format(N_JOBS_OPTUNA))

# =============================================================================
# OBJECTIVE FUNCTION
# =============================================================================
def _select_rows(data, row_positions):
    """Select rows by position from a pandas object or a plain array."""
    if hasattr(data, 'iloc'):
        return data.iloc[row_positions]
    return data[row_positions]


def objective_lgb(trial, X, y):
    """Optuna objective: mean K-fold validation RMSE of a LightGBM regressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix; a pandas DataFrame or any array that supports
            positional row indexing (e.g. a numpy ndarray).
        y: Targets; a pandas Series/DataFrame or a positionally indexable
            array.

    Returns:
        The mean RMSE over ``N_FOLDS`` shuffled folds, computed on validation
        predictions clipped to [0, 100].

    Side effects:
        Stores ``best_iterations`` (one int per fold) and
        ``mean_best_iteration`` (float) as trial user attributes. Early
        stopping usually ends training before the sampled ``num_boost_round``
        cap, so these record the tree counts that actually produced the
        returned score.
    """
    # Fixed settings shared by every trial.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'device': 'gpu' if USE_GPU else 'cpu',
        'seed': RANDOM_SEED,
        'force_col_wise': True,  # Faster on CPU
    }

    # GPU-specific parameters
    if USE_GPU:
        params['gpu_use_dp'] = False  # Use single precision on GPU

    # Hyperparameters to optimize
    params.update({
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 5.0),
    })

    # Upper bound on boosting rounds; early stopping may end training sooner.
    num_boost_round = trial.suggest_int('num_boost_round', 100, 3000)

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    best_iterations = []

    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = _select_rows(X, train_idx), _select_rows(X, val_idx)
        y_tr, y_val = _select_rows(y, train_idx), _select_rows(y, val_idx)

        # Create LightGBM datasets; the validation set reuses the training binning.
        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Train with early stopping. Note the same fold drives early stopping
        # and is scored below, so the reported RMSE is slightly optimistic.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,
            valid_sets=[val_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=0)  # Suppress output
            ]
        )

        # Predict and clip to the valid target range.
        preds = np.clip(model.predict(X_val), 0, 100)
        scores.append(np.sqrt(mean_squared_error(y_val, preds)))

        # Fall back to the cap if no best iteration was recorded.
        best_iterations.append(int(model.best_iteration or num_boost_round))

    trial.set_user_attr('best_iterations', best_iterations)
    trial.set_user_attr('mean_best_iteration', float(np.mean(best_iterations)))

    return np.mean(scores)

# =============================================================================
# RUN OPTIMIZATION
# =============================================================================
print(f"\n{'=' * 80}")
print("STARTING BAYESIAN OPTIMIZATION")
print("=" * 80)

start_time = time.time()

# Seeded TPE sampler; the study minimises the objective's return value.
# NOTE(review): with n_jobs > 1 the trial order depends on thread scheduling,
# so the seed alone probably does not make runs reproducible — confirm.
_tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(sampler=_tpe_sampler, direction='minimize')


def _lgb_trial(trial):
    """Bind the training data to the objective so Optuna can call it."""
    return objective_lgb(trial, X_train, y_train)


study.optimize(
    _lgb_trial,
    n_trials=N_TRIALS,
    n_jobs=N_JOBS_OPTUNA,
    show_progress_bar=True,
)

# Wall-clock duration in seconds, measured from just before study creation.
optimization_time = time.time() - start_time

# =============================================================================
# RESULTS
# =============================================================================
print("\n" + "=" * 80)
print("OPTIMIZATION RESULTS")
print("=" * 80)
print("Best RMSE: {:.6f}".format(study.best_value))
print("Optimization time: {:.1f}s".format(optimization_time))
print("\nBest parameters:")
# List the winning hyperparameters alphabetically, names padded to 20 columns.
_winning_params = study.best_params
for param in sorted(_winning_params):
    print(f"  {param:<20}: {_winning_params[param]}")

# =============================================================================
# SAVE RESULTS
# =============================================================================
# Prepare final parameters
# Fixed (non-tuned) LightGBM settings, matching the base parameters used by
# objective_lgb during the search.
best_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'device': 'gpu' if USE_GPU else 'cpu',
    'seed': RANDOM_SEED,
    'force_col_wise': True,
}

if USE_GPU:
    best_params['gpu_use_dp'] = False

best_params.update(study.best_params)

# `num_boost_round` is an argument of lgb.train(), not a model parameter, so
# it is pulled out of the parameter dict.
tuned_round_cap = best_params.pop('num_boost_round')

# The search trains with early stopping, so the tuned value is only an upper
# bound on the trees behind the best CV score. If the best trial carries a
# `mean_best_iteration` user attribute (average early-stopped iteration over
# the folds), use that so the final model matches the scored models;
# otherwise keep the tuned cap.
recorded_rounds = study.best_trial.user_attrs.get('mean_best_iteration')
if recorded_rounds:
    num_boost_round = max(1, min(int(round(recorded_rounds)), tuned_round_cap))
else:
    num_boost_round = tuned_round_cap

joblib.dump(best_params, 'lightgbm_params.pkl')
joblib.dump(num_boost_round, 'lightgbm_num_boost_round.pkl')
print("\n✓ Parameters saved to: lightgbm_params.pkl")
print(f"✓ num_boost_round: {num_boost_round}")
if recorded_rounds:
    print(f"  (mean early-stopped iteration; tuned cap was {tuned_round_cap})")

# Save optimization history: one row per trial. `value` is None for trials
# that did not finish, and `params` is stored as a string.
history = pd.DataFrame({
    'trial': [t.number for t in study.trials],
    'value': [t.value for t in study.trials],
    'params': [str(t.params) for t in study.trials]
})
history.to_csv('lightgbm_history.csv', index=False)
print("✓ History saved to: lightgbm_history.csv")

# Save study object
joblib.dump(study, 'lightgbm_study.pkl')
print("✓ Study saved to: lightgbm_study.pkl")

# Save summary
summary = {
    'model': 'LightGBM',
    'best_rmse': study.best_value,
    'n_trials': N_TRIALS,
    'n_folds': N_FOLDS,
    'optimization_time': optimization_time,
    'use_gpu': USE_GPU,
    'best_params': best_params,
    'num_boost_round': num_boost_round
}
joblib.dump(summary, 'lightgbm_summary.pkl')
print("✓ Summary saved to: lightgbm_summary.pkl")

# =============================================================================
# TRAIN FINAL MODEL ON FULL TRAINING DATA
# =============================================================================
print("\n" + "=" * 80)
print("TRAINING FINAL MODEL ON FULL DATASET")
print("=" * 80)

# Wrap the complete training set; no rows are held out at this stage.
train_data_full = lgb.Dataset(X_train, label=y_train)

print("Training final model with {} rounds...".format(num_boost_round))
# NOTE(review): no valid_sets are passed, so the period=100 logging callback
# may have nothing to report — confirm whether progress output is expected.
_progress_logger = lgb.log_evaluation(period=100)
final_model = lgb.train(
    params=best_params,
    train_set=train_data_full,
    num_boost_round=num_boost_round,
    callbacks=[_progress_logger],
)

# Persist the booster twice: LightGBM's text format and a joblib pickle.
final_model.save_model('lightgbm_final_model.txt')
joblib.dump(final_model, 'lightgbm_final_model.pkl')
print("\n✓ Final model saved to: lightgbm_final_model.txt")
print("✓ Final model saved to: lightgbm_final_model.pkl")

# =============================================================================
# GENERATE PREDICTIONS AND SUBMISSION FILE
# =============================================================================
print("\n" + "=" * 80)
print("GENERATING PREDICTIONS")
print("=" * 80)

# Predict on the test set and clamp to the valid range [0, 100] in one step.
test_predictions = np.clip(final_model.predict(X_test), 0, 100)

# Submission ids start at the number of training rows and increase by one.
# NOTE(review): assumes test ids directly continue the training ids — verify
# against the competition's sample submission.
_first_test_id = len(X_train)
submission = pd.DataFrame({
    'id': range(_first_test_id, _first_test_id + len(test_predictions)),
    'test_score': test_predictions,
})

submission.to_csv('submission.csv', index=False)
print("✓ Submission file saved to: submission.csv")

# Summary statistics of the clipped predictions.
print("\nSubmission statistics:")
print("  Min prediction: {:.2f}".format(test_predictions.min()))
print("  Max prediction: {:.2f}".format(test_predictions.max()))
print("  Mean prediction: {:.2f}".format(test_predictions.mean()))
print("  Median prediction: {:.2f}".format(np.median(test_predictions)))
print("  Std prediction: {:.2f}".format(test_predictions.std()))
print("  Shape: {}".format(submission.shape))

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================
print("\n" + "=" * 80)
print("FEATURE IMPORTANCE")
print("=" * 80)

# Query the booster for both importance types: 'gain' and 'split'.
importance_gain, importance_split = (
    final_model.feature_importance(importance_type=_kind)
    for _kind in ('gain', 'split')
)

# One row per feature, ranked by gain (largest first).
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance_gain': importance_gain,
    'importance_split': importance_split,
})
feature_importance = feature_importance.sort_values('importance_gain', ascending=False)

print("\nTop 10 most important features (by gain):")
_top10 = feature_importance[['feature', 'importance_gain']].head(10)
print(_top10.to_string(index=False))

# Save the full ranking, not just the top 10.
feature_importance.to_csv('lightgbm_feature_importance.csv', index=False)
print("\n✓ Feature importance saved to: lightgbm_feature_importance.csv")

# =============================================================================
# MODEL STATISTICS
# =============================================================================
print("\n" + "=" * 80)
print("MODEL STATISTICS")
print("=" * 80)
# Requested rounds vs. what the trained booster reports, then key hyperparameters.
print("Number of boosting rounds: {}".format(num_boost_round))
print("Number of trees: {}".format(final_model.num_trees()))
print("Number of features: {}".format(final_model.num_feature()))
print("Learning rate: {:.4f}".format(best_params['learning_rate']))
print("Num leaves: {}".format(best_params['num_leaves']))
print("Max depth: {}".format(best_params['max_depth']))

# =============================================================================
# VISUALIZATIONS
# =============================================================================
print("\n" + "=" * 80)
print("GENERATING VISUALIZATIONS")
print("=" * 80)

import matplotlib.pyplot as plt

# Text styling shared by all four panels.
_axis_font = dict(fontsize=11, fontweight='bold')
_title_font = dict(fontsize=12, fontweight='bold')

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax1, ax2), (ax3, ax4) = axes

# Panel 1 (top-left): per-trial RMSE plus the running best, read back from
# the history CSV.
history = pd.read_csv('lightgbm_history.csv')
_trial_numbers = history['trial']
_trial_scores = history['value']
ax1.plot(_trial_numbers, _trial_scores, alpha=0.6, marker='o', markersize=4)
ax1.plot(_trial_numbers, _trial_scores.cummin(),
         linewidth=2.5, label='Best score', color='red')
ax1.set_xlabel('Trial', **_axis_font)
ax1.set_ylabel('RMSE', **_axis_font)
ax1.set_title('Optimization Progress', **_title_font)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2 (top-right): horizontal bars for the 15 highest-gain features,
# largest at the top.
top_features = feature_importance.head(15)
y_pos = np.arange(len(top_features))
ax2.barh(y_pos, top_features['importance_gain'], color='lightgreen', edgecolor='black')
ax2.set_yticks(y_pos)
ax2.set_yticklabels(top_features['feature'], fontsize=9)
ax2.invert_yaxis()
ax2.set_xlabel('Importance (Gain)', **_axis_font)
ax2.set_title('Top 15 Feature Importances', **_title_font)
ax2.grid(True, alpha=0.3, axis='x')

# Panel 3 (bottom-left): split-count vs gain importance for every feature.
_split_importance = feature_importance['importance_split']
_gain_importance = feature_importance['importance_gain']
ax3.scatter(_split_importance, _gain_importance, alpha=0.6, s=50)
ax3.set_xlabel('Importance (Split)', **_axis_font)
ax3.set_ylabel('Importance (Gain)', **_axis_font)
ax3.set_title('Feature Importance: Gain vs Split', **_title_font)
ax3.grid(True, alpha=0.3)

# Annotate the panel with the Pearson correlation of the two measures.
corr = np.corrcoef(_split_importance, _gain_importance)[0, 1]
ax3.text(0.05, 0.95, f'Correlation: {corr:.3f}',
         transform=ax3.transAxes, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Panel 4 (bottom-right): histogram of test predictions with mean and median.
_pred_mean = test_predictions.mean()
_pred_median = np.median(test_predictions)
ax4.hist(test_predictions, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
ax4.axvline(_pred_mean, color='red', linestyle='--',
            linewidth=2, label=f'Mean: {_pred_mean:.2f}')
ax4.axvline(_pred_median, color='green', linestyle='--',
            linewidth=2, label=f'Median: {_pred_median:.2f}')
ax4.set_xlabel('Predicted Score', **_axis_font)
ax4.set_ylabel('Frequency', **_axis_font)
ax4.set_title('Test Predictions Distribution', **_title_font)
ax4.legend()
ax4.grid(True, alpha=0.3, axis='y')

# Write the figure to disk before showing it.
plt.tight_layout()
plt.savefig('lightgbm_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Analysis plot saved to: lightgbm_analysis.png")
plt.show()

# =============================================================================
# FINAL SUMMARY
# =============================================================================
print("\n" + "=" * 80)
print("LIGHTGBM OPTIMIZATION COMPLETE!")
print("=" * 80)
print("✓ Best CV RMSE: {:.6f}".format(study.best_value))
print("✓ Final model trained on {} samples".format(len(X_train)))
print("✓ Model has {} trees".format(final_model.num_trees()))
print("✓ Predictions generated for {} test samples".format(len(X_test)))
print("✓ All results saved")

# Artefacts written by the preceding sections, in the order they are listed.
print("\nFiles created:")
for _artefact in (
    "lightgbm_params.pkl",
    "lightgbm_num_boost_round.pkl",
    "lightgbm_history.csv",
    "lightgbm_study.pkl",
    "lightgbm_summary.pkl",
    "lightgbm_final_model.txt",
    "lightgbm_final_model.pkl",
    "lightgbm_feature_importance.csv",
    "lightgbm_analysis.png",
    "submission.csv",
):
    print("  • " + _artefact)
print("=" * 80)

LIGHTGBM HYPERPARAMETER OPTIMIZATION
⚠ No GPU detected, using CPU

LOADING PREPROCESSED DATA
✓ Train shape: (630000, 9)
✓ Target shape: (630000,)
✓ GPU available: False

OPTIMIZATION SETTINGS
Mode: FAST
Trials: 20
Folds: 4

STARTING BAYESIAN OPTIMIZATION


  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[993]	valid_0's rmse: 8.82734
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 8.83552
[W 2026-01-10 15:47:42,096] Trial 0 failed with parameters: {'learning_rate': 0.04370861069626263} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/lorenzo/Documenti/datascience_projects/students-scores/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_10565/1394054503.py", line 122, in <lambda>
    lambda trial: objective_lgb(trial, X_train, y_train),
                  ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_10565/1394054503.py", line 101, in objective_lgb
    preds = np.clip(model.predict(X_val), 0, 100)
                    ~~~~~~~~~~~~~^^^^^^^


KeyboardInterrupt: 