In [2]:
import numpy as np
import pandas as pd
import warnings
import joblib
import time
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
import optuna

# One seed for the whole script; it is applied to NumPy's global RNG here.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Keep the cell output compact: drop every Python warning and restrict Optuna
# logging to WARNING and above.
warnings.filterwarnings(action="ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

# =============================================================================
# CONFIGURATION
# =============================================================================
# Title banner: rule, title, rule -- one per line.
print(
    "=" * 80,
    "ELASTICNET REGRESSION HYPERPARAMETER OPTIMIZATION",
    "=" * 80,
    sep="\n",
)

# =============================================================================
# LOAD PREPROCESSED DATA
# =============================================================================
print(f"\n{'=' * 80}")
print("LOADING PREPROCESSED DATA")
print("=" * 80)

# Load the four pickled artefacts in a fixed order and bind them in one step.
X_train, y_train, X_test, metadata = map(
    joblib.load,
    (
        '../../data/preprocessed/X_train.pkl',
        '../../data/preprocessed/y_train.pkl',
        '../../data/preprocessed/X_test.pkl',
        '../../data/preprocessed/metadata.pkl',
    ),
)

# Quick sanity check of what was loaded.
print(
    f"✓ Train shape: {X_train.shape}",
    f"✓ Target shape: {y_train.shape}",
    f"✓ Test shape: {X_test.shape}",
    sep="\n",
)

# =============================================================================
# OPTIMIZATION SETTINGS
# =============================================================================
# Search budget: FAST_MODE=True runs 15 trials, FAST_MODE=False runs 100.
FAST_MODE = False
N_TRIALS = 100
if FAST_MODE:
    N_TRIALS = 15
N_FOLDS = 5  # number of cross-validation folds per trial

print("\n" + "=" * 60)
print("OPTIMIZATION SETTINGS")
print("=" * 60)
print("Mode: " + ("FAST" if FAST_MODE else "THOROUGH"))
print(f"Trials: {N_TRIALS}")
print(f"Folds: {N_FOLDS}")

# =============================================================================
# OBJECTIVE FUNCTION
# =============================================================================
def _take_rows(data, positions):
    """Return the rows of ``data`` at the integer ``positions``."""
    # pandas objects need positional ``.iloc``; plain ``[]`` on them is
    # label-based. Anything else (e.g. a NumPy array) is indexed directly.
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def objective_elasticnet(trial, X, y):
    """Optuna objective: mean K-fold RMSE of an ElasticNet for one trial.

    Samples ``alpha``, ``l1_ratio``, ``fit_intercept``, ``max_iter``, ``tol``
    and ``selection`` from ``trial``, fits one model per fold of a shuffled
    ``KFold`` (``N_FOLDS`` splits, seeded with ``RANDOM_SEED``), clips the
    validation predictions to [0, 100] and scores them with RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix; a pandas object or an array indexable by an array
            of integer row positions.
        y: Target values; a pandas object or an array indexable the same way.

    Returns:
        The mean RMSE over all folds, or ``1e6`` as a penalty when fitting or
        scoring any fold raises an exception.
    """
    failure_penalty = 1e6

    # Hyperparameters to optimize
    params = {
        'alpha': trial.suggest_float('alpha', 1e-3, 1e2, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        'max_iter': trial.suggest_int('max_iter', 1000, 10000),
        'tol': trial.suggest_float('tol', 1e-5, 1e-2, log=True),
        'selection': trial.suggest_categorical('selection', ['cyclic', 'random']),
        'random_state': RANDOM_SEED,
    }

    start_time = time.time()

    print(f"\n{'='*60}")
    print(f"Trial {trial.number}")
    print(f"{'='*60}")
    print(f"Parametri testati: {params}")

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X)):
        # Same row-selection rule for features and target, so both pandas
        # and NumPy inputs are accepted.
        X_tr, X_val = _take_rows(X, train_idx), _take_rows(X, val_idx)
        y_tr, y_val = _take_rows(y, train_idx), _take_rows(y, val_idx)

        model = ElasticNet(**params)

        try:
            model.fit(X_tr, y_tr)
            preds = np.clip(model.predict(X_val), 0, 100)
            rmse = np.sqrt(mean_squared_error(y_val, preds))
        except Exception as exc:
            # Best-effort: a failing configuration must not abort the study,
            # so the trial gets a large penalty -- but the reason is reported
            # instead of being dropped silently.
            # NOTE: scikit-learn signals non-convergence with a
            # ConvergenceWarning, not an exception, so that case does not
            # reach this handler.
            print(
                f"  Fold {fold_idx + 1}: FAILED "
                f"({type(exc).__name__}: {exc}) -> penalty {failure_penalty:g}"
            )
            return failure_penalty

        scores.append(rmse)
        print(f"  Fold {fold_idx + 1}: RMSE = {rmse:.4f}")

    optimization_time = time.time() - start_time
    mean_score = np.mean(scores)
    print(f"RMSE medio: {mean_score:.4f}")
    print(f"Elapsed Time: {optimization_time:.1f}s")
    print(f"{'='*60}\n")

    return mean_score

# =============================================================================
# RUN OPTIMIZATION
# =============================================================================
print(f"\n{'=' * 80}")
print("STARTING BAYESIAN OPTIMIZATION")
print("=" * 80)

# Wall-clock timer for the whole search (study creation included).
start_time = time.time()

# Seeded TPE sampler; the study minimizes the value returned by the objective.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    direction='minimize',
)


def _objective_on_training_data(trial):
    """Bind the training data to the objective so Optuna can call it."""
    return objective_elasticnet(trial, X_train, y_train)


# Trials run one at a time (n_jobs=1) with a progress bar.
study.optimize(
    _objective_on_training_data,
    n_trials=N_TRIALS,
    n_jobs=1,
    show_progress_bar=True,
)

optimization_time = time.time() - start_time

# =============================================================================
# RESULTS
# =============================================================================
print(f"\n{'=' * 80}")
print("OPTIMIZATION RESULTS")
print("=" * 80)
print("Best RMSE: {:.6f}".format(study.best_value))
print("Optimization time: {:.1f}s".format(optimization_time))
print("\nBest parameters:")
# List the winning hyperparameters alphabetically by name, names padded to 20.
for param, value in sorted(study.best_params.items()):
    print("  {:20s}: {}".format(param, value))

# =============================================================================
# SAVE RESULTS
# =============================================================================
# Best hyperparameters plus the fixed seed, ready to be splatted into ElasticNet.
best_params = {**study.best_params, 'random_state': RANDOM_SEED}

joblib.dump(best_params, 'elasticnet_params.pkl')
print("\n✓ Parameters saved to: elasticnet_params.pkl")

# Per-trial history: trial number, objective value and the parameters rendered
# as a string.
history = pd.DataFrame(
    {
        'trial': list(map(lambda t: t.number, study.trials)),
        'value': list(map(lambda t: t.value, study.trials)),
        'params': list(map(lambda t: str(t.params), study.trials)),
    }
)
history.to_csv('elasticnet_history.csv', index=False)
print("✓ History saved to: elasticnet_history.csv")

# Persist the complete study object.
joblib.dump(study, 'elasticnet_study.pkl')
print("✓ Study saved to: elasticnet_study.pkl")

# Compact run summary.
summary = dict(
    model='ElasticNet',
    best_rmse=study.best_value,
    n_trials=N_TRIALS,
    n_folds=N_FOLDS,
    optimization_time=optimization_time,
    best_params=best_params,
)
joblib.dump(summary, 'elasticnet_summary.pkl')
print("✓ Summary saved to: elasticnet_summary.pkl")

# =============================================================================
# TRAIN FINAL MODEL ON FULL TRAINING DATA
# =============================================================================
print(f"\n{'=' * 80}")
print("TRAINING FINAL MODEL ON FULL DATASET")
print("=" * 80)

print("Training final model...")
# Refit on the full training set with the best parameters; fit() returns the
# estimator itself, so construction and fitting can be chained.
final_model = ElasticNet(**best_params).fit(X_train, y_train)

# Persist the fitted estimator.
joblib.dump(final_model, 'elasticnet_final_model.pkl')
print("\n✓ Final model saved to: elasticnet_final_model.pkl")

# =============================================================================
# GENERATE PREDICTIONS AND SUBMISSION FILE
# =============================================================================
print(f"\n{'=' * 80}")
print("GENERATING PREDICTIONS")
print("=" * 80)

# Predict on the test set and clamp the result to the valid range [0, 100].
test_predictions = np.clip(final_model.predict(X_test), 0, 100)

# Submission ids are consecutive integers starting at len(X_train).
# NOTE(review): this assumes the test ids continue directly after the training
# rows -- confirm against the original test file.
submission = pd.DataFrame(
    {
        'id': range(len(X_train), len(X_train) + len(test_predictions)),
        'exam_score': test_predictions,
    }
)

submission.to_csv('submission.csv', index=False)
print("✓ Submission file saved to: submission.csv")

# Descriptive statistics of the clipped predictions.
print("\nSubmission statistics:")
for label, stat in (
    ("Min", test_predictions.min()),
    ("Max", test_predictions.max()),
    ("Mean", test_predictions.mean()),
    ("Median", np.median(test_predictions)),
    ("Std", test_predictions.std()),
):
    print(f"  {label} prediction: {stat:.2f}")
print(f"  Shape: {submission.shape}")

# =============================================================================
# FEATURE IMPORTANCE (COEFFICIENTS)
# =============================================================================
print(f"\n{'=' * 80}")
print("FEATURE COEFFICIENTS")
print("=" * 80)

# One row per feature, ordered by decreasing absolute coefficient.
feature_importance = pd.DataFrame(
    {'feature': X_train.columns, 'coefficient': final_model.coef_}
)
feature_importance = feature_importance.sort_values(
    'coefficient', key=abs, ascending=False
)

# How many coefficients are non-zero, out of how many in total.
n_total = len(final_model.coef_)
n_nonzero = (final_model.coef_ != 0).sum()

print(f"\nFeature selection: {n_nonzero}/{n_total} features used ({n_nonzero/n_total*100:.1f}%)")
print("\nTop 10 features by absolute coefficient:")
print(feature_importance.head(10).to_string(index=False))

# Persist the full coefficient table.
feature_importance.to_csv('elasticnet_coefficients.csv', index=False)
print("\n✓ Coefficients saved to: elasticnet_coefficients.csv")

# =============================================================================
# MODEL STATISTICS
# =============================================================================
print(f"\n{'=' * 80}")
print("MODEL STATISTICS")
print("=" * 80)
# Fitted-model attributes and the chosen regularization settings, one per line.
print(
    f"Intercept: {final_model.intercept_:.4f}",
    f"Alpha (regularization strength): {best_params['alpha']:.6f}",
    f"L1 ratio: {best_params['l1_ratio']:.4f}",
    "  (L1 ratio = 0 → Ridge, L1 ratio = 1 → Lasso)",
    f"Number of iterations: {final_model.n_iter_}",
    f"Number of features selected: {n_nonzero}/{n_total}",
    sep="\n",
)

# =============================================================================
# FINAL SUMMARY
# =============================================================================
print(f"\n{'=' * 80}")
print("ELASTICNET REGRESSION OPTIMIZATION COMPLETE!")
print("=" * 80)
print(f"✓ Best CV RMSE: {study.best_value:.6f}")
print(f"✓ Final model trained on {len(X_train)} samples")
print(f"✓ Predictions generated for {len(X_test)} test samples")
print("✓ All results saved")

# Recap of every artefact written by this script, in creation order.
print("\nFiles created:")
for artifact in (
    "elasticnet_params.pkl",
    "elasticnet_history.csv",
    "elasticnet_study.pkl",
    "elasticnet_summary.pkl",
    "elasticnet_final_model.pkl",
    "elasticnet_coefficients.csv",
    "submission.csv",
):
    print(f"  • {artifact}")
print("=" * 80)

ELASTICNET REGRESSION HYPERPARAMETER OPTIMIZATION

LOADING PREPROCESSED DATA
✓ Train shape: (630000, 8)
✓ Target shape: (630000,)
✓ Test shape: (270000, 8)

OPTIMIZATION SETTINGS
Mode: THOROUGH
Trials: 100
Folds: 5

STARTING BAYESIAN OPTIMIZATION


  0%|          | 0/100 [00:00<?, ?it/s]


Trial 0
Parametri testati: {'alpha': 0.5547119471592125, 'l1_ratio': 0.7151893663724195, 'fit_intercept': True, 'max_iter': 4813, 'tol': 0.0008663279761354553, 'selection': 'random', 'random_state': 0}
  Fold 1: RMSE = 8.8946
  Fold 2: RMSE = 8.9002
  Fold 3: RMSE = 8.8716
  Fold 4: RMSE = 8.9097
  Fold 5: RMSE = 8.9026
RMSE medio: 8.8957
Elapsed Time: 15.7s


Trial 1
Parametri testati: {'alpha': 65.81332043291806, 'l1_ratio': 0.3834415188257777, 'fit_intercept': True, 'max_iter': 6112, 'tol': 0.0059812219011525555, 'selection': 'random', 'random_state': 0}
  Fold 1: RMSE = 9.8627
  Fold 2: RMSE = 9.8945
  Fold 3: RMSE = 9.8633
  Fold 4: RMSE = 9.9156
  Fold 5: RMSE = 9.9199
RMSE medio: 9.8912
Elapsed Time: 1.4s


Trial 2
Parametri testati: {'alpha': 0.0012620948285169307, 'l1_ratio': 0.832619845547938, 'fit_intercept': False, 'max_iter': 9808, 'tol': 0.0024973286104060573, 'selection': 'random', 'random_state': 0}
  Fold 1: RMSE = 8.8912
  Fold 2: RMSE = 8.8992
  Fold 3: RMSE = 8.869