In [1]:
import numpy as np
import pandas as pd
import warnings
import json
import time
from tqdm import tqdm
from scipy.optimize import minimize
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.isotonic import IsotonicRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

warnings.filterwarnings("ignore")
np.random.seed(42)

# =============================================================================
# GPU CONFIGURATION
# =============================================================================
# import torch
# print("="*80)
# print("GPU CONFIGURATION")
# print("="*80)
# if torch.cuda.is_available():
#     print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
#     print(f"✓ CUDA version: {torch.cuda.get_device_properties(0).major}.{torch.cuda.get_device_properties(0).minor}")
#     print(f"✓ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
#     USE_GPU = True
# else:
#     print("⚠ No GPU detected, using CPU")
USE_GPU = False

In [2]:
# =============================================================================
# 1. LOAD DATA
# =============================================================================
print("\n" + "="*80)
print("LOADING DATA")
print("="*80)

train_df = pd.read_csv("../data/train.csv")[:500]
test_df = pd.read_csv("../data/test.csv")[:100]
original_df = pd.read_csv("../data/Exam_Score_Prediction.csv")[:200]

TARGET = "exam_score"
ID_COL = "id"

print(f"Train: {train_df.shape}, Test: {test_df.shape}, Original: {original_df.shape}")


LOADING DATA
Train: (500, 13), Test: (100, 12), Original: (200, 13)


In [3]:
# =============================================================================
# 2. ADVERSARIAL VALIDATION
# =============================================================================
print("\n" + "="*80)
print("ADVERSARIAL VALIDATION - Detecting Distribution Shift")
print("="*80)

def adversarial_validation(train, test, features):
    """Identify train-test distribution differences"""
    train_adv = train[features].copy()
    test_adv = test[features].copy()

    train_adv['is_test'] = 0
    test_adv['is_test'] = 1

    combined = pd.concat([train_adv, test_adv], axis=0, ignore_index=True)

    for col in combined.select_dtypes(include=['object']).columns:
        if col != 'is_test':
            combined[col] = combined[col].astype('category').cat.codes

    X = combined.drop('is_test', axis=1)
    y = combined['is_test']

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
        model.fit(X_tr, y_tr)
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y, oof_preds)
    print(f"Adversarial AUC: {auc:.4f}")

    if auc > 0.6:
        print("⚠️  Significant distribution shift detected!")
        model = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
        model.fit(X, y)
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\nTop features causing shift:")
        print(importance_df.head(10))
    else:
        print("✓ Distribution shift is minimal")

    return auc

base_features = [c for c in train_df.columns if c not in [ID_COL, TARGET]]
adv_auc = adversarial_validation(train_df, test_df, base_features)



ADVERSARIAL VALIDATION - Detecting Distribution Shift
Adversarial AUC: 0.4861
✓ Distribution shift is minimal


In [4]:
train_df.head()

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


In [5]:
# =============================================================================
# 3. OPTIMIZED FEATURE ENGINEERING (EDA-DRIVEN)
# =============================================================================
print("\n" + "="*80)
print("OPTIMIZED FEATURE ENGINEERING (EDA-DRIVEN)")
print("="*80)

def create_optimized_features(df):
    """
    Create high-impact features based strictly on EDA findings.
    Focus: Quality over Quantity - Only features with theoretical justification.
    """
    df = df.copy()
    eps = 1e-5
    
    # =========================================================================
    # TIER 1: CRITICAL FEATURES (Core predictors from EDA)
    # =========================================================================
    print("  Creating Tier 1: Critical features...")
    
    # Polynomial features (degree 2 only)
    df['study_sq'] = df['study_hours'] ** 2
    df['attendance_sq'] = df['class_attendance'] ** 2
    
    # Key interactions
    df['study_x_attendance'] = df['study_hours'] * df['class_attendance']
    df['study_x_sleep'] = df['study_hours'] * df['sleep_hours']
    
    # Efficiency metrics
    df['efficiency'] = (df['study_hours'] * df['class_attendance']) / (df['sleep_hours'] + 1)
    df['efficiency_sq'] = df['efficiency'] ** 2
    
    # Weighted effort (EDA-informed weights)
    df['weighted_effort'] = (0.06 * df['class_attendance'] + 
                             2.0 * df['study_hours'] + 
                             1.2 * df['sleep_hours'])
    
    # =========================================================================
    # TIER 2: HIGH-IMPACT CATEGORICAL FEATURES (η² > 2% from EDA)
    # =========================================================================
    print("  Creating Tier 2: High-impact categorical features...")
    
    # Ordinal encoding for features with clear monotonic relationships
    sleep_quality_map = {'poor': 0, 'average': 1, 'good': 2}
    df['sleep_quality_ord'] = df['sleep_quality'].map(sleep_quality_map).fillna(1)
    
    study_method_map = {'self-study': 0, 'online videos': 1, 'group study': 2, 
                        'mixed': 3, 'coaching': 4}
    df['study_method_ord'] = df['study_method'].map(study_method_map).fillna(2)
    
    facility_map = {'low': 0, 'medium': 1, 'high': 2}
    df['facility_ord'] = df['facility_rating'].map(facility_map).fillna(1)
    
    difficulty_map = {'easy': 0, 'moderate': 1, 'hard': 2}
    df['difficulty_ord'] = df['exam_difficulty'].map(difficulty_map).fillna(1)
    
    # =========================================================================
    # TIER 3: CATEGORICAL × NUMERIC INTERACTIONS
    # =========================================================================
    print("  Creating Tier 3: Categorical × numeric interactions...")
    
    df['sleep_quality_x_study'] = df['sleep_quality_ord'] * df['study_hours']
    df['facility_x_attendance'] = df['facility_ord'] * df['class_attendance']
    df['study_method_x_hours'] = df['study_method_ord'] * df['study_hours']
    df['difficulty_x_efficiency'] = df['difficulty_ord'] * df['efficiency']
    
    # =========================================================================
    # TIER 4: DOMAIN-SPECIFIC FLAGS
    # =========================================================================
    print("  Creating Tier 4: Domain-specific flags...")
    
    df['ideal_sleep'] = ((df['sleep_hours'] >= 7) & (df['sleep_hours'] <= 9)).astype(int)
    df['sleep_deprived'] = (df['sleep_hours'] <= 5.5).astype(int)
    df['high_performer'] = ((df['study_hours'] >= 6) & (df['class_attendance'] >= 85)).astype(int)
    df['at_risk'] = ((df['study_hours'] <= 3) | (df['class_attendance'] <= 60)).astype(int)
    
    # =========================================================================
    # TIER 5: BINNING FEATURES
    # =========================================================================
    print("  Creating Tier 5: Intelligent binning...")
    
    df['study_bin'] = pd.cut(df['study_hours'], bins=[-0.1, 2, 4, 6, 8], labels=False).astype(float)
    df['attendance_bin'] = pd.cut(df['class_attendance'], bins=[40, 60, 75, 85, 95, 100], labels=False).astype(float)
    df['sleep_bin'] = pd.cut(df['sleep_hours'], bins=[4, 5.5, 7, 8.5, 10], labels=False).astype(float)
    
    # =========================================================================
    # TIER 6: SELECTIVE TRANSFORMATIONS
    # =========================================================================
    print("  Creating Tier 6: Selective transformations...")
    
    df['log_study_hours'] = np.log1p(df['study_hours'])
    df['sqrt_attendance'] = np.sqrt(df['class_attendance'])
    df['age_rank'] = df['age'].rank(pct=True)
    
    # =========================================================================
    # TIER 7: DISTANCE FROM OPTIMAL
    # =========================================================================
    print("  Creating Tier 7: Distance from optimal...")
    
    df['sleep_from_optimal'] = np.abs(df['sleep_hours'] - 8)
    df['study_from_optimal'] = np.abs(df['study_hours'] - 6)
    df['attendance_from_optimal'] = np.abs(df['class_attendance'] - 95)
    
    # =========================================================================
    # TIER 8: RATIOS
    # =========================================================================
    print("  Creating Tier 8: Meaningful ratios...")
    
    df['study_per_sleep'] = df['study_hours'] / (df['sleep_hours'] + eps)
    df['geometric_mean'] = (
        (df['study_hours'] + 1) *
        (df['class_attendance'] + 1) *
        (df['sleep_hours'] + 1)
    ) ** (1/3)
    
    print("✓ Feature engineering complete!")
    return df

# =============================================================================
# Apply feature engineering
# =============================================================================
print("\nApplying optimized feature engineering...")
train_fe = create_optimized_features(train_df)
test_fe = create_optimized_features(test_df)
original_fe = create_optimized_features(original_df)

y = train_df[TARGET].clip(0, 100).values
y_orig = original_df[TARGET].clip(0, 100).values

# Categorical features for target encoding (only high-impact from EDA)
cat_features = [
    'sleep_quality',      # η² = 5.6%
    'study_method',       # η² = 5.0%
    'facility_rating',    # η² = 3.6%
]

# Low-impact categoricals to remove
low_impact_cats = [
    'gender',           # η² < 0.2%
    'course',           # η² < 0.2%
    'internet_access',  # η² < 0.2%
    'exam_difficulty',  # We have difficulty_ord instead
]

print(f"\n{'='*60}")
print(f"FEATURE ENGINEERING SUMMARY")
print(f"{'='*60}")
print(f"High-impact categoricals (for target encoding): {cat_features}")
print(f"Low-impact categoricals (will be removed): {low_impact_cats}")


OPTIMIZED FEATURE ENGINEERING (EDA-DRIVEN)

Applying optimized feature engineering...
  Creating Tier 1: Critical features...
  Creating Tier 2: High-impact categorical features...
  Creating Tier 3: Categorical × numeric interactions...
  Creating Tier 4: Domain-specific flags...
  Creating Tier 5: Intelligent binning...
  Creating Tier 6: Selective transformations...
  Creating Tier 7: Distance from optimal...
  Creating Tier 8: Meaningful ratios...
✓ Feature engineering complete!
  Creating Tier 1: Critical features...
  Creating Tier 2: High-impact categorical features...
  Creating Tier 3: Categorical × numeric interactions...
  Creating Tier 4: Domain-specific flags...
  Creating Tier 5: Intelligent binning...
  Creating Tier 6: Selective transformations...
  Creating Tier 7: Distance from optimal...
  Creating Tier 8: Meaningful ratios...
✓ Feature engineering complete!
  Creating Tier 1: Critical features...
  Creating Tier 2: High-impact categorical features...
  Creating Tie

In [6]:
# =============================================================================
# 4. TARGET ENCODING (BEFORE feature selection)
# =============================================================================
print("\n" + "="*80)
print("TARGET ENCODING ON CATEGORICAL FEATURES")
print("="*80)

from sklearn.model_selection import KFold

def target_encode_cv(X_train, X_test, y_train, cat_cols, n_splits=5, alpha=10):
    """
    Target encoding with CV to prevent leakage.
    Alpha = smoothing parameter (higher = more regularization)
    """
    X_train_enc = X_train.copy()
    X_test_enc = X_test.copy()
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    global_mean = y_train.mean()
    
    for col in cat_cols:
        print(f"  Encoding {col}...")
        
        # For train: use CV to prevent leakage
        X_train_enc[f'{col}_target'] = 0.0
        
        for train_idx, val_idx in kf.split(X_train):
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train[train_idx]
            
            # Create temporary DataFrame to align X and y
            df_fold = pd.DataFrame({
                'category': X_tr[col].values,
                'target': y_tr
            })
            
            # Calculate mean encoding on training fold
            target_means = df_fold.groupby('category')['target'].agg(['mean', 'count'])
            target_means['target_enc'] = (
                (target_means['mean'] * target_means['count'] + global_mean * alpha) / 
                (target_means['count'] + alpha)
            )
            
            # Apply to validation fold
            X_train_enc.loc[val_idx, f'{col}_target'] = (
                X_train.iloc[val_idx][col]
                .map(target_means['target_enc'])
                .fillna(global_mean)
            )
        
        # For test: use all training data to calculate means
        df_full = pd.DataFrame({
            'category': X_train[col].values,
            'target': y_train
        })
        
        target_means_full = df_full.groupby('category')['target'].agg(['mean', 'count'])
        target_means_full['target_enc'] = (
            (target_means_full['mean'] * target_means_full['count'] + global_mean * alpha) / 
            (target_means_full['count'] + alpha)
        )
        
        X_test_enc[f'{col}_target'] = (
            X_test[col]
            .map(target_means_full['target_enc'])
            .fillna(global_mean)
        )
        
        # Remove original column (now we have the encoded version)
        X_train_enc = X_train_enc.drop(columns=[col])
        X_test_enc = X_test_enc.drop(columns=[col])
    
    return X_train_enc, X_test_enc


def target_encode_simple(X, y, cat_cols, alpha=10):
    """Simple target encoding (no CV) for small datasets"""
    X_enc = X.copy()
    global_mean = y.mean()
    
    for col in cat_cols:
        print(f"  Encoding {col} (simple)...")
        
        df_temp = pd.DataFrame({
            'category': X[col].values,
            'target': y
        })
        
        target_means = df_temp.groupby('category')['target'].agg(['mean', 'count'])
        target_means['target_enc'] = (
            (target_means['mean'] * target_means['count'] + global_mean * alpha) / 
            (target_means['count'] + alpha)
        )
        
        X_enc[f'{col}_target'] = X[col].map(target_means['target_enc']).fillna(global_mean)
        X_enc = X_enc.drop(columns=[col])
    
    return X_enc


# ✅ Apply target encoding - PASS COMPLETE DATAFRAMES
print("\nTarget encoding TRAIN and TEST (with CV)...")
X_train_init, X_test_init = target_encode_cv(
    train_fe,  # ✅ Pass complete DataFrame (includes categoricals)
    test_fe,   # ✅ Pass complete DataFrame
    pd.Series(y, name='exam_score'),
    cat_features,
    n_splits=5,
    alpha=10
)

print("\nTarget encoding ORIGINAL (simple, no CV)...")
X_orig_init = target_encode_simple(
    original_fe,  # ✅ Pass complete DataFrame
    pd.Series(y_orig, name='exam_score'),
    cat_features,
    alpha=10
)

# ✅ NOW remove low-impact categoricals + ID + TARGET
cols_to_remove = [ID_COL, TARGET] + low_impact_cats
feature_cols = [c for c in X_train_init.columns if c not in cols_to_remove]

X_train_init = X_train_init[feature_cols]
X_test_init = X_test_init[feature_cols]
X_orig_init = X_orig_init[feature_cols]

# Convert to float32
X_train_init = X_train_init.astype(np.float32)
X_test_init = X_test_init.astype(np.float32)
X_orig_init = X_orig_init.astype(np.float32)

print(f"\n✓ Target encoding complete. Shape: {X_train_init.shape}")
print(f"✓ Features after cleanup: {len(feature_cols)}")

# =============================================================================
# 4.5. FEATURE SELECTION (OPTIONAL with ~40 features)
# =============================================================================
print("\n" + "="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

print("Training quick LightGBM for feature importance...")

X_temp = X_train_init.copy()
y_temp = y.copy()

kf_temp = KFold(n_splits=3, shuffle=True, random_state=42)
feature_importance = np.zeros(X_temp.shape[1])

for fold, (train_idx, val_idx) in enumerate(kf_temp.split(X_temp)):
    X_tr, X_val = X_temp.iloc[train_idx], X_temp.iloc[val_idx]
    y_tr, y_val = y_temp[train_idx], y_temp[val_idx]

    lgb_params_quick = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'device': 'gpu' if USE_GPU else 'cpu',
        'seed': 42
    }

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params_quick, train_data,
        num_boost_round=100,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(20), lgb.log_evaluation(0)]
    )

    feature_importance += model.feature_importance(importance_type='gain')
    print(f"  Fold {fold+1}/3 done")

# Average importance
feature_importance /= 3

# Create DataFrame with importance
importance_df = pd.DataFrame({
    'feature': X_temp.columns,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

# Calculate cumulative importance
importance_df['cumulative_importance'] = importance_df['importance'].cumsum() / importance_df['importance'].sum()

print(f"\nTotal features: {len(importance_df)}")
print(f"\nTop 20 most important features:")
print(importance_df.head(20))

# Feature selection strategy
print("\n" + "="*60)
print("FEATURE SELECTION STRATEGY")
print("="*60)
print(f"Given we have {len(feature_cols)} well-designed features,")
print(f"we'll use a CONSERVATIVE threshold to keep most features.")
print()

# Choose strategy
USE_FEATURE_SELECTION = True  # Set to False to keep all features

if USE_FEATURE_SELECTION:
    # Keep features covering 98% of importance
    threshold = 0.98
    selected_features = importance_df[importance_df['cumulative_importance'] <= threshold]['feature'].tolist()
    
    # Ensure at least 3 features
    if len(selected_features) < 3:
        selected_features = importance_df.head(3)['feature'].tolist()
else:
    selected_features = feature_cols
    print("Keeping ALL features (no selection)")

print(f"\n{'='*60}")
print(f"FEATURE SELECTION RESULTS")
print(f"{'='*60}")
print(f"Original features: {len(feature_cols)}")
print(f"Selected features: {len(selected_features)}")
print(f"Reduction: {100*(1 - len(selected_features)/len(feature_cols)):.1f}%")

# Update datasets
X_train_init = X_train_init[selected_features]
X_test_init = X_test_init[selected_features]
X_orig_init = X_orig_init[selected_features]
feature_cols = selected_features

print(f"\n✓ Feature selection complete!")
print(f"✓ Final dataset shape: {X_train_init.shape}")

# Save feature importance
importance_df.to_csv('../results/feature_importance.csv', index=False)
print("✓ Feature importance saved to feature_importance.csv")


TARGET ENCODING ON CATEGORICAL FEATURES

Target encoding TRAIN and TEST (with CV)...
  Encoding sleep_quality...
  Encoding study_method...
  Encoding facility_rating...

Target encoding ORIGINAL (simple, no CV)...
  Encoding sleep_quality (simple)...
  Encoding study_method (simple)...
  Encoding facility_rating (simple)...

✓ Target encoding complete. Shape: (500, 37)
✓ Features after cleanup: 37

FEATURE IMPORTANCE ANALYSIS
Training quick LightGBM for feature importance...
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 9.60017
  Fold 1/3 done
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[79]	valid_0's rmse: 8.98488
  Fold 2/3 done
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[97]	valid_0's rmse: 9.06568
  Fold 3/3 done

Total features: 37

Top 20 most important features:
                    feature  

In [7]:
# =============================================================================
# 5. BAYESIAN HYPERPARAMETER OPTIMIZATION - ALL MODELS
# =============================================================================
print("\n" + "="*80)
print("BAYESIAN HYPERPARAMETER OPTIMIZATION - ALL MODELS")
print("="*80)

start_time = time.time()

# VELOCITÀ vs PRECISIONE trade-off
FAST_MODE = True

if FAST_MODE:
    trials_config = {
        'lgb': 20,      # era 40
        'xgb': 25,      # era 50
        'cat': 10,      # era 10
        'et': 5,        # era 5
        'gbr': 5,       # era 10
        'ridge': 15,    # era 30
        'elastic': 15,  # era 30
        'svr': 5        # era 10
    }
else:
    trials_config = {
        'lgb': 50, 'xgb': 50, 'cat': 20,
        'et': 20, 'gbr': 20, 'ridge': 40,
        'elastic': 40, 'svr': 20
    }

# Numero di folds
FOLDS = 4

# Dictionary to store all best parameters
best_params_dict = {}

# --- LIGHTGBM ---
def objective_lgb(trial, X, y):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'device': 'gpu' if USE_GPU else 'cpu',
        'gpu_use_dp': False if USE_GPU else None,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 60),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 0.95),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 0.95),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 1.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 1.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 1.0),
    }
    params = {k: v for k, v in params.items() if v is not None}

    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    # Progress bar per i fold
    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        model = lgb.train(
            params, train_data, num_boost_round=1000,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        preds = np.clip(model.predict(X_val), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[1/8] Optimizing LightGBM...")
study_lgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_lgb.optimize(lambda trial: objective_lgb(trial, X_train_init, y), 
                   n_trials=trials_config['lgb'], 
                   show_progress_bar=True, 
                   n_jobs=1)
best_params_dict['lightgbm'] = study_lgb.best_params
print(f"✓ Best LightGBM RMSE: {study_lgb.best_value:.6f}")

# --- XGBOOST ---
def objective_xgb(trial, X, y):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': 'cuda:0' if USE_GPU else 'cpu',
        'predictor': 'gpu_predictor' if USE_GPU else 'cpu_predictor',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
        'gamma': trial.suggest_float('gamma', 0, 1.0),
        'lambda': trial.suggest_float('lambda', 0, 2.0),
        'alpha': trial.suggest_float('alpha', 0, 2.0),
        'seed': 42
    }
    params = {k: v for k, v in params.items() if v is not None}

    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params, dtrain, num_boost_round=1000,
            evals=[(dval, 'eval')],
            early_stopping_rounds=50,
            verbose_eval=False
        )

        preds = np.clip(model.predict(dval), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[2/8] Optimizing XGBoost...")
study_xgb = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgb.optimize(lambda trial: objective_xgb(trial, X_train_init, y), 
                   n_trials=trials_config['xgb'], 
                   show_progress_bar=True, 
                   n_jobs=1)
best_params_dict['xgboost'] = study_xgb.best_params
print(f"✓ Best XGBoost RMSE: {study_xgb.best_value:.6f}")

# --- CATBOOST ---
def objective_cat(trial, X, y):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 2),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'task_type': 'GPU' if USE_GPU else 'CPU',
        'devices': '0' if USE_GPU else None,
        'verbose': False,
        'random_seed': 42
    }

    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model = CatBoostRegressor(**params)
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)

        preds = np.clip(model.predict(X_val), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[3/8] Optimizing CatBoost...")
study_cat = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_cat.optimize(lambda trial: objective_cat(trial, X_train_init, y), 
                   n_trials=trials_config['cat'], 
                   show_progress_bar=True, 
                   n_jobs=1)
best_params_dict['catboost'] = study_cat.best_params
print(f"✓ Best CatBoost RMSE: {study_cat.best_value:.6f}")

# --- EXTRATREES ---
def objective_et(trial, X, y):
    # Subsample per velocità
    sample_size = int(len(X) * 0.3)
    idx = np.random.choice(len(X), sample_size, replace=False)
    X_sample = X.iloc[idx]
    y_sample = y[idx]

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 10, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 4, 10),
        'max_features': trial.suggest_float('max_features', 0.6, 0.9),
        'random_state': 42,
        'n_jobs': -1
    }

    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X_sample), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X_sample.iloc[train_idx], X_sample.iloc[val_idx]
        y_tr, y_val = y_sample[train_idx], y_sample[val_idx]

        model = ExtraTreesRegressor(**params)
        model.fit(X_tr, y_tr)

        preds = np.clip(model.predict(X_val), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[4/8] Optimizing ExtraTrees...")
study_et = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_et.optimize(lambda trial: objective_et(trial, X_train_init, y), 
                  n_trials=trials_config['et'], 
                  show_progress_bar=True, 
                  n_jobs=1)
best_params_dict['extratrees'] = study_et.best_params
print(f"✓ Best ExtraTrees RMSE: {study_et.best_value:.6f}")

# --- GRADIENT BOOSTING ---
def objective_gbr(trial, X, y):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 4, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 0.9),
        'max_features': trial.suggest_float('max_features', 0.6, 0.9),
        'random_state': 42
    }

    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model = GradientBoostingRegressor(**params)
        model.fit(X_tr, y_tr)

        preds = np.clip(model.predict(X_val), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[5/8] Optimizing GradientBoosting...")
study_gbr = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_gbr.optimize(lambda trial: objective_gbr(trial, X_train_init, y), 
                   n_trials=trials_config['gbr'], 
                   show_progress_bar=True, 
                   n_jobs=4)
best_params_dict['gradientboosting'] = study_gbr.best_params
print(f"✓ Best GradientBoosting RMSE: {study_gbr.best_value:.6f}")

# --- RIDGE ---
def objective_ridge(trial, X, y):
    alpha = trial.suggest_float('alpha', 0.1, 100, log=True)

    scaler = RobustScaler()
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        X_tr_scaled = scaler.fit_transform(X_tr)
        X_val_scaled = scaler.transform(X_val)

        model = Ridge(alpha=alpha, random_state=42)
        model.fit(X_tr_scaled, y_tr)

        preds = np.clip(model.predict(X_val_scaled), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[6/8] Optimizing Ridge...")
study_ridge = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_ridge.optimize(lambda trial: objective_ridge(trial, X_train_init, y), 
                     n_trials=trials_config['ridge'], 
                     show_progress_bar=True, 
                     n_jobs=4)
best_params_dict['ridge'] = {'alpha': study_ridge.best_params['alpha']}
print(f"✓ Best Ridge RMSE: {study_ridge.best_value:.6f}")

# --- ELASTICNET ---
def objective_elastic(trial, X, y):
    alpha = trial.suggest_float('alpha', 0.01, 10, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)

    scaler = RobustScaler()
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        X_tr_scaled = scaler.fit_transform(X_tr)
        X_val_scaled = scaler.transform(X_val)

        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=5000, random_state=42)
        model.fit(X_tr_scaled, y_tr)

        preds = np.clip(model.predict(X_val_scaled), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[7/8] Optimizing ElasticNet...")
study_elastic = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_elastic.optimize(lambda trial: objective_elastic(trial, X_train_init, y), 
                       n_trials=trials_config['elastic'], 
                       show_progress_bar=True, 
                       n_jobs=4)
best_params_dict['elasticnet'] = study_elastic.best_params
print(f"✓ Best ElasticNet RMSE: {study_elastic.best_value:.6f}")

# --- SVR ---
def objective_svr(trial, X, y):
    C = trial.suggest_float('C', 1, 100, log=True)
    epsilon = trial.suggest_float('epsilon', 0.01, 1.0)
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    scaler = StandardScaler()
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(kf.split(X), 
                                                           total=FOLDS, 
                                                           desc=f"Trial {trial.number}", 
                                                           leave=False)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        X_tr_scaled = scaler.fit_transform(X_tr)
        X_val_scaled = scaler.transform(X_val)

        # Subsample per velocità (SVR è lento)
        sample_size = min(50000, len(X_tr))
        idx = np.random.choice(len(X_tr), sample_size, replace=False)

        model = SVR(C=C, epsilon=epsilon, gamma=gamma, kernel='rbf')
        model.fit(X_tr_scaled[idx], y_tr[idx])

        preds = np.clip(model.predict(X_val_scaled), 0, 100)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        scores.append(rmse)

    return np.mean(scores)

print("\n[8/8] Optimizing SVR (using subsampling for speed)...")
study_svr = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_svr.optimize(lambda trial: objective_svr(trial, X_train_init, y), 
                   n_trials=trials_config['svr'], 
                   show_progress_bar=True, 
                   n_jobs=2)
best_params_dict['svr'] = study_svr.best_params
print(f"✓ Best SVR RMSE: {study_svr.best_value:.6f}")

# =============================================================================
# SUMMARY
# =============================================================================
print("\n" + "="*80)
print("OPTIMIZATION SUMMARY")
print("="*80)

results = []
for model_name, study in [
    ('LightGBM', study_lgb),
    ('XGBoost', study_xgb),
    ('CatBoost', study_cat),
    ('ExtraTrees', study_et),
    ('GradientBoosting', study_gbr),
    ('Ridge', study_ridge),
    ('ElasticNet', study_elastic),
    ('SVR', study_svr)
]:
    results.append({
        'Model': model_name,
        'Best RMSE': f"{study.best_value:.6f}",
        'Trials': len(study.trials)
    })

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

print("\n✓ All hyperparameters saved in best_params_dict")
print(f"✓ Total optimization time: {time.time() - start_time:.1f}s")


BAYESIAN HYPERPARAMETER OPTIMIZATION - ALL MODELS

[1/8] Optimizing LightGBM...


  0%|          | 0/20 [00:00<?, ?it/s]



Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[105]	valid_0's rmse: 8.98253
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[223]	valid_0's rmse: 9.96857
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[134]	valid_0's rmse: 8.32107
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[160]	valid_0's rmse: 9.40645




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[338]	valid_0's rmse: 9.00805
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[408]	valid_0's rmse: 9.88616
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[313]	valid_0's rmse: 8.23923
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[333]	valid_0's rmse: 9.50445




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 9.2423
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[82]	valid_0's rmse: 9.94467
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[92]	valid_0's rmse: 8.21376
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[73]	valid_0's rmse: 9.47721




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[95]	valid_0's rmse: 9.05176
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[108]	valid_0's rmse: 10.0134
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[118]	valid_0's rmse: 8.23223




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[83]	valid_0's rmse: 9.38609




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[204]	valid_0's rmse: 9.17008
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[231]	valid_0's rmse: 10.0114
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[299]	valid_0's rmse: 8.26601
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[344]	valid_0's rmse: 9.42431




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	valid_0's rmse: 9.29083
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	valid_0's rmse: 9.99669
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[57]	valid_0's rmse: 8.37574
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	valid_0's rmse: 9.62772




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[105]	valid_0's rmse: 9.15659
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[110]	valid_0's rmse: 9.98535
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[122]	valid_0's rmse: 8.1857
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[88]	valid_0's rmse: 9.56786




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[50]	valid_0's rmse: 9.06141
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[72]	valid_0's rmse: 9.87331
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[58]	valid_0's rmse: 8.38651
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[96]	valid_0's rmse: 9.35901




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[84]	valid_0's rmse: 9.2807
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 9.89368
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[37]	valid_0's rmse: 8.63345
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[40]	valid_0's rmse: 9.60982




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[228]	valid_0's rmse: 8.98457
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[308]	valid_0's rmse: 9.88538
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[241]	valid_0's rmse: 8.10912
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[263]	valid_0's rmse: 9.57178




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[160]	valid_0's rmse: 9.1375
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[179]	valid_0's rmse: 10.1233
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[148]	valid_0's rmse: 8.46356
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[256]	valid_0's rmse: 9.38502




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[332]	valid_0's rmse: 8.996
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[287]	valid_0's rmse: 9.99356
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[353]	valid_0's rmse: 8.25193
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[373]	valid_0's rmse: 9.43241




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[124]	valid_0's rmse: 9.02059
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[133]	valid_0's rmse: 9.94194
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[266]	valid_0's rmse: 8.17917
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[124]	valid_0's rmse: 9.46193




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[105]	valid_0's rmse: 9.12269
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[165]	valid_0's rmse: 9.94514
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[135]	valid_0's rmse: 8.23101
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[135]	valid_0's rmse: 9.31829




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[98]	valid_0's rmse: 9.06484
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[169]	valid_0's rmse: 9.98803




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[160]	valid_0's rmse: 8.51307
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[109]	valid_0's rmse: 9.41905




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[139]	valid_0's rmse: 9.19816
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[151]	valid_0's rmse: 9.91738
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[253]	valid_0's rmse: 8.08714
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[212]	valid_0's rmse: 9.35018




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[84]	valid_0's rmse: 9.20022
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[131]	valid_0's rmse: 10.2279
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[95]	valid_0's rmse: 8.43553
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[81]	valid_0's rmse: 9.34916




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[190]	valid_0's rmse: 9.21228




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[164]	valid_0's rmse: 10.0163
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[157]	valid_0's rmse: 8.27148
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[140]	valid_0's rmse: 9.57489




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[77]	valid_0's rmse: 9.12376
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[138]	valid_0's rmse: 9.91018
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[94]	valid_0's rmse: 8.38462
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[105]	valid_0's rmse: 9.35022




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[192]	valid_0's rmse: 9.25249
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[132]	valid_0's rmse: 10.043
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[187]	valid_0's rmse: 8.51522
Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[227]	valid_0's rmse: 9.61603
✓ Best LightGBM RMSE: 9.137712

[2/8] Optimizing XGBoost...


  0%|          | 0/25 [00:00<?, ?it/s]



✓ Best XGBoost RMSE: 9.228942

[3/8] Optimizing CatBoost...


  0%|          | 0/10 [00:00<?, ?it/s]



✓ Best CatBoost RMSE: 9.116181

[4/8] Optimizing ExtraTrees...


  0%|          | 0/5 [00:00<?, ?it/s]



✓ Best ExtraTrees RMSE: 9.509916

[5/8] Optimizing GradientBoosting...


  0%|          | 0/5 [00:00<?, ?it/s]


[A

[A[A


[A[A[A
[A


[A[A[A

[A[A
[A


[A[A[A

[A[A
[A


[A[A[A
[A
[A

[A[A


[A[A[A


[A[A[A

[A[A

[A[A

✓ Best GradientBoosting RMSE: 9.218248

[6/8] Optimizing Ridge...


  0%|          | 0/15 [00:00<?, ?it/s]



[A[A
[A


[A[A[A

[A[A
[A


[A[A[A

[A[A
[A
[A


[A[A[A

[A[A


[A[A[A
[A

[A[A


[A[A[A
[A

[A[A
[A


[A[A[A
[A

[A[A


[A[A[A

[A[A
[A


[A[A[A
[A

[A[A

[A[A
[A


[A[A[A

[A[A
[A
[A

[A[A

✓ Best Ridge RMSE: 8.935643

[7/8] Optimizing ElasticNet...


  0%|          | 0/15 [00:00<?, ?it/s]




[A[A[A

[A[A
[A


[A[A[A

[A[A
[A


[A[A[A


[A[A[A

[A[A

[A[A
[A
[A


[A[A[A


[A[A[A

[A[A
[A


[A[A[A

[A[A
[A

[A[A
[A


[A[A[A

[A[A


[A[A[A
[A


[A[A[A

[A[A
[A
[A


[A[A[A


[A[A[A
[A
[A

✓ Best ElasticNet RMSE: 8.945330

[8/8] Optimizing SVR (using subsampling for speed)...


  0%|          | 0/5 [00:00<?, ?it/s]


[A
[A
[A
[A
[A

✓ Best SVR RMSE: 9.529466

OPTIMIZATION SUMMARY
           Model Best RMSE  Trials
        LightGBM  9.137712      20
         XGBoost  9.228942      25
        CatBoost  9.116181      10
      ExtraTrees  9.509916       5
GradientBoosting  9.218248       5
           Ridge  8.935643      15
      ElasticNet  8.945330      15
             SVR  9.529466       5

✓ All hyperparameters saved in best_params_dict
✓ Total optimization time: 78.7s


In [8]:
# =============================================================================
# 6. SAVE BEST PARAMETERS
# =============================================================================
print("\n" + "="*80)
print("SAVING BEST PARAMETERS")
print("="*80)

# Save as JSON
with open('../results/best_hyperparameters.json', 'w') as f:
    json.dump(best_params_dict, f, indent=4)
print("✓ Saved to: results/best_hyperparameters.json")

# Save as CSV for easy viewing
params_df = pd.DataFrame([
    {'model': model, 'parameter': param, 'value': value}
    for model, params in best_params_dict.items()
    for param, value in params.items()
])
params_df.to_csv('../results/best_hyperparameters.csv', index=False)
print("✓ Saved to: best_hyperparameters.csv")

print("\nBest Parameters Summary:")
for model, params in best_params_dict.items():
    print(f"\n{model.upper()}:")
    for param, value in params.items():
        print(f"  {param}: {value}")

print(f"\n{'='*80}")
print("HYPERPARAMETER OPTIMIZATION COMPLETE")
print(f"{'='*80}")
print("✓ All 8 models optimized")
print("✓ Parameters saved to JSON and CSV")
print("\nYou can now proceed to Part 2 for training and ensemble creation.")

# Load best parameters from Part 1
with open('../results/best_hyperparameters.json', 'r') as f:
    best_params_dict = json.load(f)

print("="*80)
print("PART 2: MODEL TRAINING & ENSEMBLE")
print("="*80)
print("✓ Best parameters loaded from results/best_hyperparameters.json")


SAVING BEST PARAMETERS
✓ Saved to: results/best_hyperparameters.json
✓ Saved to: best_hyperparameters.csv

Best Parameters Summary:

LIGHTGBM:
  learning_rate: 0.020763482134447155
  num_leaves: 49
  max_depth: 9
  min_child_samples: 33
  feature_fraction: 0.8698385129840963
  bagging_fraction: 0.7728284587275367
  bagging_freq: 4
  lambda_l1: 0.42754101835854963
  lambda_l2: 0.02541912674409519
  min_gain_to_split: 0.10789142699330445

XGBOOST:
  learning_rate: 0.025691100161474374
  max_depth: 4
  min_child_weight: 5
  subsample: 0.6750300724595839
  colsample_bytree: 0.7080589128703798
  gamma: 0.34569374524826657
  lambda: 1.4740680461408981
  alpha: 1.197143151579027

CATBOOST:
  iterations: 2617
  learning_rate: 0.037415239225603365
  depth: 4
  l2_leaf_reg: 7.158097238609412
  bagging_temperature: 0.4401524937396013
  random_strength: 0.24407646968955765
  border_count: 142

EXTRATREES:
  n_estimators: 142
  max_depth: 10
  min_samples_split: 13
  min_samples_leaf: 6
  max_feat

In [9]:
# =============================================================================
# 7. MULTI-LEVEL STACKING WITH OPTIMIZED MODELS
# =============================================================================
print("\n" + "="*80)
print("LEVEL 1: BASE MODELS TRAINING (8 MODELS)")
print("="*80)

N_FOLDS = 10
y_bins = pd.qcut(y, q=10, labels=False, duplicates='drop')
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Storage for OOF and test predictions
oof_predictions = {
    'ridge': np.zeros(len(y)),
    'elastic': np.zeros(len(y)),
    'xgb': np.zeros(len(y)),
    'lgb': np.zeros(len(y)),
    'cat': np.zeros(len(y)),
    'et': np.zeros(len(y)),
    'gbr': np.zeros(len(y)),
    'svr': np.zeros(len(y))
}

test_predictions = {
    'ridge': np.zeros((len(X_test_init), N_FOLDS)),
    'elastic': np.zeros((len(X_test_init), N_FOLDS)),
    'xgb': np.zeros((len(X_test_init), N_FOLDS)),
    'lgb': np.zeros((len(X_test_init), N_FOLDS)),
    'cat': np.zeros((len(X_test_init), N_FOLDS)),
    'et': np.zeros((len(X_test_init), N_FOLDS)),
    'gbr': np.zeros((len(X_test_init), N_FOLDS)),
    'svr': np.zeros((len(X_test_init), N_FOLDS))
}

best_iterations = {
    'xgb': [],
    'lgb': [],
    'cat': []
}

# Update parameters with fixed settings
best_params_dict['lightgbm'].update({
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'device': 'gpu' if USE_GPU else 'cpu',
    'gpu_use_dp': False if USE_GPU else None,
    'seed': 42
})

best_params_dict['xgboost'].update({
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda:0' if USE_GPU else 'cpu',
    'predictor': 'gpu_predictor' if USE_GPU else 'cpu_predictor',
    'seed': 42
})

best_params_dict['catboost'].update({
    'task_type': 'GPU' if USE_GPU else 'CPU',
    'devices': '0' if USE_GPU else None,
    'verbose': False,
    'random_seed': 42
})

# Remove None values
for model in ['lightgbm', 'xgboost', 'catboost']:
    best_params_dict[model] = {k: v for k, v in best_params_dict[model].items() if v is not None}

print(f"\nDataset info:")
print(f"  Train (synthetic): {len(X_train_init):,} samples × {X_train_init.shape[1]} features")
print(f"  Original data:     {len(X_orig_init):,} samples × {X_orig_init.shape[1]} features")
print(f"  Test:              {len(X_test_init):,} samples × {X_test_init.shape[1]} features")

# Training loop
for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_train_init, y_bins), 
                                                   total=N_FOLDS, 
                                                   desc="Training Folds"), 1):
    print(f"\n{'='*60}")
    print(f"FOLD {fold}/{N_FOLDS}")
    print(f"{'='*60}")

    # ✅ Split usando i dati già target-encodati
    X_tr, X_val = X_train_init.iloc[train_idx], X_train_init.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # ✅ Augment with original data (già target-encodato)
    X_tr_full = pd.concat([X_tr, X_orig_init], axis=0, ignore_index=True)
    y_tr_full = np.concatenate([y_tr, y_orig])

    print(f"Training size: {len(X_tr_full):,} ({len(X_tr):,} synthetic + {len(X_orig_init):,} original)")

    # --- RIDGE ---
    print("Ridge...", end=" ", flush=True)
    scaler_ridge = RobustScaler()
    X_tr_scaled = scaler_ridge.fit_transform(X_tr_full)
    X_val_scaled = scaler_ridge.transform(X_val)
    X_test_scaled = scaler_ridge.transform(X_test_init)

    ridge = Ridge(**best_params_dict['ridge'], random_state=42)
    ridge.fit(X_tr_scaled, y_tr_full)
    oof_predictions['ridge'][val_idx] = np.clip(ridge.predict(X_val_scaled), 0, 100)
    test_predictions['ridge'][:, fold-1] = np.clip(ridge.predict(X_test_scaled), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['ridge'][val_idx])):.5f}")

    # --- ELASTICNET ---
    print("ElasticNet...", end=" ", flush=True)
    elastic = ElasticNet(**best_params_dict['elasticnet'], max_iter=5000, random_state=42)
    elastic.fit(X_tr_scaled, y_tr_full)
    oof_predictions['elastic'][val_idx] = np.clip(elastic.predict(X_val_scaled), 0, 100)
    test_predictions['elastic'][:, fold-1] = np.clip(elastic.predict(X_test_scaled), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['elastic'][val_idx])):.5f}")

    # --- XGBOOST ---
    print("XGBoost...", end=" ", flush=True)
    dtrain = xgb.DMatrix(X_tr_full, label=y_tr_full)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test_init)

    xgb_model = xgb.train(
        best_params_dict['xgboost'],
        dtrain,
        num_boost_round=3000,
        evals=[(dval, 'eval')],
        early_stopping_rounds=100,
        verbose_eval=False
    )

    best_iterations['xgb'].append(xgb_model.best_iteration)
    oof_predictions['xgb'][val_idx] = np.clip(xgb_model.predict(dval), 0, 100)
    test_predictions['xgb'][:, fold-1] = np.clip(xgb_model.predict(dtest), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['xgb'][val_idx])):.5f} (iter: {xgb_model.best_iteration})")

    # --- LIGHTGBM ---
    print("LightGBM...", end=" ", flush=True)
    train_data = lgb.Dataset(X_tr_full, label=y_tr_full)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    lgb_model = lgb.train(
        best_params_dict['lightgbm'],
        train_data,
        num_boost_round=3000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    best_iterations['lgb'].append(lgb_model.best_iteration)
    oof_predictions['lgb'][val_idx] = np.clip(lgb_model.predict(X_val), 0, 100)
    test_predictions['lgb'][:, fold-1] = np.clip(lgb_model.predict(X_test_init), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['lgb'][val_idx])):.5f} (iter: {lgb_model.best_iteration})")

    # --- CATBOOST ---
    print("CatBoost...", end=" ", flush=True)
    cat_model = CatBoostRegressor(**best_params_dict['catboost'])
    cat_model.fit(X_tr_full, y_tr_full, eval_set=(X_val, y_val), verbose=False)

    best_iterations['cat'].append(cat_model.get_best_iteration())
    oof_predictions['cat'][val_idx] = np.clip(cat_model.predict(X_val), 0, 100)
    test_predictions['cat'][:, fold-1] = np.clip(cat_model.predict(X_test_init), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['cat'][val_idx])):.5f} (iter: {cat_model.get_best_iteration()})")

    # --- EXTRATREES ---
    print("ExtraTrees...", end=" ", flush=True)
    et_model = ExtraTreesRegressor(**best_params_dict['extratrees'])
    et_model.fit(X_tr_full, y_tr_full)
    oof_predictions['et'][val_idx] = np.clip(et_model.predict(X_val), 0, 100)
    test_predictions['et'][:, fold-1] = np.clip(et_model.predict(X_test_init), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['et'][val_idx])):.5f}")

    # --- GRADIENT BOOSTING ---
    print("GradientBoosting...", end=" ", flush=True)
    gbr_model = GradientBoostingRegressor(**best_params_dict['gradientboosting'])
    gbr_model.fit(X_tr_full, y_tr_full)
    oof_predictions['gbr'][val_idx] = np.clip(gbr_model.predict(X_val), 0, 100)
    test_predictions['gbr'][:, fold-1] = np.clip(gbr_model.predict(X_test_init), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['gbr'][val_idx])):.5f}")

    # --- SVR (with subsampling for speed) ---
    print("SVR...", end=" ", flush=True)
    sample_size = min(100000, len(X_tr_full))
    if len(X_tr_full) > sample_size:
        idx = np.random.choice(len(X_tr_full), sample_size, replace=False)
        X_tr_sample = X_tr_full.iloc[idx]
        y_tr_sample = y_tr_full[idx]
    else:
        X_tr_sample = X_tr_full
        y_tr_sample = y_tr_full

    scaler_svr = StandardScaler()
    X_tr_svr = scaler_svr.fit_transform(X_tr_sample)
    X_val_svr = scaler_svr.transform(X_val)
    X_test_svr = scaler_svr.transform(X_test_init)

    svr_model = SVR(**best_params_dict['svr'], kernel='rbf')
    svr_model.fit(X_tr_svr, y_tr_sample)
    oof_predictions['svr'][val_idx] = np.clip(svr_model.predict(X_val_svr), 0, 100)
    test_predictions['svr'][:, fold-1] = np.clip(svr_model.predict(X_test_svr), 0, 100)
    print(f"RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions['svr'][val_idx])):.5f}")

# Level 1 OOF scores
print(f"\n{'='*80}")
print("LEVEL 1 OOF SCORES")
print(f"{'='*80}")
for model_name, oof in oof_predictions.items():
    rmse = np.sqrt(mean_squared_error(y, oof))
    print(f"{model_name.upper():15s}: {rmse:.6f}")


LEVEL 1: BASE MODELS TRAINING (8 MODELS)

Dataset info:
  Train (synthetic): 500 samples × 17 features
  Original data:     200 samples × 17 features
  Test:              100 samples × 17 features


Training Folds:   0%|          | 0/10 [00:00<?, ?it/s]


FOLD 1/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 8.58951
ElasticNet... RMSE: 8.54865
XGBoost... RMSE: 8.69551 (iter: 146)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[171]	valid_0's rmse: 8.86981
RMSE: 8.86981 (iter: 171)
CatBoost... RMSE: 8.28395 (iter: 201)
ExtraTrees... RMSE: 8.29571
GradientBoosting... RMSE: 8.51005
SVR... 

Training Folds:  10%|█         | 1/10 [00:02<00:20,  2.33s/it]

RMSE: 8.52794

FOLD 2/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 9.13556
ElasticNet... RMSE: 9.08940
XGBoost... RMSE: 10.05524 (iter: 171)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[166]	valid_0's rmse: 9.8636
RMSE: 9.86360 (iter: 166)
CatBoost... RMSE: 9.42505 (iter: 142)
ExtraTrees... RMSE: 9.84746
GradientBoosting... RMSE: 10.08082
SVR... 

Training Folds:  20%|██        | 2/10 [00:04<00:19,  2.38s/it]

RMSE: 9.40667

FOLD 3/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 8.99441
ElasticNet... RMSE: 8.99422
XGBoost... RMSE: 9.77154 (iter: 124)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[156]	valid_0's rmse: 9.47008
RMSE: 9.47008 (iter: 156)
CatBoost... RMSE: 9.03379 (iter: 190)
ExtraTrees... RMSE: 9.48427
GradientBoosting... RMSE: 9.36207
SVR... 

Training Folds:  30%|███       | 3/10 [00:07<00:16,  2.37s/it]

RMSE: 9.71736

FOLD 4/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 9.06966
ElasticNet... RMSE: 9.03164
XGBoost... RMSE: 9.56348 (iter: 197)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[239]	valid_0's rmse: 9.36093
RMSE: 9.36093 (iter: 239)
CatBoost... RMSE: 9.00241 (iter: 144)
ExtraTrees... RMSE: 9.32460
GradientBoosting... RMSE: 9.46273
SVR... 

Training Folds:  40%|████      | 4/10 [00:09<00:14,  2.36s/it]

RMSE: 9.84722

FOLD 5/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 8.93847
ElasticNet... RMSE: 8.98362
XGBoost... RMSE: 8.88608 (iter: 156)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[220]	valid_0's rmse: 8.90413
RMSE: 8.90413 (iter: 220)
CatBoost... RMSE: 8.80888 (iter: 264)
ExtraTrees... RMSE: 9.27934
GradientBoosting... RMSE: 8.49708
SVR... 

Training Folds:  50%|█████     | 5/10 [00:11<00:11,  2.35s/it]

RMSE: 8.39061

FOLD 6/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 7.77975
ElasticNet... RMSE: 7.86347
XGBoost... RMSE: 8.35736 (iter: 177)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[276]	valid_0's rmse: 7.965
RMSE: 7.96500 (iter: 276)
CatBoost... RMSE: 8.27575 (iter: 172)
ExtraTrees... RMSE: 8.85814
GradientBoosting... RMSE: 8.46885
SVR... 

Training Folds:  60%|██████    | 6/10 [00:14<00:09,  2.48s/it]

RMSE: 8.10540

FOLD 7/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 9.94333
ElasticNet... RMSE: 9.89971
XGBoost... RMSE: 9.65715 (iter: 124)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[124]	valid_0's rmse: 9.30108
RMSE: 9.30108 (iter: 124)
CatBoost... RMSE: 9.61642 (iter: 213)
ExtraTrees... RMSE: 9.84408
GradientBoosting... RMSE: 9.76726
SVR... 

Training Folds:  70%|███████   | 7/10 [00:17<00:08,  2.78s/it]

RMSE: 10.51907

FOLD 8/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 9.26583
ElasticNet... RMSE: 9.25004
XGBoost... RMSE: 9.63488 (iter: 129)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[178]	valid_0's rmse: 9.64325
RMSE: 9.64325 (iter: 178)
CatBoost... RMSE: 9.44695 (iter: 131)
ExtraTrees... RMSE: 9.25356
GradientBoosting... RMSE: 9.75777
SVR... 

Training Folds:  80%|████████  | 8/10 [00:20<00:05,  2.68s/it]

RMSE: 10.04189

FOLD 9/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 9.87988
ElasticNet... RMSE: 9.88644
XGBoost... RMSE: 10.47581 (iter: 250)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[284]	valid_0's rmse: 10.0702
RMSE: 10.07015 (iter: 284)
CatBoost... RMSE: 9.91978 (iter: 192)
ExtraTrees... RMSE: 10.72676
GradientBoosting... RMSE: 9.82333
SVR... 

Training Folds:  90%|█████████ | 9/10 [00:23<00:02,  2.77s/it]

RMSE: 9.91247

FOLD 10/10
Training size: 650 (450 synthetic + 200 original)
Ridge... RMSE: 8.24177
ElasticNet... RMSE: 8.23803
XGBoost... RMSE: 8.87170 (iter: 263)
LightGBM... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[336]	valid_0's rmse: 8.77091
RMSE: 8.77091 (iter: 336)
CatBoost... RMSE: 8.72947 (iter: 249)
ExtraTrees... RMSE: 9.38574
GradientBoosting... RMSE: 8.90065
SVR... 

Training Folds: 100%|██████████| 10/10 [00:25<00:00,  2.59s/it]

RMSE: 8.66836

LEVEL 1 OOF SCORES
RIDGE          : 9.006004
ELASTIC        : 8.999318
XGB            : 9.418131
LGB            : 9.240334
CAT            : 9.069280
ET             : 9.449621
GBR            : 9.281649
SVR            : 9.346624





In [10]:
# =============================================================================
# 8. FULL REFIT ON ALL DATA
# =============================================================================
print(f"\n{'='*80}")
print("FULL REFIT: TRAINING ON 100% OF DATA")
print(f"{'='*80}")

avg_best_iter_xgb = int(np.mean(best_iterations['xgb']))
avg_best_iter_lgb = int(np.mean(best_iterations['lgb']))
avg_best_iter_cat = int(np.mean(best_iterations['cat']))

print(f"\nAverage best iterations from CV:")
print(f"  XGBoost:  {avg_best_iter_xgb}")
print(f"  LightGBM: {avg_best_iter_lgb}")
print(f"  CatBoost: {avg_best_iter_cat}")

# ✅ Prepare full dataset (già target-encodato)
X_full = pd.concat([X_train_init, X_orig_init], axis=0, ignore_index=True)
y_full = np.concatenate([y, y_orig])

print(f"\nFull training set size: {len(X_full):,} samples")

# Storage for full model predictions
test_pred_full = {}

# Storage for full models
full_models = {}

print("\nTraining final models on 100% data...")

# Ridge
print("  Ridge...", end=" ", flush=True)
scaler_full = RobustScaler()
X_full_scaled = scaler_full.fit_transform(X_full)
X_test_full_scaled = scaler_full.transform(X_test_init)

ridge_full = Ridge(**best_params_dict['ridge'], random_state=42)
ridge_full.fit(X_full_scaled, y_full)
test_pred_full['ridge'] = np.clip(ridge_full.predict(X_test_full_scaled), 0, 100)
full_models['ridge'] = {'model': ridge_full, 'scaler': scaler_full}
print("✓")

# ElasticNet
print("  ElasticNet...", end=" ", flush=True)
elastic_full = ElasticNet(**best_params_dict['elasticnet'], max_iter=5000, random_state=42)
elastic_full.fit(X_full_scaled, y_full)
test_pred_full['elastic'] = np.clip(elastic_full.predict(X_test_full_scaled), 0, 100)
full_models['elastic'] = {'model': elastic_full, 'scaler': scaler_full}
print("✓")

# XGBoost
print("  XGBoost...", end=" ", flush=True)
dtrain_full = xgb.DMatrix(X_full, label=y_full)
dtest_full = xgb.DMatrix(X_test_init)
xgb_full = xgb.train(best_params_dict['xgboost'], dtrain_full, num_boost_round=avg_best_iter_xgb, verbose_eval=False)
test_pred_full['xgb'] = np.clip(xgb_full.predict(dtest_full), 0, 100)
full_models['xgb'] = xgb_full
print("✓")

# LightGBM
print("  LightGBM...", end=" ", flush=True)
train_full_lgb = lgb.Dataset(X_full, label=y_full)
lgb_full = lgb.train(best_params_dict['lightgbm'], train_full_lgb, num_boost_round=avg_best_iter_lgb, callbacks=[lgb.log_evaluation(0)])
test_pred_full['lgb'] = np.clip(lgb_full.predict(X_test_init), 0, 100)
full_models['lgb'] = lgb_full
print("✓")

# CatBoost
print("  CatBoost...", end=" ", flush=True)
cat_params_full = best_params_dict['catboost'].copy()
cat_params_full['iterations'] = avg_best_iter_cat
cat_full = CatBoostRegressor(**cat_params_full)
cat_full.fit(X_full, y_full)
test_pred_full['cat'] = np.clip(cat_full.predict(X_test_init), 0, 100)
full_models['cat'] = cat_full
print("✓")

# ExtraTrees
print("  ExtraTrees...", end=" ", flush=True)
et_full = ExtraTreesRegressor(**best_params_dict['extratrees'])
et_full.fit(X_full, y_full)
test_pred_full['et'] = np.clip(et_full.predict(X_test_init), 0, 100)
full_models['et'] = et_full
print("✓")

# GradientBoosting
print("  GradientBoosting...", end=" ", flush=True)
gbr_full = GradientBoostingRegressor(**best_params_dict['gradientboosting'])
gbr_full.fit(X_full, y_full)
test_pred_full['gbr'] = np.clip(gbr_full.predict(X_test_init), 0, 100)
full_models['gbr'] = gbr_full
print("✓")

# SVR
print("  SVR...", end=" ", flush=True)
sample_size = min(100000, len(X_full))
if len(X_full) > sample_size:
    idx = np.random.choice(len(X_full), sample_size, replace=False)
    X_full_sample = X_full.iloc[idx]
    y_full_sample = y_full[idx]
else:
    X_full_sample = X_full
    y_full_sample = y_full

scaler_svr_full = StandardScaler()
X_full_svr = scaler_svr_full.fit_transform(X_full_sample)
X_test_svr = scaler_svr_full.transform(X_test_init)

svr_full = SVR(**best_params_dict['svr'], kernel='rbf')
svr_full.fit(X_full_svr, y_full_sample)
test_pred_full['svr'] = np.clip(svr_full.predict(X_test_svr), 0, 100)
full_models['svr'] = {'model': svr_full, 'scaler': scaler_svr_full}
print("✓")

print("\n✓ All models trained on full data!")

# =============================================================================
# 💾 SAVE FULL MODELS
# =============================================================================
print(f"\n{'='*80}")
print("SAVING FULL MODELS")
print(f"{'='*80}")

import os
import joblib

# Create directory for models
os.makedirs('../models', exist_ok=True)

print("\nSaving models...")
for model_name, model_obj in full_models.items():
    filepath = f'../models/{model_name}_full.pkl'
    joblib.dump(model_obj, filepath)
    print(f"  ✓ {model_name:15s} → {filepath}")

print("\n✓ All models saved successfully in '../models/' directory!")


FULL REFIT: TRAINING ON 100% OF DATA

Average best iterations from CV:
  XGBoost:  173
  LightGBM: 215
  CatBoost: 189

Full training set size: 700 samples

Training final models on 100% data...
  Ridge... ✓
  ElasticNet... ✓
  XGBoost... ✓
  LightGBM... ✓
  CatBoost... ✓
  ExtraTrees... ✓
  GradientBoosting... ✓
  SVR... ✓

✓ All models trained on full data!

SAVING FULL MODELS

Saving models...
  ✓ ridge           → ../models/ridge_full.pkl
  ✓ elastic         → ../models/elastic_full.pkl
  ✓ xgb             → ../models/xgb_full.pkl
  ✓ lgb             → ../models/lgb_full.pkl
  ✓ cat             → ../models/cat_full.pkl
  ✓ et              → ../models/et_full.pkl
  ✓ gbr             → ../models/gbr_full.pkl
  ✓ svr             → ../models/svr_full.pkl

✓ All models saved successfully in '../models/' directory!


In [11]:
# =============================================================================
# 9. LEVEL 2: META-LEARNER STACKING
# =============================================================================
print(f"\n{'='*80}")
print("LEVEL 2: META-LEARNER STACKING")
print(f"{'='*80}")

# Create meta-features
meta_train = np.column_stack(list(oof_predictions.values()))
meta_test = np.column_stack(list(test_pred_full.values()))

# Add statistical features
meta_train_enhanced = np.column_stack([
    meta_train,
    meta_train.mean(axis=1),
    meta_train.std(axis=1),
    meta_train.max(axis=1),
    meta_train.min(axis=1)
])

meta_test_enhanced = np.column_stack([
    meta_test,
    meta_test.mean(axis=1),
    meta_test.std(axis=1),
    meta_test.max(axis=1),
    meta_test.min(axis=1)
])

print(f"Meta-features shape: {meta_train_enhanced.shape}")

# Train meta-learners
oof_meta_ridge = np.zeros(len(y))
oof_meta_lgb = np.zeros(len(y))
oof_meta_xgb = np.zeros(len(y))

test_meta_ridge = np.zeros((len(test_fe), N_FOLDS))
test_meta_lgb = np.zeros((len(test_fe), N_FOLDS))
test_meta_xgb = np.zeros((len(test_fe), N_FOLDS))

# Storage for meta-learner models
meta_models = {
    'ridge': [],
    'lgb': [],
    'xgb': [],
    'scalers': []
}

for fold, (train_idx, val_idx) in enumerate(skf.split(meta_train_enhanced, y_bins), 1):
    print(f"Meta-learner Fold {fold}/{N_FOLDS}...", end=" ")

    X_tr_meta, X_val_meta = meta_train_enhanced[train_idx], meta_train_enhanced[val_idx]
    y_tr_meta, y_val_meta = y[train_idx], y[val_idx]

    # Ridge meta
    scaler_meta = StandardScaler()
    X_tr_meta_scaled = scaler_meta.fit_transform(X_tr_meta)
    X_val_meta_scaled = scaler_meta.transform(X_val_meta)
    X_test_meta_scaled = scaler_meta.transform(meta_test_enhanced)

    ridge_meta = Ridge(alpha=1.0, random_state=42)
    ridge_meta.fit(X_tr_meta_scaled, y_tr_meta)
    oof_meta_ridge[val_idx] = np.clip(ridge_meta.predict(X_val_meta_scaled), 0, 100)
    test_meta_ridge[:, fold-1] = np.clip(ridge_meta.predict(X_test_meta_scaled), 0, 100)
    
    # Save Ridge meta model and scaler
    meta_models['ridge'].append(ridge_meta)
    meta_models['scalers'].append(scaler_meta)

    # LightGBM meta
    meta_lgb_params = {
        'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.01,
        'num_leaves': 15, 'max_depth': 4, 'feature_fraction': 0.8,
        'bagging_fraction': 0.8, 'bagging_freq': 5, 'lambda_l1': 1.0,
        'lambda_l2': 1.0, 'verbosity': -1, 'seed': 42
    }

    train_meta_data = lgb.Dataset(X_tr_meta, label=y_tr_meta)
    val_meta_data = lgb.Dataset(X_val_meta, label=y_val_meta, reference=train_meta_data)

    lgb_meta = lgb.train(
        meta_lgb_params, train_meta_data, num_boost_round=1000,
        valid_sets=[val_meta_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    oof_meta_lgb[val_idx] = np.clip(lgb_meta.predict(X_val_meta), 0, 100)
    test_meta_lgb[:, fold-1] = np.clip(lgb_meta.predict(meta_test_enhanced), 0, 100)
    
    # Save LightGBM meta model
    meta_models['lgb'].append(lgb_meta)

    # XGBoost meta
    dtrain_meta = xgb.DMatrix(X_tr_meta, label=y_tr_meta)
    dval_meta = xgb.DMatrix(X_val_meta, label=y_val_meta)
    dtest_meta = xgb.DMatrix(meta_test_enhanced)

    meta_xgb_params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse',
        'learning_rate': 0.01, 'max_depth': 3, 'subsample': 0.8,
        'colsample_bytree': 0.8, 'lambda': 2.0, 'alpha': 1.0, 'seed': 42
    }

    xgb_meta = xgb.train(
        meta_xgb_params, dtrain_meta, num_boost_round=1000,
        evals=[(dval_meta, 'eval')],
        early_stopping_rounds=50, verbose_eval=False
    )
    oof_meta_xgb[val_idx] = np.clip(xgb_meta.predict(dval_meta), 0, 100)
    test_meta_xgb[:, fold-1] = np.clip(xgb_meta.predict(dtest_meta), 0, 100)
    
    # Save XGBoost meta model
    meta_models['xgb'].append(xgb_meta)

    print(f"✓")

print(f"\n{'='*80}")
print("LEVEL 2 META-LEARNER OOF SCORES")
print(f"{'='*80}")
print(f"Ridge Meta:    {np.sqrt(mean_squared_error(y, oof_meta_ridge)):.6f}")
print(f"LightGBM Meta: {np.sqrt(mean_squared_error(y, oof_meta_lgb)):.6f}")
print(f"XGBoost Meta:  {np.sqrt(mean_squared_error(y, oof_meta_xgb)):.6f}")

# =============================================================================
# 💾 SAVE META-LEARNER MODELS
# =============================================================================
print(f"\n{'='*80}")
print("SAVING META-LEARNER MODELS")
print(f"{'='*80}")

os.makedirs('../models', exist_ok=True)

# Save all meta-learner fold models
joblib.dump(meta_models, '../models/meta_models.pkl')
print(f"  ✓ All meta-learner models saved → models/meta_models.pkl")

# Save meta-learner parameters for reference
meta_info = {
    'ridge_params': {'alpha': 1.0, 'random_state': 42},
    'lgb_params': meta_lgb_params,
    'xgb_params': meta_xgb_params,
    'n_folds': N_FOLDS,
    'meta_feature_names': list(oof_predictions.keys()) + ['mean', 'std', 'max', 'min']
}
joblib.dump(meta_info, '../models/meta_info.pkl')
print(f"  ✓ Meta-learner info saved → models/meta_info.pkl")

print("\n✓ All meta-learner models saved successfully!")


LEVEL 2: META-LEARNER STACKING
Meta-features shape: (500, 12)
Meta-learner Fold 1/10... Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[315]	valid_0's rmse: 8.55687
✓
Meta-learner Fold 2/10... Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[302]	valid_0's rmse: 9.13258
✓
Meta-learner Fold 3/10... Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[285]	valid_0's rmse: 9.11603
✓
Meta-learner Fold 4/10... Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[460]	valid_0's rmse: 9.82911
✓
Meta-learner Fold 5/10... Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[384]	valid_0's rmse: 9.00089
✓
Meta-learner Fold 6/10... Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[335]	valid_0's rmse: 8.10021
✓
Meta-learner 

In [12]:
# =============================================================================
# 10. OPTIMAL ENSEMBLE
# =============================================================================
print(f"\n{'='*80}")
print("OPTIMIZING FINAL ENSEMBLE")
print(f"{'='*80}")

def ensemble_rmse_final(weights, *args):
    oof_preds, y_true = args
    weights = np.abs(weights) / np.sum(np.abs(weights))
    ensemble = sum(w * pred for w, pred in zip(weights, oof_preds))
    ensemble = np.clip(ensemble, 0, 100)
    return np.sqrt(mean_squared_error(y_true, ensemble))

# Combine all predictions
all_oof_preds = list(oof_predictions.values()) + [
    oof_meta_ridge, oof_meta_lgb, oof_meta_xgb
]

all_test_preds = list(test_pred_full.values()) + [
    test_meta_ridge.mean(axis=1),
    test_meta_lgb.mean(axis=1),
    test_meta_xgb.mean(axis=1)
]

# Optimize weights
initial_weights = np.ones(len(all_oof_preds)) / len(all_oof_preds)

result = minimize(
    ensemble_rmse_final, initial_weights, args=(all_oof_preds, y),
    method='Nelder-Mead',
    options={'maxiter': 2000, 'xatol': 1e-8, 'fatol': 1e-8}
)

optimal_weights = np.abs(result.x) / np.sum(np.abs(result.x))

print("\nOptimal weights:")
model_names = list(oof_predictions.keys()) + ['ridge_meta', 'lgb_meta', 'xgb_meta']
for name, weight in zip(model_names, optimal_weights):
    if weight > 0.01:
        print(f"  {name:15s}: {weight:.4f}")

# Create final ensemble
final_oof = sum(w * pred for w, pred in zip(optimal_weights, all_oof_preds))
final_oof = np.clip(final_oof, 0, 100)

final_test = sum(w * pred for w, pred in zip(optimal_weights, all_test_preds))
final_test = np.clip(final_test, 0, 100)

final_oof_rmse = np.sqrt(mean_squared_error(y, final_oof))

print(f"\n{'='*80}")
print("FINAL ENSEMBLE PERFORMANCE")
print(f"{'='*80}")
print(f"Final OOF RMSE: {final_oof_rmse:.6f}")

best_single = min([np.sqrt(mean_squared_error(y, pred)) for pred in all_oof_preds])
print(f"Best single model: {best_single:.6f}")
print(f"Improvement: {best_single - final_oof_rmse:.6f}")

# =============================================================================
# 💾 SAVE ENSEMBLE CONFIGURATION
# =============================================================================
print(f"\n{'='*80}")
print("SAVING ENSEMBLE CONFIGURATION")
print(f"{'='*80}")

os.makedirs('../models', exist_ok=True)

# Prepare ensemble configuration
ensemble_config = {
    'weights': optimal_weights,
    'model_names': model_names,
    'weights_dict': {name: weight for name, weight in zip(model_names, optimal_weights)},
    'performance': {
        'final_oof_rmse': final_oof_rmse,
        'best_single_model_rmse': best_single,
        'improvement': best_single - final_oof_rmse
    },
    'optimization_result': {
        'success': result.success,
        'message': result.message,
        'n_iterations': result.nit if hasattr(result, 'nit') else None,
        'final_value': result.fun
    },
    'model_order': {
        'base_models': list(oof_predictions.keys()),
        'meta_models': ['ridge_meta', 'lgb_meta', 'xgb_meta']
    }
}

# Save ensemble configuration
joblib.dump(ensemble_config, '../models/ensemble_config.pkl')
print(f"  ✓ Ensemble configuration saved → models/ensemble_config.pkl")

# Save final predictions for validation
ensemble_predictions = {
    'oof_predictions': final_oof,
    'test_predictions': final_test,
    'y_true': y
}
joblib.dump(ensemble_predictions, '../models/ensemble_predictions.pkl')
print(f"  ✓ Ensemble predictions saved → models/ensemble_predictions.pkl")

# Create a summary report
summary = {
    'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'n_base_models': len(oof_predictions),
    'n_meta_models': 3,
    'total_models': len(model_names),
    'final_oof_rmse': final_oof_rmse,
    'best_single_rmse': best_single,
    'improvement': best_single - final_oof_rmse,
    'top_5_models': sorted(
        [(name, weight) for name, weight in zip(model_names, optimal_weights)],
        key=lambda x: x[1],
        reverse=True
    )[:5]
}
joblib.dump(summary, '../models/ensemble_summary.pkl')
print(f"  ✓ Ensemble summary saved → models/ensemble_summary.pkl")

print("\n✓ All ensemble configurations saved successfully!")

# Print summary of what was saved
print(f"\n{'='*80}")
print("SAVED FILES SUMMARY")
print(f"{'='*80}")
print("\nBase models (8 models):")
for name in oof_predictions.keys():
    print(f"  ✓ models/{name}_full.pkl")

print("\nMeta-learner models:")
print(f"  ✓ models/meta_models.pkl (Ridge, LightGBM, XGBoost × {N_FOLDS} folds)")
print(f"  ✓ models/meta_info.pkl")

print("\nEnsemble configuration:")
print(f"  ✓ models/ensemble_config.pkl")
print(f"  ✓ models/ensemble_predictions.pkl")
print(f"  ✓ models/ensemble_summary.pkl")

print("\nParameters and metadata:")
print(f"  ✓ models/best_params.pkl")
print(f"  ✓ models/best_iterations.pkl")

print(f"\n{'='*80}")
print("ALL MODELS AND CONFIGURATIONS SAVED!")
print(f"{'='*80}")


OPTIMIZING FINAL ENSEMBLE

Optimal weights:
  ridge          : 0.2407
  elastic        : 0.2249
  lgb            : 0.0340
  cat            : 0.3410
  svr            : 0.1125
  xgb_meta       : 0.0413

FINAL ENSEMBLE PERFORMANCE
Final OOF RMSE: 8.934629
Best single model: 8.999318
Improvement: 0.064689

SAVING ENSEMBLE CONFIGURATION
  ✓ Ensemble configuration saved → models/ensemble_config.pkl
  ✓ Ensemble predictions saved → models/ensemble_predictions.pkl
  ✓ Ensemble summary saved → models/ensemble_summary.pkl

✓ All ensemble configurations saved successfully!

SAVED FILES SUMMARY

Base models (8 models):
  ✓ models/ridge_full.pkl
  ✓ models/elastic_full.pkl
  ✓ models/xgb_full.pkl
  ✓ models/lgb_full.pkl
  ✓ models/cat_full.pkl
  ✓ models/et_full.pkl
  ✓ models/gbr_full.pkl
  ✓ models/svr_full.pkl

Meta-learner models:
  ✓ models/meta_models.pkl (Ridge, LightGBM, XGBoost × 10 folds)
  ✓ models/meta_info.pkl

Ensemble configuration:
  ✓ models/ensemble_config.pkl
  ✓ models/ensemble

In [13]:
# =============================================================================
# 11. POST-PROCESSING & CALIBRATION
# =============================================================================
print(f"\n{'='*80}")
print("POST-PROCESSING & CALIBRATION")
print(f"{'='*80}")

def calibrate_predictions(train_preds, train_true, test_preds):
    iso_reg = IsotonicRegression(out_of_bounds='clip')
    iso_reg.fit(train_preds, train_true)
    test_calibrated = iso_reg.predict(test_preds)
    return np.clip(test_calibrated, 0, 100)

# Apply calibration
final_test_calibrated = calibrate_predictions(final_oof, y, final_test)

# Check if calibration improves
final_oof_calibrated = calibrate_predictions(
    final_oof[::2], y[::2],
    final_oof[1::2]
)
calibrated_oof = final_oof.copy()
calibrated_oof[1::2] = final_oof_calibrated

calibrated_rmse = np.sqrt(mean_squared_error(y, calibrated_oof))
print(f"Calibrated OOF RMSE: {calibrated_rmse:.6f}")

if calibrated_rmse < final_oof_rmse:
    print("✓ Using calibrated predictions")
    final_submission = final_test_calibrated
    final_rmse = calibrated_rmse
else:
    print("✓ Using original predictions")
    final_submission = final_test
    final_rmse = final_oof_rmse


POST-PROCESSING & CALIBRATION
Calibrated OOF RMSE: 9.262065
✓ Using original predictions


In [14]:
# =============================================================================
# 12. FINAL STATISTICS & SUBMISSION
# =============================================================================
print(f"\n{'='*80}")
print("FINAL STATISTICS")
print(f"{'='*80}")

print(f"\nPrediction Statistics:")
print(f"  Train target - Mean: {y.mean():.2f}, Std: {y.std():.2f}, Min: {y.min():.2f}, Max: {y.max():.2f}")
print(f"  OOF preds    - Mean: {final_oof.mean():.2f}, Std: {final_oof.std():.2f}, Min: {final_oof.min():.2f}, Max: {final_oof.max():.2f}")
print(f"  Test preds   - Mean: {final_submission.mean():.2f}, Std: {final_submission.std():.2f}, Min: {final_submission.min():.2f}, Max: {final_submission.max():.2f}")

print(f"\n{'='*80}")
print("FINAL PERFORMANCE")
print(f"{'='*80}")
print(f"✓ Final OOF RMSE: {final_rmse:.6f}")
print(f"✓ Expected LB Score: ~{final_rmse:.4f} (±0.002)")
print(f"\n🎯 Target beaten: {final_rmse < 8.54414}")

# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'exam_score': final_submission
})

submission.to_csv('../results/submission.csv', index=False)
print(f"\n✓ Submission saved to submission.csv")

# Save OOF predictions
oof_df = pd.DataFrame({
    'id': train_df['id'],
    'exam_score': y,
    'prediction': final_oof
})
oof_df.to_csv('../results/oof_predictions.csv', index=False)
print("✓ OOF predictions saved to oof_predictions.csv")

# Save individual model OOF predictions for analysis
oof_detailed = pd.DataFrame({
    'id': train_df['id'],
    'exam_score': y
})
for model_name, oof in oof_predictions.items():
    oof_detailed[f'oof_{model_name}'] = oof

oof_detailed['oof_ridge_meta'] = oof_meta_ridge
oof_detailed['oof_lgb_meta'] = oof_meta_lgb
oof_detailed['oof_xgb_meta'] = oof_meta_xgb
oof_detailed['oof_final_ensemble'] = final_oof

oof_detailed.to_csv('../results/oof_predictions_detailed.csv', index=False)
print("✓ Detailed OOF predictions saved to oof_predictions_detailed.csv")

# Save model comparison
model_comparison = pd.DataFrame({
    'model': model_names,
    'oof_rmse': [np.sqrt(mean_squared_error(y, pred)) for pred in all_oof_preds],
    'ensemble_weight': optimal_weights
}).sort_values('oof_rmse')

model_comparison.to_csv('../results/model_comparison.csv', index=False)
print("✓ Model comparison saved to model_comparison.csv")
print(f"\nFinal RMSE: {final_rmse:.6f}")
print("\n" + "="*80)


FINAL STATISTICS

Prediction Statistics:
  Train target - Mean: 63.27, Std: 18.56, Min: 19.60, Max: 100.00
  OOF preds    - Mean: 63.36, Std: 16.19, Min: 23.69, Max: 97.39
  Test preds   - Mean: 64.61, Std: 16.92, Min: 32.12, Max: 95.89

FINAL PERFORMANCE
✓ Final OOF RMSE: 8.934629
✓ Expected LB Score: ~8.9346 (±0.002)

🎯 Target beaten: False

✓ Submission saved to submission.csv
✓ OOF predictions saved to oof_predictions.csv
✓ Detailed OOF predictions saved to oof_predictions_detailed.csv
✓ Model comparison saved to model_comparison.csv

Final RMSE: 8.934629

