In [11]:
# %%
import pandas as pd
import numpy as np

# Load train and test data
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
df_train.head()


Train shape: (18153, 21)
Test shape: (7780, 20)


Unnamed: 0,ID,ASI_category,Temperature,Precipitation,Rainfall,Snowfall,Soil_Temperature,Radiation,Wind_Speed,Wind_Gusts,...,Surface_Pressure,Relative_Humidity,Soil_Moisture,Dew_Point,Sunshine_Duration,Cloud_Cover,Precipitation_Hours,Wind_Direction,Weather_Code,Daylight_Duration
0,19554,Moderate,0.931231,0.000912,0.000912,0.0,0.757673,0.879671,0.179293,0.193029,...,0.538056,55,0.546243,17.564597,53252.08,12.136192,1,176.459082,51,58772.52
1,25205,Moderate,0.566323,0.096715,0.096715,0.0,0.291448,0.008913,0.588384,0.532172,...,0.568475,88,0.557803,5.692134,0.0,91.901341,16,232.433005,61,28143.12
2,771,Poor,0.018033,0.0,0.0,0.0,0.0,0.27734,0.247475,0.189008,...,0.70652,78,0.791908,-25.26442,30213.79,18.85967,0,44.6886,3,34621.43
3,1976,Good,0.717541,0.0,0.0,0.0,0.635669,0.796709,0.123737,0.134048,...,0.5475,57,0.473988,5.913865,44627.21,38.759757,0,333.640418,3,59192.17
4,14036,Moderate,0.82717,0.001825,0.001825,0.0,0.743855,0.781282,0.343434,0.391421,...,0.546378,50,0.459538,9.661455,45267.17,60.058955,1,86.996954,51,59956.03


In [12]:
# %%
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_train['ASI_category_encoded'] = le.fit_transform(df_train['ASI_category'])


In [13]:
import pandas as pd
import numpy as np

def feature_engineering(df):
    """
    Combined feature engineering function:
    - Includes both base interaction features and advanced meteorological features.
    - Ensures safe numerical operations and consistent encodings.
    """

    df = df.copy()
    df.columns = df.columns.str.replace(' ', '_').str.replace('.', '', regex=False)
    epsilon = 1e-6  # To avoid division by zero

    # --- BASE FEATURES (Your original ones) ---
    df['Temp_Range_Impact'] = df['Temperature'] * df['Soil_Temperature']
    df['Humidity_Moisture'] = df['Relative_Humidity'] * df['Soil_Moisture']
    df['Effective_Radiation'] = df['Radiation'] * (df['Sunshine_Duration'] / (df['Daylight_Duration'] + epsilon))
    df['Total_Precip'] = df['Rainfall'] + df['Snowfall'] + df['Precipitation']
    df['Wind_Intensity'] = df['Wind_Speed'] * df['Wind_Gusts']
    df['Pressure_Humidity_Interaction'] = df['Surface_Pressure'] * df['Relative_Humidity']

    # --- EXTENDED INTERACTIONS ---
    df['Temp_Diff_Air_Soil'] = df['Temperature'] - df['Soil_Temperature']
    df['Temp_Mean'] = (df['Temperature'] + df['Soil_Temperature']) / 2
    df['Temp_Humidity_Index'] = df['Temperature'] * df['Relative_Humidity']
    df['Radiation_Per_Hour'] = df['Radiation'] / (df['Sunshine_Duration'] + epsilon)
    df['Sunshine_Ratio'] = df['Sunshine_Duration'] / (df['Daylight_Duration'] + epsilon)
    df['Wind_Stress'] = df['Wind_Speed'] ** 2
    df['Wind_Ratio'] = df['Wind_Gusts'] / (df['Wind_Speed'] + epsilon)
    df['Humidity_to_Pressure'] = df['Relative_Humidity'] / (df['Surface_Pressure'] + epsilon)
    df['Radiation_to_Temp'] = df['Radiation'] / (df['Temperature'] + epsilon)

    # --- ADVANCED METEOROLOGICAL FEATURES ---
    df['Temperature_Delta'] = df['Temperature'] - df['Dew_Point']
    df['Soil_Temp_Air_Temp_Ratio'] = np.where(
        df['Temperature'].abs() > epsilon,
        df['Soil_Temperature'] / df['Temperature'],
        0
    )
    df['Rainfall_Intensity'] = df['Rainfall'] / (df['Precipitation_Hours'] + epsilon)
    df['Pressure_MSL_Difference'] = df['Pressure_MSL'] - df['Surface_Pressure']
    df['Wind_Gust_Ratio'] = np.where(
        df['Wind_Speed'].abs() > epsilon,
        df['Wind_Gusts'] / df['Wind_Speed'],
        1.0
    )
    df['Radiation_Efficiency'] = np.where(
        df['Radiation'].abs() > epsilon,
        df['Sunshine_Duration'] / df['Radiation'],
        0
    )
    df['Precipitation_Type_Snow'] = np.where(df['Snowfall'] > 0, 1, 0)
    df['Precipitation_Ratio_Rain'] = np.where(
        df['Precipitation'].abs() > epsilon,
        df['Rainfall'] / df['Precipitation'],
        0
    )
    df['Soil_Moisture_RH_Interaction'] = df['Soil_Moisture'] * df['Relative_Humidity']
    df['Cloud_Rain_Interaction'] = df['Cloud_Cover'] * df['Rainfall']

    # --- Stability and Convection ---
    df['Temperature_Inversion_Delta'] = np.abs(df['Soil_Temperature'] - df['Temperature'])
    df['Humidity_Saturated_Index'] = df['Relative_Humidity'] * np.where(
        df['Temperature'].abs() > epsilon,
        df['Dew_Point'] / df['Temperature'],
        0
    )

    # --- Wind Shear / Turbulence ---
    df['Wind_Force_Index'] = df['Wind_Speed'] ** 2
    df['Gust_Delta_Ratio'] = np.where(
        df['Wind_Speed'].abs() > epsilon,
        (df['Wind_Gusts'] - df['Wind_Speed']) / df['Wind_Speed'],
        0
    )

    # --- Daylight & Cloud ---
    df['Daylight_Efficiency'] = np.where(
        df['Daylight_Duration'].abs() > epsilon,
        df['Sunshine_Duration'] / df['Daylight_Duration'],
        0
    )
    df['Cloud_Cover_Inverse'] = 100 - df['Cloud_Cover']

    # --- Cyclic Wind Direction ---
    if 'Wind_Direction' in df.columns:
        wind_rad = np.deg2rad(df['Wind_Direction'])
        df['Wind_Direction_Sin'] = np.sin(wind_rad)
        df['Wind_Direction_Cos'] = np.cos(wind_rad)

    # --- Pressure Trend by ID (conceptual time component) ---
    if 'ID' in df.columns:
        df = df.sort_values(by=['ID'])
        df['Pressure_MSL_Trend'] = df.groupby('ID')['Pressure_MSL'].diff().fillna(0)

    # --- Clean NaNs only for numeric columns ---
    df = df.replace([np.inf, -np.inf], np.nan)
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # --- Drop duplicate columns ---
    df = df.loc[:, ~df.columns.duplicated()]

    return df

df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)


In [14]:
# %%
from sklearn.preprocessing import StandardScaler

num_cols = df_train.select_dtypes(include=[np.number]).columns.drop('ASI_category_encoded')

scaler = StandardScaler()
df_train[num_cols] = scaler.fit_transform(df_train[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])


In [15]:
# %%
from xgboost import XGBClassifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- Define features and target ---
X = df_train.drop(['ASI_category', 'ASI_category_encoded'], axis=1)
y = df_train['ASI_category_encoded']

# --- Train XGBoost model for feature importance ---
xgb_model = XGBClassifier(
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=9,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    n_jobs=-1
)

xgb_model.fit(X, y)

# --- Extract feature importances ---
importances = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

# --- Display Top Features ---
print("Most Important Features:")
print(feature_importance_df)

# # --- Plot Feature Importance ---
# plt.figure(figsize=(10, 8))
# plt.barh(feature_importance_df["Feature"][:20][::-1], feature_importance_df["Importance"][:20][::-1])
# plt.xlabel("Importance Score")
# plt.ylabel("Feature")
# plt.title("Top 20 Feature Importances (XGBoost)")
# plt.tight_layout()
# plt.show()


Most Important Features:
                          Feature  Importance
27                      Temp_Mean    0.222537
20              Temp_Range_Impact    0.101560
22            Effective_Radiation    0.093026
1                     Temperature    0.054316
34              Radiation_to_Temp    0.047103
6                       Radiation    0.042572
14              Sunshine_Duration    0.032083
5                Soil_Temperature    0.020690
13                      Dew_Point    0.015868
46       Humidity_Saturated_Index    0.015154
35              Temperature_Delta    0.015104
19              Daylight_Duration    0.014875
26             Temp_Diff_Air_Soil    0.014764
30                 Sunshine_Ratio    0.013029
49            Daylight_Efficiency    0.012452
41        Precipitation_Type_Snow    0.011208
45    Temperature_Inversion_Delta    0.010265
36       Soil_Temp_Air_Temp_Ratio    0.009613
2                   Precipitation    0.009514
43   Soil_Moisture_RH_Interaction    0.009478
12       

In [17]:
# --- Compute cumulative importance ---
feature_importance_df = feature_importance_df.reset_index(drop=True)
feature_importance_df["Cumulative_Importance"] = feature_importance_df["Importance"].cumsum()

# --- Select features covering 97% of importance ---
selected_features = feature_importance_df[feature_importance_df["Cumulative_Importance"] <= 0.97]["Feature"].tolist()

# --- Identify dropped features ---
dropped_features = feature_importance_df[feature_importance_df["Cumulative_Importance"] > 0.97]["Feature"].tolist()

print(f"‚úÖ Selected {len(selected_features)} features (covering 97% importance):")
print(selected_features)

print(f"\nüóëÔ∏è Dropped {len(dropped_features)} less important features:")
print(dropped_features)


‚úÖ Selected 47 features (covering 97% importance):
['Temp_Mean', 'Temp_Range_Impact', 'Effective_Radiation', 'Temperature', 'Radiation_to_Temp', 'Radiation', 'Sunshine_Duration', 'Soil_Temperature', 'Dew_Point', 'Humidity_Saturated_Index', 'Temperature_Delta', 'Daylight_Duration', 'Temp_Diff_Air_Soil', 'Sunshine_Ratio', 'Daylight_Efficiency', 'Precipitation_Type_Snow', 'Temperature_Inversion_Delta', 'Soil_Temp_Air_Temp_Ratio', 'Precipitation', 'Soil_Moisture_RH_Interaction', 'Soil_Moisture', 'Radiation_Efficiency', 'Cloud_Rain_Interaction', 'Humidity_Moisture', 'Rainfall', 'Relative_Humidity', 'Radiation_Per_Hour', 'Snowfall', 'Temp_Humidity_Index', 'Rainfall_Intensity', 'Precipitation_Ratio_Rain', 'Precipitation_Hours', 'Cloud_Cover_Inverse', 'Pressure_Humidity_Interaction', 'Pressure_MSL', 'Wind_Speed', 'Humidity_to_Pressure', 'Cloud_Cover', 'Weather_Code', 'ID', 'Wind_Gust_Ratio', 'Surface_Pressure', 'Wind_Gusts', 'Wind_Ratio', 'Wind_Direction_Cos', 'Wind_Direction_Sin', 'Wind_Forc

In [32]:
from sklearn.model_selection import train_test_split

# --- Use selected features from the previous step ---
X = df_train[selected_features]
y = df_train["ASI_category_encoded"]

# --- Split the data ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.1,          # 20% for validation
    stratify=y,             # preserve class balance
    random_state=42
)

print("‚úÖ Data Split Complete:")
print(f"Training Shape   : {X_train.shape}")
print(f"Validation Shape : {X_val.shape}")
print(f"Target Distribution (Train):")
print(y_train.value_counts(normalize=True))


‚úÖ Data Split Complete:
Training Shape   : (16337, 47)
Validation Shape : (1816, 47)
Target Distribution (Train):
ASI_category_encoded
1    0.701108
0    0.172859
2    0.126033
Name: proportion, dtype: float64


In [37]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report

# --- Define model ---
xgb_es = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.75,
    colsample_bytree=0.7,
    gamma=1.0,
    min_child_weight=4,
    reg_alpha=1.0,
    reg_lambda=3.0,
    random_state=42,
    n_jobs=-1,
    eval_metric="mlogloss",
    use_label_encoder=False
)

# --- Train with early stopping (new syntax for XGBoost ‚â•2.0) ---
xgb_es.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=100,
    early_stopping_rounds=50  # ‚úÖ still works for backward compatibility
)

# --- Best iteration ---
best_iter = getattr(xgb_es, "best_iteration", None)
print(f"\n‚úÖ Best iteration (n_estimators used): {best_iter}")

# --- Evaluate ---
train_preds = xgb_es.predict(X_train)
val_preds = xgb_es.predict(X_val)
train_probs = xgb_es.predict_proba(X_train)
val_probs = xgb_es.predict_proba(X_val)

train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, val_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')
val_f1 = f1_score(y_val, val_preds, average='macro')
train_logloss = log_loss(y_train, train_probs)
val_logloss = log_loss(y_val, val_probs)

print("\n‚úÖ XGBoost Model Performance (with Early Stopping):")
print(f"Training Accuracy : {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Training F1 Score  : {train_f1:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")
print(f"Training Log Loss  : {train_logloss:.4f}")
print(f"Validation Log Loss: {val_logloss:.4f}")
print(f"Œî F1 Gap           : {abs(train_f1 - val_f1):.4f}")
print(f"Œî LogLoss Gap      : {abs(train_logloss - val_logloss):.4f}")

print("\nClassification Report (Validation Set):")
print(classification_report(y_val, val_preds))


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [23]:
# %% [Stacking Ensemble Model - Enhanced]

from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report

# --- Define Base Models (using tuned params if available) ---
base_models = [
    ("xgb", XGBClassifier(
        n_estimators=109,
        learning_rate=0.0504,
        max_depth=4,
        subsample=0.7587,
        colsample_bytree=0.8293,
        gamma=0.4669,
        min_child_weight=5,
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1
    )),
    ("lgbm", LGBMClassifier(
        n_estimators=103,
        learning_rate=0.0158,
        max_depth=7,
        subsample=0.9337,
        colsample_bytree=0.9123,
        min_child_samples=73,
        random_state=42,
        n_jobs=-1
    )),
    ("cat", CatBoostClassifier(
        iterations=227,
        learning_rate=0.0557,
        depth=7,
        l2_leaf_reg=4.52,
        verbose=0,
        random_seed=42
    )),
    ("rf", RandomForestClassifier(
        n_estimators=747,
        max_depth=9,
        min_samples_split=6,
        min_samples_leaf=5,
        max_features=None,
        random_state=42,
        n_jobs=-1
    ))
]

# --- Define Meta (Blender) Model ---
meta_model = LogisticRegression(
    max_iter=2000,
    multi_class="multinomial",
    solver="lbfgs",
    random_state=42
)

# --- Build the Stacking Model ---
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

# --- Train Ensemble ---
stacking_model.fit(X_train, y_train)

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
val_preds = stacking_model.predict(X_val)
train_probs = stacking_model.predict_proba(X_train)
val_probs = stacking_model.predict_proba(X_val)

train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, val_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')
val_f1 = f1_score(y_val, val_preds, average='macro')
train_logloss = log_loss(y_train, train_probs)
val_logloss = log_loss(y_val, val_probs)

print("\n Stacking Ensemble Performance:")
print(f"Training Accuracy : {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Training F1 Score  : {train_f1:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")
print(f"Training Log Loss  : {train_logloss:.4f}")
print(f"Validation Log Loss: {val_logloss:.4f}")
print(f"Œî F1 Gap           : {abs(train_f1 - val_f1):.4f}")
print(f"Œî LogLoss Gap      : {abs(train_logloss - val_logloss):.4f}")

print("\nClassification Report (Validation Set):")
print(classification_report(y_val, val_preds))


# --- Train Ensemble ---
stacking_model.fit(X_train, y_train)

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
test_preds = stacking_model.predict(X_test)

train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)
train_f1 = f1_score(y_train, train_preds, average="macro")
test_f1 = f1_score(y_test, test_preds, average="macro")

print("\n Stacking Ensemble Performance:")
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy     : {test_acc:.4f}")
print(f"Training F1 Score : {train_f1:.4f}")
print(f"Test F1 Score     : {test_f1:.4f}")





‚úÖ Stacking Ensemble Performance:
Training Accuracy : 0.9553
Validation Accuracy: 0.9356
Training F1 Score  : 0.9379
Validation F1 Score: 0.9098
Training Log Loss  : 0.1380
Validation Log Loss: 0.1895
Œî F1 Gap           : 0.0281
Œî LogLoss Gap      : 0.0514

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       628
           1       0.95      0.97      0.96      2546
           2       0.91      0.86      0.89       457

    accuracy                           0.94      3631
   macro avg       0.92      0.90      0.91      3631
weighted avg       0.94      0.94      0.94      3631



KeyboardInterrupt: 

In [44]:
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

def tune_model_with_optuna(model_name, n_trials=30):
    # --- Split train into sub-train/validation ---
    X_subtrain, X_val, y_subtrain, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )

    def objective(trial):
        # XGBoost
        if model_name == "xgb":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
                "max_depth": trial.suggest_int("max_depth", 3, 7),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "gamma": trial.suggest_float("gamma", 0.0, 2.0),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 6),
                "n_jobs": -1,
                "random_state": 42,
                "eval_metric": "mlogloss",
                "use_label_encoder": False,
                "objective": "multi:softprob",
                "num_class": len(np.unique(y_train))
            }
            model = XGBClassifier(**params)

        # LightGBM
        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
                "max_depth": trial.suggest_int("max_depth", 3, 7),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
                "random_state": 42,
                "n_jobs": -1
            }
            model = LGBMClassifier(**params)

        # CatBoost
        elif model_name == "cat":
            params = {
                "iterations": trial.suggest_int("iterations", 100, 500),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
                "depth": trial.suggest_int("depth", 3, 7),
                "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
                "random_seed": 42,
                "verbose": 0
            }
            model = CatBoostClassifier(**params)

        # Random Forest
        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "max_depth": trial.suggest_int("max_depth", 2, 7),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
                "n_jobs": -1,
                "random_state": 42
            }
            model = RandomForestClassifier(**params)

        # --- Train & Evaluate ---
        model.fit(X_subtrain, y_subtrain)
        val_preds = model.predict(X_val)

        val_f1 = f1_score(y_val, val_preds, average="macro")

        # Objective: directly maximize F1
        return val_f1

    # --- Run Optuna study ---
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # --- Results ---
    print(f"\n‚úÖ Best {model_name.upper()} Params:")
    print(study.best_params)
    print(f"Best F1 Score: {study.best_value:.4f}")

    return study


In [None]:
# XGBoost
# xgb_study = tune_model_with_optuna("xgb", n_trials=30) considerable

# LightGBM
# lgbm_study = tune_model_with_optuna("lgbm", n_trials=30) considerable 

# # CatBoost
# cat_study = tune_model_with_optuna("cat", n_trials=30)

# # Random Forest
# rf_study = tune_model_with_optuna("rf", n_trials=30)

[I 2025-10-30 20:03:21,067] A new study created in memory with name: no-name-014aba8a-bae4-4e19-ba08-d854081061e8


  0%|          | 0/30 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:27,627] Trial 0 finished with value: 0.9170495113253788 and parameters: {'n_estimators': 250, 'learning_rate': 0.09556428757689246, 'max_depth': 6, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.3119890406724053, 'min_child_weight': 1}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:35,061] Trial 1 finished with value: 0.9158760157328456 and parameters: {'n_estimators': 447, 'learning_rate': 0.0641003510568888, 'max_depth': 6, 'subsample': 0.5102922471479012, 'colsample_bytree': 0.9849549260809971, 'gamma': 1.6648852816008435, 'min_child_weight': 2}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:38,287] Trial 2 finished with value: 0.905301111325278 and parameters: {'n_estimators': 172, 'learning_rate': 0.026506405886809047, 'max_depth': 4, 'subsample': 0.762378215816119, 'colsample_bytree': 0.7159725093210578, 'gamma': 0.5824582803960838, 'min_child_weight': 4}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:41,206] Trial 3 finished with value: 0.9069651726031239 and parameters: {'n_estimators': 155, 'learning_rate': 0.03629301836816964, 'max_depth': 4, 'subsample': 0.728034992108518, 'colsample_bytree': 0.8925879806965068, 'gamma': 0.39934756431671947, 'min_child_weight': 4}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:48,447] Trial 4 finished with value: 0.9124746197810897 and parameters: {'n_estimators': 337, 'learning_rate': 0.014180537144799797, 'max_depth': 6, 'subsample': 0.5852620618436457, 'colsample_bytree': 0.5325257964926398, 'gamma': 1.8977710745066665, 'min_child_weight': 6}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:56,331] Trial 5 finished with value: 0.913653300738651 and parameters: {'n_estimators': 424, 'learning_rate': 0.037415239225603365, 'max_depth': 3, 'subsample': 0.8421165132560784, 'colsample_bytree': 0.7200762468698007, 'gamma': 0.24407646968955765, 'min_child_weight': 3}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:03:59,157] Trial 6 finished with value: 0.9152525381958855 and parameters: {'n_estimators': 113, 'learning_rate': 0.09183883618709039, 'max_depth': 4, 'subsample': 0.831261142176991, 'colsample_bytree': 0.6558555380447055, 'gamma': 1.0401360423556216, 'min_child_weight': 4}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:03,758] Trial 7 finished with value: 0.9135406748130906 and parameters: {'n_estimators': 174, 'learning_rate': 0.09726261649881028, 'max_depth': 6, 'subsample': 0.9697494707820946, 'colsample_bytree': 0.9474136752138245, 'gamma': 1.1957999576221703, 'min_child_weight': 6}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:06,066] Trial 8 finished with value: 0.8942433320959274 and parameters: {'n_estimators': 135, 'learning_rate': 0.027638457617723072, 'max_depth': 3, 'subsample': 0.6626651653816322, 'colsample_bytree': 0.6943386448447411, 'gamma': 0.5426980635477918, 'min_child_weight': 5}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:11,932] Trial 9 finished with value: 0.9167857178522567 and parameters: {'n_estimators': 243, 'learning_rate': 0.03528410587186427, 'max_depth': 5, 'subsample': 0.5704621124873813, 'colsample_bytree': 0.9010984903770198, 'gamma': 0.14910128735954165, 'min_child_weight': 6}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:17,577] Trial 10 finished with value: 0.9137099946810389 and parameters: {'n_estimators': 315, 'learning_rate': 0.07355202867601568, 'max_depth': 7, 'subsample': 0.9661451709558936, 'colsample_bytree': 0.5076838686640521, 'gamma': 0.7960866844403536, 'min_child_weight': 1}. Best is trial 0 with value: 0.9170495113253788.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:23,569] Trial 11 finished with value: 0.9185590103473457 and parameters: {'n_estimators': 242, 'learning_rate': 0.05451798712798489, 'max_depth': 5, 'subsample': 0.6393145620149152, 'colsample_bytree': 0.8485779491700473, 'gamma': 0.04940924881227371, 'min_child_weight': 1}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:29,284] Trial 12 finished with value: 0.9150178952520784 and parameters: {'n_estimators': 237, 'learning_rate': 0.056386597372166704, 'max_depth': 5, 'subsample': 0.6779623244600864, 'colsample_bytree': 0.7950881832821013, 'gamma': 0.007463514646734178, 'min_child_weight': 1}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:38,944] Trial 13 finished with value: 0.914950839931814 and parameters: {'n_estimators': 238, 'learning_rate': 0.07944977289312445, 'max_depth': 7, 'subsample': 0.85058129769688, 'colsample_bytree': 0.8036067814232024, 'gamma': 0.006632540865966756, 'min_child_weight': 2}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:49,167] Trial 14 finished with value: 0.9155406008819992 and parameters: {'n_estimators': 361, 'learning_rate': 0.051088902098586644, 'max_depth': 6, 'subsample': 0.7573861576859734, 'colsample_bytree': 0.6044566965112071, 'gamma': 1.3268128572239652, 'min_child_weight': 2}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:04:55,475] Trial 15 finished with value: 0.9157143583493857 and parameters: {'n_estimators': 260, 'learning_rate': 0.07568513465785312, 'max_depth': 5, 'subsample': 0.9040330172235821, 'colsample_bytree': 0.8091883372850146, 'gamma': 0.7347577744400322, 'min_child_weight': 1}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:06,292] Trial 16 finished with value: 0.9150679677162845 and parameters: {'n_estimators': 284, 'learning_rate': 0.04883308238585699, 'max_depth': 7, 'subsample': 0.6924745509172271, 'colsample_bytree': 0.596609562772596, 'gamma': 0.34173833856231534, 'min_child_weight': 3}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:14,636] Trial 17 finished with value: 0.9143662904394612 and parameters: {'n_estimators': 382, 'learning_rate': 0.08719593104665169, 'max_depth': 6, 'subsample': 0.6213946636000343, 'colsample_bytree': 0.775363380915974, 'gamma': 0.8337443690283718, 'min_child_weight': 2}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:19,766] Trial 18 finished with value: 0.9167994896858834 and parameters: {'n_estimators': 204, 'learning_rate': 0.06633981323438215, 'max_depth': 5, 'subsample': 0.791412280667512, 'colsample_bytree': 0.8609727402803514, 'gamma': 0.5369581152388117, 'min_child_weight': 1}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:25,789] Trial 19 finished with value: 0.9172077493871348 and parameters: {'n_estimators': 294, 'learning_rate': 0.062138220164854104, 'max_depth': 5, 'subsample': 0.9018049454302455, 'colsample_bytree': 0.5883435592119127, 'gamma': 0.24460496092909467, 'min_child_weight': 3}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:31,865] Trial 20 finished with value: 0.9180553805566144 and parameters: {'n_estimators': 494, 'learning_rate': 0.058703450391223405, 'max_depth': 4, 'subsample': 0.8979338809467865, 'colsample_bytree': 0.6455326383677563, 'gamma': 1.485151668905378, 'min_child_weight': 3}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:38,604] Trial 21 finished with value: 0.9166321749188838 and parameters: {'n_estimators': 483, 'learning_rate': 0.060184053781853285, 'max_depth': 4, 'subsample': 0.9097877650610291, 'colsample_bytree': 0.6477234193807914, 'gamma': 1.44396763782538, 'min_child_weight': 3}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:44,109] Trial 22 finished with value: 0.912970965126144 and parameters: {'n_estimators': 299, 'learning_rate': 0.04691983803303788, 'max_depth': 5, 'subsample': 0.9061080132164645, 'colsample_bytree': 0.6451371648959269, 'gamma': 1.6310394253424867, 'min_child_weight': 5}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:50,172] Trial 23 finished with value: 0.9156086895189731 and parameters: {'n_estimators': 389, 'learning_rate': 0.07057555087970209, 'max_depth': 4, 'subsample': 0.9989244219096844, 'colsample_bytree': 0.5497267584916317, 'gamma': 1.1051408114936678, 'min_child_weight': 3}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:05:57,076] Trial 24 finished with value: 0.9129500045727156 and parameters: {'n_estimators': 498, 'learning_rate': 0.0443380697648999, 'max_depth': 5, 'subsample': 0.88372204022497, 'colsample_bytree': 0.8566298325921834, 'gamma': 1.9468582820569833, 'min_child_weight': 5}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:06:00,124] Trial 25 finished with value: 0.9099641174669201 and parameters: {'n_estimators': 197, 'learning_rate': 0.05519621491059125, 'max_depth': 3, 'subsample': 0.9357575045838521, 'colsample_bytree': 0.7495815302113988, 'gamma': 0.15478316428385108, 'min_child_weight': 2}. Best is trial 11 with value: 0.9185590103473457.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:06:05,087] Trial 26 finished with value: 0.9200108436344022 and parameters: {'n_estimators': 333, 'learning_rate': 0.06376514010958909, 'max_depth': 4, 'subsample': 0.7105576225418185, 'colsample_bytree': 0.6803222620640019, 'gamma': 1.4554751073424212, 'min_child_weight': 3}. Best is trial 26 with value: 0.9200108436344022.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:06:10,940] Trial 27 finished with value: 0.9177248371986106 and parameters: {'n_estimators': 436, 'learning_rate': 0.08583201625404041, 'max_depth': 4, 'subsample': 0.6285836974809043, 'colsample_bytree': 0.6736701896523487, 'gamma': 1.5146295322867214, 'min_child_weight': 4}. Best is trial 26 with value: 0.9200108436344022.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:06:16,734] Trial 28 finished with value: 0.913451878606895 and parameters: {'n_estimators': 402, 'learning_rate': 0.04301447867794913, 'max_depth': 3, 'subsample': 0.722941617844659, 'colsample_bytree': 0.7512979163480473, 'gamma': 1.284535246652175, 'min_child_weight': 2}. Best is trial 26 with value: 0.9200108436344022.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-10-30 20:06:21,579] Trial 29 finished with value: 0.9205029567214768 and parameters: {'n_estimators': 335, 'learning_rate': 0.06869744187596126, 'max_depth': 4, 'subsample': 0.5268577734910248, 'colsample_bytree': 0.6233187244227066, 'gamma': 1.7743397533828527, 'min_child_weight': 3}. Best is trial 29 with value: 0.9205029567214768.

‚úÖ Best XGB Params:
{'n_estimators': 335, 'learning_rate': 0.06869744187596126, 'max_depth': 4, 'subsample': 0.5268577734910248, 'colsample_bytree': 0.6233187244227066, 'gamma': 1.7743397533828527, 'min_child_weight': 3}
Best F1 Score: 0.9205


In [None]:
# ‚úÖ Best XGB Params:
# {'n_estimators': 600, 'learning_rate': 0.09905399158869002, 'max_depth': 6, 'subsample': 0.6146009721997969, 'colsample_bytree': 0.9828472877209867, 'gamma': 1.5294297733838427, 'min_child_weight': 1}
# Best Objective Value: 0.0686

# ‚úÖ Best LGBM Params:
# {'n_estimators': 316, 'learning_rate': 0.06533353663762796, 'max_depth': 7, 'subsample': 0.569746930326021, 'colsample_bytree': 0.6460723242676091, 'min_child_samples': 43}
# Best Objective Value: 0.0701

# ‚úÖ Best CAT Params:
# {'iterations': 496, 'learning_rate': 0.055895437266264544, 'depth': 7, 'l2_leaf_reg': 8.50438821157523}
# Best Objective Value: 0.0693

# ‚úÖ Best RF Params:
# {'n_estimators': 414, 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'log2'}
# Best Objective Value: 0.0732


In [33]:
# %% [Stacking Ensemble Model - Tuned Base Models]

from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# --- Define Tuned Base Models ---
base_models = [
    ("xgb", XGBClassifier(
        n_estimators=600,
        learning_rate=0.09905399158869002,
        max_depth=6,
        subsample=0.6146009721997969,
        colsample_bytree=0.9828472877209867,
        gamma=1.5294297733838427,
        min_child_weight=1,
        random_state=42,
        n_jobs=-1,
        eval_metric="mlogloss",
        objective="multi:softprob",
        use_label_encoder=False,
        num_class=len(np.unique(y_train))
    )),
    ("lgbm", LGBMClassifier(
        n_estimators=316,
        learning_rate=0.06533353663762796,
        max_depth=7,
        subsample=0.569746930326021,
        colsample_bytree=0.6460723242676091,
        min_child_samples=43,
        random_state=42,
        n_jobs=-1
    )),
    ("cat", CatBoostClassifier(
        iterations=496,
        learning_rate=0.055895437266264544,
        depth=7,
        l2_leaf_reg=8.50438821157523,
        verbose=0,
        random_seed=42
    )),
    ("rf", RandomForestClassifier(
        n_estimators=414,
        max_depth=12,
        min_samples_split=6,
        min_samples_leaf=2,
        max_features='log2',
        random_state=42,
        n_jobs=-1
    ))
]

# --- Meta Model (Blender) ---
meta_model = LogisticRegression(
    C=0.3,
    max_iter=1000,
    multi_class="multinomial",
    solver="lbfgs",
    random_state=42
)

# --- Build Stacking Ensemble ---
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

# --- Train Ensemble ---
stacking_model.fit(X_train, y_train)

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
val_preds = stacking_model.predict(X_val)

train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, val_preds)
train_f1 = f1_score(y_train, train_preds, average="macro")
val_f1 = f1_score(y_val, val_preds, average="macro")

print("\n‚úÖ Stacking Ensemble Performance:")
print(f"Training Accuracy : {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Training F1 Score  : {train_f1:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")
print(f"Œî F1 Gap           : {abs(train_f1 - val_f1):.4f}")





‚úÖ Stacking Ensemble Performance:
Training Accuracy : 0.9756
Validation Accuracy: 0.9361
Training F1 Score  : 0.9659
Validation F1 Score: 0.9098
Œî F1 Gap           : 0.0561


In [None]:
# %% [Stacking Ensemble Model - Tuned Base Models with Ridge Meta-Model]

from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# --- Define Tuned Base Models ---
base_models = [
    ("xgb", XGBClassifier(
        n_estimators=600,
        learning_rate=0.09905399158869002,
        max_depth=6,
        subsample=0.6146009721997969,
        colsample_bytree=0.9828472877209867,
        gamma=1.5294297733838427,
        min_child_weight=1,
        random_state=42,
        n_jobs=-1,
        eval_metric="mlogloss",
        objective="multi:softprob",
        use_label_encoder=False,
        num_class=len(np.unique(y_train))
    )),
    # ("lgbm", LGBMClassifier(
    #     n_estimators=316,
    #     learning_rate=0.06533353663762796,
    #     max_depth=7,
    #     subsample=0.569746930326021,
    #     colsample_bytree=0.6460723242676091,
    #     min_child_samples=43,
    #     random_state=42,
    #     n_jobs=-1
    # )),
    # ("cat", CatBoostClassifier(
    #     iterations=496,
    #     learning_rate=0.055895437266264544,
    #     depth=7,
    #     l2_leaf_reg=8.50438821157523,
    #     verbose=0,
    #     random_seed=42
    # )),
    # ("rf", RandomForestClassifier(
    #     n_estimators=414,
    #     max_depth=12,
    #     min_samples_split=6,
    #     min_samples_leaf=2,
    #     max_features='log2',
    #     random_state=42,
    #     n_jobs=-1
    # ))
]

# --- Meta Model (Ridge Classifier with CV) ---
meta_model = RidgeClassifierCV(
    alphas=np.logspace(-3, 3, 10),  # range of regularization strengths
    cv=5
)

# --- Build Stacking Ensemble ---
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

# --- Train Ensemble ---
stacking_model.fit(X_train, y_train)

# --- Evaluate ---
train_preds = stacking_model.predict(X_train)
val_preds = stacking_model.predict(X_val)

train_acc = accuracy_score(y_train, train_preds)
val_acc = accuracy_score(y_val, val_preds)
train_f1 = f1_score(y_train, train_preds, average="macro")
val_f1 = f1_score(y_val, val_preds, average="macro")

print("\n‚úÖ Stacking Ensemble Performance (Ridge Meta-Model):")
print(f"Training Accuracy : {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Training F1 Score  : {train_f1:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")
print(f"Œî F1 Gap           : {abs(train_f1 - val_f1):.4f}")



‚úÖ Stacking Ensemble Performance (Ridge Meta-Model):
Training Accuracy : 0.9861
Validation Accuracy: 0.9383
Training F1 Score  : 0.9805
Validation F1 Score: 0.9145
Œî F1 Gap           : 0.0661
