In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

import joblib

from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Absolute paths (as you provided)
TRAIN_V02 = "/Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/training-data-v0.2.csv"
TEST_V02  = "/Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/test-data-v0.2.csv"

OUT_TRAIN = "/Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/train_v0.2_simple.csv"
OUT_TEST  = "/Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/test_v0.2_simple.csv"

POWER_COL = "Building_Power_kW"
TEMP_COL  = "Dry_Bulb_Temperature_C"
GHI_COL   = "Global_Horizontal_Radiation_W/m2"
SITE_COL  = "Site"
TIME_COL  = "Timestamp_Local"
FLAG_COL  = "Demand_Response_Flag"  # may be partly null in test

In [None]:
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
     df = df.copy()
     df[TIME_COL] = pd.to_datetime(df[TIME_COL])
 
     df["weekday"] = df[TIME_COL].dt.weekday.astype(int)                 # Mon=0 .. Sun=6
     df["week_of_year"] = df[TIME_COL].dt.isocalendar().week.astype(int)
     df["hour"] = df[TIME_COL].dt.hour.astype(int)
     df["month"] = df[TIME_COL].dt.month.astype(int)
 
     # Parts of day to match your prior columns
     df["part_Morning"]   = ((df["hour"] >= 6)  & (df["hour"] < 12)).astype(int)
     df["part_Afternoon"] = ((df["hour"] >= 12) & (df["hour"] < 18)).astype(int)
 
     # is_saturday like in your previous CSV
     df["is_saturday"] = (df["weekday"] == 5).astype(int)

     # Simple seasonal indicators with names matching your prior columns
     df["intensity_Winter_1"] = df["month"].isin([12, 1, 2]).astype(int)
     df["intensity_Summer_0"] = df["month"].isin([6, 7, 8]).astype(int)
 
     # Active hours (typical working hours 08:00–20:00); adjust if you used a different range
     df["active_hours"] = ((df["hour"] >= 8) & (df["hour"] <= 20)).astype(int)
 
     return df

In [None]:
def add_power_lag_roll_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # Ensure time ordering per site
    df = df.sort_values([SITE_COL, TIME_COL])
 
    def _per_site(g: pd.DataFrame) -> pd.DataFrame:
        g = g.copy()
        # Lags
        g["Power_lag_1"] = g[POWER_COL].shift(1)
        g["Power_lag_2"] = g[POWER_COL].shift(2)
        g["Power_lag_4"] = g[POWER_COL].shift(4)
 
        # Fill initial lag NaNs with current power (this matches the look of your prior CSV head)
        for c in ["Power_lag_1", "Power_lag_2", "Power_lag_4"]:
            g[c] = g[c].fillna(g[POWER_COL])
 
        # Rolling mean (window=3, include current; min_periods=1 to avoid NaNs)
        g["power_roll_mean_3"] = (
            g[POWER_COL].rolling(window=3, min_periods=1).mean()
        )
 
#         # First difference
        g["power_diff_1"] = g[POWER_COL].diff(1).fillna(0.0)
 
        return g
 
        df = df.groupby(SITE_COL, group_keys=False).apply(_per_site)
    return df

In [None]:
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    df = add_time_features(df)
    df = add_power_lag_roll_features(df)
    df = df.drop(columns=["hour", "month"])
    return df

In [2]:

def main():
    # Read
    train = pd.read_csv(TRAIN_V02)
    test  = pd.read_csv(TEST_V02)

    # Build features
    train_fe = build_features(train)
    test_fe  = build_features(test)

    # Ensure consistent column order: IDs first, then features, then targets if present
    id_cols = [SITE_COL, TIME_COL]
    base_cols = [TEMP_COL, GHI_COL, POWER_COL]
    target_cols = [c for c in [FLAG_COL, "Demand_Response_Capacity_KW", "Demand_Response_Capacity_kW"] if c in train_fe.columns]

    # Move columns into a tidy order for train
    feat_cols_train = [c for c in train_fe.columns if c not in id_cols + target_cols]
    train_fe = train_fe[id_cols + feat_cols_train + target_cols]

    # For test, there is no capacity column; keep FLAG if present
    target_cols_test = [c for c in [FLAG_COL] if c in test_fe.columns]
    feat_cols_test = [c for c in test_fe.columns if c not in id_cols + target_cols_test]
    test_fe = test_fe[id_cols + feat_cols_test + target_cols_test]

    # Write
    train_fe.to_csv(OUT_TRAIN, index=False)
    test_fe.to_csv(OUT_TEST, index=False)

    print(f"[OK] Wrote: {OUT_TRAIN}  (shape={train_fe.shape})")
    print(f"[OK] Wrote: {OUT_TEST}   (shape={test_fe.shape})")


if __name__ == "__main__":
    main()

  df = df.groupby(SITE_COL, group_keys=False).apply(_per_site)
  df = df.groupby(SITE_COL, group_keys=False).apply(_per_site)


[OK] Wrote: /Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/train_v0.2_fe.csv  (shape=(105120, 20))
[OK] Wrote: /Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/test_v0.2_fe.csv   (shape=(105120, 19))


In [None]:
# Lazy imports for optional libs
def _import_xgboost():
    try:
        from xgboost import XGBClassifier
        return XGBClassifier
    except Exception as e:
        raise ImportError("xgboost is not installed. Please run: pip install xgboost") from e

def _import_lightgbm():
    try:
        from lightgbm import LGBMClassifier
        return LGBMClassifier
    except Exception as e:
        raise ImportError("lightgbm is not installed. Please run: pip install lightgbm") from e

In [None]:
# Configuration
TARGET_COL = "Demand_Response_Flag"
DROP_FEATURES = ["Demand_Response_Flag"]

def ensure_output_dir(outdir: str):
    os.makedirs(outdir, exist_ok=True)

def load_data(train_path: str, test_path: str):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    if TARGET_COL not in train.columns:
        raise ValueError(f"Target '{TARGET_COL}' not found in train CSV.")
    return train, test

In [None]:
def prepare_features(train: pd.DataFrame, test: pd.DataFrame):
    """
    Prepares training features (drops target + non-numeric cols).
    """
    # Target and features
    y = train[TARGET_COL].copy()
    X = train.drop(columns=[c for c in DROP_FEATURES if c in train.columns], errors="ignore").copy()

    # Drop object-type columns (e.g., Site, Timestamp_Local)
    obj_cols = list(X.select_dtypes(include=["object"]).columns)
    if obj_cols:
        print(f"[INFO] Dropping object columns: {obj_cols}")
        X = X.drop(columns=obj_cols)

    # Info
    print(f"[INFO] Train shape: X={X.shape}, y={y.shape}")
    print(f"[INFO] Class distribution (train):\n{y.value_counts(normalize=True).sort_index()}")

    return X, y

In [None]:
def chrono_split(df, target_col="Demand_Response_Flag", time_col="Timestamp_Local", val_size=0.2, embargo_hours=0):
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    df = df.sort_values(time_col).reset_index(drop=True)

    # Split index
    cutoff_idx = int(len(df) * (1 - val_size))
    
    if embargo_hours > 0:
        cutoff_time = df.loc[cutoff_idx, time_col]
        train = df[df[time_col] < (cutoff_time - pd.Timedelta(hours=embargo_hours))]
        val   = df[df[time_col] >= (cutoff_time + pd.Timedelta(hours=embargo_hours))]
    else:
        train = df.iloc[:cutoff_idx]
        val   = df.iloc[cutoff_idx:]
    
    # Separate features/target
    X_train, y_train = train.drop(columns=[target_col]), train[target_col]
    X_val, y_val     = val.drop(columns=[target_col]), val[target_col]

    # Info
    print("⏳ Chronological split:")
    print(f"Train size: {len(train)}, Validation size: {len(val)}")
    print("Train class distribution:\n", y_train.value_counts(normalize=True).round(3))
    print("Validation class distribution:\n", y_val.value_counts(normalize=True).round(3))

    return X_train, y_train, X_val, y_val

In [None]:
def compute_advanced_class_weights(y_train, strategy='extreme'):
    """
    Compute advanced class weights using different strategies
    """
    values, counts = np.unique(y_train, return_counts=True)
    total = counts.sum()
    
    if strategy == 'balanced':
        # Standard balanced weights
        class_weights = {int(v): float(total / (len(values) * c)) for v, c in zip(values, counts)}
    
    elif strategy == 'extreme':
        # More aggressive weights for extremely imbalanced data
        base_weights = {int(v): float(total / (len(values) * c)) for v, c in zip(values, counts)}
        # Apply additional multipliers for minority classes
        multipliers = {}
        for v, c in zip(values, counts):
            if c < total * 0.01:  # Less than 1% of data
                multipliers[int(v)] = 3.0
            elif c < total * 0.05:  # Less than 5% of data
                multipliers[int(v)] = 2.0
            else:
                multipliers[int(v)] = 0.5  # Reduce majority class weight
        
        class_weights = {k: v * multipliers[k] for k, v in base_weights.items()}
    
    elif strategy == 'custom':
        # Custom strategy based on domain knowledge
        minority_penalty = 50.0
        majority_penalty = 0.3
        
        class_weights = {}
        for v, c in zip(values, counts):
            if c < 500:  # Very rare class
                class_weights[int(v)] = minority_penalty
            elif c > total * 0.8:  # Dominant class  
                class_weights[int(v)] = majority_penalty
            else:
                class_weights[int(v)] = 1.0
    
    print(f"[INFO] Advanced class weights ({strategy}):", class_weights)
    return class_weights

In [None]:
def find_optimal_thresholds(y_true, y_proba, class_labels=[0, 1, 2], metric='f1'):
    """
    Find optimal thresholds for each class using different metrics
    """
    optimal_thresholds = {}
    
    for i, class_label in enumerate(class_labels):
        # Convert to binary problem for this class
        y_binary = (y_true == class_label).astype(int)
        y_scores = y_proba[:, i]
        
        if metric == 'f1':
            precision, recall, thresholds = precision_recall_curve(y_binary, y_scores)
            f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
            optimal_idx = np.argmax(f1_scores)
            optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5
            
        elif metric == 'precision_recall_balance':
            precision, recall, thresholds = precision_recall_curve(y_binary, y_scores)
            # Find threshold where precision ≈ recall
            pr_diff = np.abs(precision[:-1] - recall[:-1])
            optimal_idx = np.argmin(pr_diff)
            optimal_threshold = thresholds[optimal_idx]
            
        else:  # Default to 0.5
            optimal_threshold = 0.5
            
        optimal_thresholds[class_label] = optimal_threshold
        
        print(f"[INFO] Optimal threshold for class {class_label}: {optimal_threshold:.4f}")
    
    return optimal_thresholds

In [None]:
def apply_optimal_thresholds(y_proba, thresholds, class_labels=[0, 1, 2]):
    """
    Apply optimal thresholds to get predictions
    """
    n_samples = y_proba.shape[0]
    y_pred = np.zeros(n_samples, dtype=int)
    
    for i, class_label in enumerate(class_labels):
        threshold = thresholds[class_label]
        mask = y_proba[:, i] >= threshold
        y_pred[mask] = class_label
    
    # Handle cases where no class meets threshold or multiple classes meet threshold
    # Use argmax as fallback
    no_prediction_mask = np.sum([y_proba[:, i] >= thresholds[class_labels[i]] 
                                for i in range(len(class_labels))], axis=0) == 0
    
    if np.any(no_prediction_mask):
        y_pred[no_prediction_mask] = np.argmax(y_proba[no_prediction_mask], axis=1)
    
    return y_pred

In [None]:
def hyperparameter_optimization(X_train, y_train, model_type='xgb', class_weights=None, 
                               sample_weight=None, cv_folds=3, n_trials=20):
    """
    Perform hyperparameter optimization using RandomizedSearchCV
    """
    print(f"[INFO] Starting hyperparameter optimization for {model_type}...")
    
    if model_type == 'xgb':
        XGBClassifier = _import_xgboost()
        
        # Parameter grid for XGBoost
        param_grid = {
            'n_estimators': [300, 500, 800, 1200],
            'learning_rate': [0.01, 0.03, 0.05, 0.1],
            'max_depth': [3, 4, 5, 6, 7],
            'min_child_weight': [1, 3, 5, 7],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'reg_lambda': [0.1, 0.5, 1.0, 2.0],
            'reg_alpha': [0.0, 0.1, 0.5, 1.0]
        }
        
        base_estimator = XGBClassifier(
            objective='multi:softprob',
            eval_metric='mlogloss',
            tree_method='hist',
            random_state=42
        )
        
        # Custom scoring for imbalanced data
        scoring = 'f1_macro'
        
    elif model_type == 'lgbm':
        LGBMClassifier = _import_lightgbm()
        
        # Parameter grid for LightGBM
        param_grid = {
            'n_estimators': [300, 500, 800, 1200],
            'learning_rate': [0.01, 0.03, 0.05, 0.1],
            'max_depth': [-1, 3, 5, 7],
            'num_leaves': [15, 31, 63, 127],
            'min_child_samples': [20, 50, 100],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
            'reg_lambda': [0.1, 0.5, 1.0, 2.0],
            'reg_alpha': [0.0, 0.1, 0.5, 1.0]
        }
        
        base_params = {
            'objective': 'multiclass',
            'random_state': 42,
            'verbose': -1
        }
        
        if class_weights is not None:
            base_params['class_weight'] = class_weights
        else:
            base_params['is_unbalance'] = True
            
        base_estimator = LGBMClassifier(**base_params)
        scoring = 'f1_macro'
    
    # Perform randomized search
    random_search = RandomizedSearchCV(
        base_estimator,
        param_grid,
        n_iter=n_trials,
        cv=cv_folds,
        scoring=scoring,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )
    
    # Fit with sample weights if provided (XGBoost)
    if model_type == 'xgb' and sample_weight is not None:
        random_search.fit(X_train, y_train, sample_weight=sample_weight)
    else:
        random_search.fit(X_train, y_train)
    
    print(f"[INFO] Best {model_type} parameters:", random_search.best_params_)
    print(f"[INFO] Best {model_type} CV score: {random_search.best_score_:.4f}")
    
    return random_search.best_estimator_, random_search.best_params_

In [None]:
def train_optimized_xgb(X_train, y_train, sample_weight=None, best_params=None):
    """Train XGBoost with optimized parameters"""
    XGBClassifier = _import_xgboost()
    
    if best_params is None:
        # Default optimized parameters based on typical energy data patterns
        params = {
            'n_estimators': 800,
            'learning_rate': 0.03,
            'max_depth': 5,
            'min_child_weight': 3,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_lambda': 1.0,
            'reg_alpha': 0.1,
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'tree_method': 'hist',
            'random_state': 42
        }
    else:
        params = best_params.copy()
        params.update({
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss', 
            'tree_method': 'hist',
            'random_state': 42
        })
    
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train, sample_weight=sample_weight)
    return xgb

In [None]:
def train_optimized_lgbm(X_train, y_train, class_weights=None, best_params=None):
    """Train LightGBM with optimized parameters"""
    LGBMClassifier = _import_lightgbm()
    
    if best_params is None:
        # Default optimized parameters
        params = {
            'n_estimators': 1000,
            'learning_rate': 0.03,
            'max_depth': -1,
            'num_leaves': 31,
            'min_child_samples': 50,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_lambda': 1.0,
            'reg_alpha': 0.1,
            'objective': 'multiclass',
            'random_state': 42,
            'verbose': -1
        }
    else:
        params = best_params.copy()
        params.update({
            'objective': 'multiclass',
            'random_state': 42,
            'verbose': -1
        })
    
    if class_weights is not None:
        params['class_weight'] = class_weights
    else:
        params['is_unbalance'] = True
    
    lgbm = LGBMClassifier(**params)
    lgbm.fit(X_train, y_train)
    return lgbm

In [None]:
def eval_and_report(y_true, y_pred, y_proba, model_name, outdir, class_labels=[0,1,2], 
                   thresholds=None, y_pred_thresh=None):
    """Enhanced evaluation with threshold optimization results"""
    
    # Standard metrics
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average="macro", zero_division=0)
    rec  = recall_score(y_true, y_pred, average="macro", zero_division=0)
    f1   = f1_score(y_true, y_pred, average="macro", zero_division=0)

    # ROC-AUC (macro)
    try:
        y_bin = label_binarize(y_true, classes=class_labels)
        auc_macro = roc_auc_score(y_bin, y_proba, multi_class="ovr", average="macro")
    except Exception:
        auc_macro = np.nan

    print(f"\n=== {model_name} Validation Metrics (Default Threshold) ===")
    print(f"Accuracy        : {acc:.4f}")
    print(f"Macro Precision : {prec:.4f}")
    print(f"Macro Recall    : {rec:.4f}")
    print(f"Macro F1-score  : {f1:.4f}")
    print(f"Macro ROC-AUC   : {auc_macro if not np.isnan(auc_macro) else 'N/A'}")
    
    # Threshold optimized metrics
    acc_thresh = None
    f1_thresh = None
    if y_pred_thresh is not None:
        acc_thresh = accuracy_score(y_true, y_pred_thresh)
        prec_thresh = precision_score(y_true, y_pred_thresh, average="macro", zero_division=0)
        rec_thresh  = recall_score(y_true, y_pred_thresh, average="macro", zero_division=0)
        f1_thresh   = f1_score(y_true, y_pred_thresh, average="macro", zero_division=0)
        
        print(f"\n=== {model_name} Validation Metrics (Optimized Threshold) ===")
        print(f"Accuracy        : {acc_thresh:.4f} (Δ: {acc_thresh-acc:+.4f})")
        print(f"Macro Precision : {prec_thresh:.4f} (Δ: {prec_thresh-prec:+.4f})")
        print(f"Macro Recall    : {rec_thresh:.4f} (Δ: {rec_thresh-rec:+.4f})")
        print(f"Macro F1-score  : {f1_thresh:.4f} (Δ: {f1_thresh-f1:+.4f})")

    print("\nClassification report (Default):\n",
          classification_report(y_true, y_pred, digits=4, zero_division=0))
    
    if y_pred_thresh is not None:
        print("\nClassification report (Optimized):\n",
              classification_report(y_true, y_pred_thresh, digits=4, zero_division=0))

    # Plot confusion matrices
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Default threshold confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_title(f"{model_name} - Default Threshold")
    axes[0].set_ylabel('True label')
    axes[0].set_xlabel('Predicted label')
    
    # Optimized threshold confusion matrix
    if y_pred_thresh is not None:
        cm_thresh = confusion_matrix(y_true, y_pred_thresh, labels=class_labels)
        sns.heatmap(cm_thresh, annot=True, fmt='d', cmap='Greens', ax=axes[1])
        axes[1].set_title(f"{model_name} - Optimized Threshold")
        axes[1].set_ylabel('True label')
        axes[1].set_xlabel('Predicted label')
    
    plt.tight_layout()
    cm_path = os.path.join(outdir, f"{model_name.replace(' ', '_').lower()}_confusion_matrices.png")
    plt.savefig(cm_path, dpi=200)
    plt.close()

    return {
        "accuracy": acc,
        "precision_macro": prec,
        "recall_macro": rec,
        "f1_macro": f1,
        "roc_auc_macro": auc_macro,
        "accuracy_thresh": acc_thresh,
        "f1_macro_thresh": f1_thresh,
        "cm_path": cm_path
    }

In [19]:
def predict_and_save(model, X_test, outdir, name_prefix, idx_to_label=None, 
                    optimal_thresholds=None, class_labels=[0, 1, 2]):
    """Enhanced prediction with threshold optimization"""
    # Define feature columns (exclude ID cols)
    feature_cols = [c for c in X_test.columns if c not in ["Site", "Timestamp_Local"]]
    
    # Predictions + probabilities
    proba = model.predict_proba(X_test[feature_cols])
    pred_default = model.predict(X_test[feature_cols])
    
    # Apply optimal thresholds if available
    pred_optimized = None
    if optimal_thresholds is not None:
        pred_optimized = apply_optimal_thresholds(proba, optimal_thresholds, class_labels)
    
    # Map back to original labels if needed
    def map_predictions(pred_array):
        if idx_to_label and pred_array is not None:
            return [idx_to_label.get(i, i) for i in pred_array]
        return pred_array
    
    pred_default_mapped = map_predictions(pred_default)
    pred_optimized_mapped = map_predictions(pred_optimized) if pred_optimized is not None else None

    # --- 1) Official submission (default threshold) ---
    df_official = pd.DataFrame({
        "Site": X_test["Site"].values,
        "Timestamp_Local": X_test["Timestamp_Local"].values,
        "Demand_Response_Flag": pred_default_mapped
    })
    out_path_official = os.path.join(outdir, f"{name_prefix}_submission_default.csv")
    df_official.to_csv(out_path_official, index=False)
    print(f"[INFO] Wrote official submission (default) → {out_path_official}")

    # --- 2) Official submission (optimized threshold) ---
    if pred_optimized_mapped is not None:
        df_official_opt = pd.DataFrame({
            "Site": X_test["Site"].values,
            "Timestamp_Local": X_test["Timestamp_Local"].values,
            "Demand_Response_Flag": pred_optimized_mapped
        })
        out_path_official_opt = os.path.join(outdir, f"{name_prefix}_submission_optimized.csv")
        df_official_opt.to_csv(out_path_official_opt, index=False)
        print(f"[INFO] Wrote official submission (optimized) → {out_path_official_opt}")

    # --- 3) Debug submission (with probabilities) ---
    df_debug = df_official.copy()
    if pred_optimized_mapped is not None:
        df_debug["Demand_Response_Flag_Optimized"] = pred_optimized_mapped
    
    for class_index in range(proba.shape[1]):
        label_name = idx_to_label[class_index] if idx_to_label else class_index
        df_debug[f"prob_{label_name}"] = proba[:, class_index]
    
    out_path_debug = os.path.join(outdir, f"{name_prefix}_submission_with_probs.csv")
    df_debug.to_csv(out_path_debug, index=False)
    print(f"[INFO] Wrote debug submission → {out_path_debug}")

    return out_path_official, out_path_debug

In [None]:
def main(train_path=None, test_path=None, outdir="outputs_advanced", val_size=0.2, 
         weight_strategy="extreme", optimize_hyperparams=True, n_trials=20):
    """
    Advanced Energy Classification Pipeline
    """
    
    # Default paths if not provided
    if train_path is None:
        train_path = "/Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/training-data-v0.2.csv"
    if test_path is None:
        test_path = "/Users/ibrahimyucel/Downloads/ULUSLARARASI_ENERJI_YARISMASI/data2/test-data-v0.2.csv"
    
    ensure_output_dir(outdir)
    print(f"[INFO] Advanced Energy Classification Pipeline Started")
    print(f"[INFO] Weight strategy: {weight_strategy}")
    print(f"[INFO] Hyperparameter optimization: {optimize_hyperparams}")

    # --- Load data ---
    train, test = load_data(train_path, test_path)
    X, y = prepare_features(train, test)

    # --- Label mapping: {-1,0,1} → {0,1,2} ---
    label_to_idx = {-1: 0, 0: 1, 1: 2}
    idx_to_label = {v: k for k, v in label_to_idx.items()}
    train[TARGET_COL] = train[TARGET_COL].map(label_to_idx).astype(int)

    # --- Chronological split ---
    X_tr, y_tr, X_val, y_val = chrono_split(
        train,
        target_col=TARGET_COL,
        time_col="Timestamp_Local",
        val_size=val_size,
        embargo_hours=0
    )

    # Drop non-feature ID cols for training
    feature_cols = [c for c in X_tr.columns if c not in ["Site", "Timestamp_Local"]]
    X_tr_features = X_tr[feature_cols]
    X_val_features = X_val[feature_cols]

    # --- Advanced Class Weights ---
    classes = np.unique(y_tr)
    cw_advanced = compute_advanced_class_weights(y_tr, strategy=weight_strategy)
    
    # Standard balanced weights for comparison
    weights_standard = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr)
    cw_standard = dict(zip(classes, weights_standard))
    print("[INFO] Standard balanced weights:", cw_standard)
    
    # Sample weights for XGBoost
    sample_w_advanced = y_tr.map(cw_advanced).values

    # --- Hyperparameter Optimization ---
    if optimize_hyperparams:
        print("\n[INFO] Performing hyperparameter optimization...")
        
        # Optimize XGBoost
        xgb_best, xgb_best_params = hyperparameter_optimization(
            X_tr_features, y_tr, model_type='xgb', 
            sample_weight=sample_w_advanced, n_trials=n_trials
        )
        
        # Optimize LightGBM
        lgbm_best, lgbm_best_params = hyperparameter_optimization(
            X_tr_features, y_tr, model_type='lgbm', 
            class_weights=cw_advanced, n_trials=n_trials
        )
        
        xgb_model = xgb_best
        lgbm_model = lgbm_best
    else:
        print("\n[INFO] Using default optimized parameters...")
        xgb_model = train_optimized_xgb(X_tr_features, y_tr, sample_weight=sample_w_advanced)
        lgbm_model = train_optimized_lgbm(X_tr_features, y_tr, class_weights=cw_advanced)

    # --- Validation Predictions ---
    print("\n[INFO] Getting validation predictions...")
    xgb_proba_val = xgb_model.predict_proba(X_val_features)
    lgbm_proba_val = lgbm_model.predict_proba(X_val_features)

    xgb_pred_val = np.argmax(xgb_proba_val, axis=1)
    lgbm_pred_val = np.argmax(lgbm_proba_val, axis=1)

    # --- Threshold Optimization ---
    print("\n[INFO] Optimizing thresholds...")
    xgb_thresholds = find_optimal_thresholds(y_val, xgb_proba_val, class_labels=[0, 1, 2], metric='f1')
    lgbm_thresholds = find_optimal_thresholds(y_val, lgbm_proba_val, class_labels=[0, 1, 2], metric='f1')

    # Apply optimized thresholds
    xgb_pred_val_thresh = apply_optimal_thresholds(xgb_proba_val, xgb_thresholds, class_labels=[0, 1, 2])
    lgbm_pred_val_thresh = apply_optimal_thresholds(lgbm_proba_val, lgbm_thresholds, class_labels=[0, 1, 2])

    # --- Evaluate models ---
    print("\n[INFO] Evaluating models...")
    xgb_metrics = eval_and_report(y_val, xgb_pred_val, xgb_proba_val, "XGBoost_Advanced", 
                                  outdir, class_labels=[0, 1, 2], 
                                  thresholds=xgb_thresholds, y_pred_thresh=xgb_pred_val_thresh)
    
    lgbm_metrics = eval_and_report(y_val, lgbm_pred_val, lgbm_proba_val, "LightGBM_Advanced", 
                                   outdir, class_labels=[0, 1, 2],
                                   thresholds=lgbm_thresholds, y_pred_thresh=lgbm_pred_val_thresh)

    # --- Save summary ---
    summary_data = []
    summary_data.append({"model": "XGBoost_Advanced", "threshold_type": "default", **xgb_metrics})
    summary_data.append({"model": "LightGBM_Advanced", "threshold_type": "default", **lgbm_metrics})
    
    summary = pd.DataFrame(summary_data)
    summary_path = os.path.join(outdir, "advanced_validation_metrics_summary.csv")
    summary.to_csv(summary_path, index=False)
    print(f"[INFO] Wrote validation metrics summary → {summary_path}")

    # --- Test Predictions ---
    print("\n[INFO] Preparing test predictions...")
    id_cols = ["Site", "Timestamp_Local"] 
    feat_cols = list(X_tr_features.columns)
    X_test_for_pred = test.copy()

    # Align features
    missing = [c for c in feat_cols if c not in X_test_for_pred.columns]
    for c in missing:
        X_test_for_pred[c] = 0

    extra = [c for c in X_test_for_pred.columns if c not in (feat_cols + id_cols)]
    if extra:
        X_test_for_pred = X_test_for_pred.drop(columns=extra)

    X_test_for_pred = X_test_for_pred[id_cols + feat_cols]

    # Make predictions
    predict_and_save(xgb_model, X_test_for_pred, outdir, "xgb_advanced", 
                     idx_to_label, xgb_thresholds, class_labels=[0, 1, 2])
    
    predict_and_save(lgbm_model, X_test_for_pred, outdir, "lgbm_advanced", 
                     idx_to_label, lgbm_thresholds, class_labels=[0, 1, 2])

    # Save additional outputs
    feats_path = os.path.join(outdir, "features_used.txt")
    with open(feats_path, "w", encoding="utf-8") as f:
        for c in feat_cols:
            f.write(f"{c}\n")
    print(f"[INFO] Feature list saved → {feats_path}")

    print(f"\n[DONE] Pipeline completed! All outputs in: {os.path.abspath(outdir)}")