# In [None]:

import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
from sklearn.metrics import (
    brier_score_loss, roc_auc_score, average_precision_score,
    mean_absolute_error, mean_squared_error, make_scorer
)
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# ==========================================
# 1. METRICS & CONFIGURATION
# ==========================================

# --- Expected Calibration Error (ECE) ---
def expected_calibration_error(y_true, y_prob, n_bins=10):
    """
    Compute the Expected Calibration Error (ECE) over equal-width bins.

    Probabilities are grouped into ``n_bins`` bins spanning [0, 1]. For every
    non-empty bin the absolute gap between the mean outcome and the mean
    predicted probability is weighted by the share of samples in that bin,
    and the weighted gaps are summed.

    Args:
        y_true: Array-like of observed outcomes (0/1).
        y_prob: Array-like of predicted probabilities, same length as ``y_true``.
        n_bins: Number of equal-width bins on [0, 1].

    Returns:
        The ECE as a float; 0.0 when no sample falls into any bin
        (including empty input).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    bin_ids = np.digitize(y_prob, edges) - 1

    # np.digitize uses half-open intervals [a, b), so p == 1.0 lands one past
    # the last bin and would be skipped by the loop below. Treat the last bin
    # as closed on the right so fully confident predictions are counted.
    bin_ids[y_prob == 1.0] = n_bins - 1

    ece = 0.0
    for b in range(n_bins):
        in_bin = bin_ids == b
        if not in_bin.any():
            continue
        gap = np.abs(y_true[in_bin].mean() - y_prob[in_bin].mean())
        # in_bin.mean() is the fraction of all samples that fall in this bin.
        ece += gap * in_bin.mean()
    return float(ece)

def _ranking_score_or_nan(metric, y, p):
    """Return ``metric(y, p)``, or NaN when the metric rejects the inputs."""
    try:
        return metric(y, p)
    except ValueError:
        # Only the "invalid input" failure is swallowed; interrupts and
        # unrelated errors propagate instead of being hidden.
        return np.nan


def frost_metrics(df: pd.DataFrame):
    """
    Generates the F3 Challenge Metric Table (summary only).

    For each distinct ``horizon_h`` value, rows whose ``y_event`` or
    ``p_event`` is missing are discarded, then the event forecasts are scored
    (Brier, ECE, ROC-AUC, PR-AUC) together with the temperature forecasts
    (MAE, RMSE, mean bias of ``yhat_temp - y_temp``).

    Args:
        df: Frame with the columns ``horizon_h``, ``y_event``, ``p_event``,
            ``y_temp`` and ``yhat_temp``.

    Returns:
        A DataFrame sorted by ``horizon_h`` with one row per horizon and the
        columns ``horizon_h, brier, ece, roc_auc, pr_auc, mae, rmse, bias,
        n_samples``. Horizons with no usable rows are omitted; if none
        remain, an empty frame with those columns is returned.
    """
    columns = [
        "horizon_h", "brier", "ece", "roc_auc", "pr_auc",
        "mae", "rmse", "bias", "n_samples",
    ]
    rows = []
    for h, g in df.groupby("horizon_h", sort=True):
        y, p = g["y_event"].values, g["p_event"].values

        # Drop rows with a missing label or probability. pd.isna also copes
        # with object-dtype columns, where np.isnan would raise.
        mask = ~(pd.isna(y) | pd.isna(p))
        y, p = y[mask], p[mask]
        g = g[mask]

        if len(y) == 0:
            continue

        # Classification metrics
        brier = brier_score_loss(y, np.clip(p, 0, 1))
        ece = expected_calibration_error(y, p, n_bins=10)

        if len(np.unique(y)) < 2:
            # Ranking metrics are undefined when only one class is present.
            roc = np.nan
            pr = np.nan
        else:
            roc = _ranking_score_or_nan(roc_auc_score, y, p)
            pr = _ranking_score_or_nan(average_precision_score, y, p)

        # Regression metrics (computed on the same filtered rows)
        mae = float(mean_absolute_error(g["y_temp"], g["yhat_temp"]))
        rmse = float(np.sqrt(mean_squared_error(g["y_temp"], g["yhat_temp"])))
        bias = float((g["yhat_temp"] - g["y_temp"]).mean())

        rows.append({
            "horizon_h": int(h),
            "brier": brier, "ece": ece, "roc_auc": roc, "pr_auc": pr,
            "mae": mae, "rmse": rmse, "bias": bias,
            "n_samples": int(len(g)),
        })

    # Passing the column list keeps "horizon_h" present even when `rows` is
    # empty, so the sort below cannot raise KeyError.
    return pd.DataFrame(rows, columns=columns).sort_values("horizon_h")

# --- Configuration ---
CONFIG = {
    # Forecast horizons (hours) processed one after another by the pipeline.
    "horizon": [3, 6, 12, 24],
    # A target temperature at or below this value is labelled a frost event.
    "frost_threshold": 0.0,
    # NOTE: Update these paths if your files are located elsewhere.
    "train_file_path": "/content/train_set_filled_w_Mean_cleaned.csv",
    "test_file_path": "/content/test_set_filled_w_Mean_cleaned.csv",
    # Fraction of the training rows sampled for hyperparameter tuning (10%).
    "TUNING_SAMPLE_FRACTION": 0.1,
    # Number of parameter settings drawn by each randomized search.
    "N_TUNING_ITERATIONS": 10,
    # Number of cross-validation folds used during tuning.
    "N_CV_FOLDS": 3,
}

# ==========================================
# 2. DATA PREPARATION FUNCTIONS
# ==========================================

def load_file(full_path):
    """
    Read the CSV file at ``full_path`` into a DataFrame.

    Raises:
        FileNotFoundError: If nothing exists at ``full_path``.
    """
    if os.path.exists(full_path):
        print(f"Loading {full_path}...")
        # The file is always parsed as CSV; no format detection is done.
        return pd.read_csv(full_path)
    raise FileNotFoundError(f"Could not find file at: {full_path}")

def prepare_base_features(df):
    """
    Build the horizon-independent feature frame (cyclical time, lags).

    Works on a copy of ``df``: parses ``datetime`` (unparseable values become
    NaT), sorts by station and time, adds ``hour_sin`` / ``hour_cos`` and the
    per-station 1-, 3- and 6-row lags of ``air_temp_c`` and ``dew_point_c``.
    Any row containing a missing value in any column is then dropped.
    """
    frame = df.copy()
    frame['datetime'] = pd.to_datetime(frame['datetime'], errors='coerce')
    frame = frame.sort_values(['station_id', 'datetime'])

    # Hour of day mapped onto the unit circle so 23h and 0h end up adjacent.
    hour_angle = 2 * np.pi * frame['datetime'].dt.hour / 24
    frame['hour_sin'] = np.sin(hour_angle)
    frame['hour_cos'] = np.cos(hour_angle)

    # Lags are taken within each station so values never cross stations.
    for lag in (1, 3, 6):
        lagged = frame.groupby('station_id')[['air_temp_c', 'dew_point_c']].shift(lag)
        frame[f'temp_lag_{lag}'] = lagged['air_temp_c']
        frame[f'dew_lag_{lag}'] = lagged['dew_point_c']

    return frame.dropna()

def generate_targets(df_base, h):
    """
    Generates horizon-dependent targets (y_temp, y_event) for horizon h.

    ``y_temp`` is the minimum ``air_temp_c`` over the next ``h`` rows of the
    same station, excluding the current row. ``y_event`` is 1 when that
    minimum is at or below ``CONFIG['frost_threshold']``, else 0.

    Args:
        df_base: Frame with ``station_id``, ``datetime`` and ``air_temp_c``.
        h: Look-ahead length in rows.
           NOTE(review): this equals hours only if every station has exactly
           one row per hour with no gaps — confirm against the data.

    Returns:
        A sorted copy of ``df_base`` with ``y_temp`` and ``y_event`` added.
        Rows without ``h`` future observations (the tail of each station)
        and rows containing any other missing value are dropped.
    """
    df = df_base.sort_values(['station_id', 'datetime']).copy()
    indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=h)

    def _future_min(temps):
        # A forward window starts at the current row. Shifting by -1 first
        # makes it cover rows t+1 .. t+h, so the target never contains the
        # current air_temp_c (which is also used as a model feature).
        # min_periods=h leaves NaN where fewer than h future rows exist,
        # rather than labelling from a truncated window.
        return temps.shift(-1).rolling(window=indexer, min_periods=h).min()

    df['y_temp'] = df.groupby('station_id')['air_temp_c'].transform(_future_min)
    # NaN targets compare as False here, but those rows are removed by dropna.
    df['y_event'] = (df['y_temp'] <= CONFIG['frost_threshold']).astype(int)
    return df.dropna()

# ==========================================
# 3. XGBOOST TUNING FUNCTION
# ==========================================

def _run_randomized_search(estimator, param_distributions, scoring, X, y):
    """Fit a RandomizedSearchCV with the CONFIG budget; return (search, seconds)."""
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions,
        n_iter=CONFIG['N_TUNING_ITERATIONS'],
        scoring=scoring,
        cv=CONFIG['N_CV_FOLDS'],
        verbose=0,
        random_state=42,
        n_jobs=-1,
    )
    started = time.time()
    search.fit(X, y)
    return search, time.time() - started


def tune_xgboost(X_train, y_clf, y_reg, features):
    """
    Performs RandomizedSearchCV for both XGBoost Classifier and Regressor.

    Args:
        X_train: Frame containing at least the columns named in ``features``.
        y_clf: Frost-event labels for the classifier search.
        y_reg: Temperature targets for the regressor search.
        features: Column names selected from ``X_train`` as model inputs.

    Returns:
        ``(best_clf, best_reg)``: the ``best_estimator_`` of the classifier
        search (scored by ROC-AUC) and of the regressor search (scored by
        negative MAE).
    """
    print("\n--- Starting XGBoost Hyperparameter Tuning (Randomized Search) ---")
    X = X_train[features]

    # 1. Classifier (Frost Event) Tuning
    print("Tuning Frost Classifier...")
    # `use_label_encoder` is deliberately not passed: XGBoost deprecated it
    # and newer releases only emit an "unused parameter" warning on each fit.
    clf_xgb = xgb.XGBClassifier(n_jobs=-1, random_state=42, eval_metric='logloss')

    clf_param_dist = {
        'n_estimators': randint(50, 200),
        'max_depth': randint(3, 10),
        # scipy's uniform(loc, scale) samples from [loc, loc + scale].
        'learning_rate': uniform(0.01, 0.2),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': uniform(0, 5),
    }

    clf_search, clf_seconds = _run_randomized_search(
        clf_xgb, clf_param_dist, 'roc_auc', X, y_clf
    )
    print(f"Classifier Tuning finished in {clf_seconds:.2f} seconds.")
    print(f"Best Classifier Params: {clf_search.best_params_}")
    best_clf = clf_search.best_estimator_

    # 2. Regressor (Temperature) Tuning
    print("\nTuning Temperature Regressor...")
    reg_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=42)

    reg_param_dist = {
        'n_estimators': randint(50, 200),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        # NOTE(review): 'lambda' / 'alpha' look like the native booster names
        # for L2 / L1 regularisation; the scikit-learn wrapper documents them
        # as reg_lambda / reg_alpha. Kept unchanged so the sampled candidates
        # stay the same.
        'lambda': uniform(0, 1),
        'alpha': uniform(0, 1),
    }

    # Use Negative MAE as the scoring metric for regression
    neg_mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    reg_search, reg_seconds = _run_randomized_search(
        reg_xgb, reg_param_dist, neg_mae_scorer, X, y_reg
    )
    print(f"Regressor Tuning finished in {reg_seconds:.2f} seconds.")
    print(f"Best Regressor Params: {reg_search.best_params_}")
    best_reg = reg_search.best_estimator_

    return best_clf, best_reg

# ==========================================
# 4. MAIN PIPELINE (XGBoost Only)
# ==========================================

def run_pipeline_xgb_only():
    """
    Run the tuned XGBoost-only pipeline for every configured horizon.

    Loads the train/test files named in CONFIG, builds the base features
    once, then for each horizon generates targets, tunes a classifier and a
    regressor on a (sub)sample of the training rows, predicts on the test
    rows and scores the result with ``frost_metrics``.

    Returns:
        The concatenated per-horizon summary DataFrame, or None when one of
        the input files cannot be found.
    """
    # A. Load base data
    try:
        raw_train = load_file(CONFIG['train_file_path'])
        raw_test = load_file(CONFIG['test_file_path'])
    except FileNotFoundError as e:
        print(f"Error loading data: {e}")
        return None

    # B. Horizon-independent features are computed once and reused below.
    print("Preparing base features (cyclical time, lags)...")
    train_feats = prepare_base_features(raw_train)
    test_feats = prepare_base_features(raw_test)

    candidate_features = ['air_temp_c', 'rel_hum_percent', 'dew_point_c', 'wind_speed_m_s',
                          'hour_sin', 'hour_cos', 'temp_lag_1', 'temp_lag_3', 'temp_lag_6']
    summaries = []

    for h_current in CONFIG['horizon']:
        print(f"\nProcessing for horizon: {h_current} hours")

        print(f"Generating targets for h={h_current}...")
        train_h = generate_targets(train_feats, h_current)
        test_h = generate_targets(test_feats, h_current)

        # Keep only the candidate columns that exist in the training frame.
        features = [name for name in candidate_features if name in train_h.columns]
        print(f"Training with features: {features}")

        # Tune on a random subsample to keep the search fast.
        if CONFIG['TUNING_SAMPLE_FRACTION'] < 1.0:
            print(f"Sampling {CONFIG['TUNING_SAMPLE_FRACTION'] * 100:.0f}% of training data for XGBoost tuning.")
            tuning_rows = train_h.sample(frac=CONFIG['TUNING_SAMPLE_FRACTION'], random_state=42)
        else:
            tuning_rows = train_h.copy()

        clf_model, reg_model = tune_xgboost(
            tuning_rows,
            tuning_rows['y_event'],
            tuning_rows['y_temp'],
            features,
        )

        # NOTE: the models returned by tuning are used as-is, so they have
        # only seen the subsample. To trade speed for performance, re-fit
        # them on the full training set before predicting:
        # print("\nRe-training final models on FULL training set...")
        # clf_model.fit(train_h[features], train_h['y_event'])
        # reg_model.fit(train_h[features], train_h['y_temp'])

        print("Generating predictions (XGBoost) using best models...")
        test_inputs = test_h[features]
        test_h['p_event_xgb'] = clf_model.predict_proba(test_inputs)[:, 1]
        test_h['yhat_temp_xgb'] = reg_model.predict(test_inputs)

        # Expose the predictions under the column names frost_metrics reads.
        scored = test_h.assign(
            p_event=test_h['p_event_xgb'],
            yhat_temp=test_h['yhat_temp_xgb'],
            horizon_h=h_current,
        ).rename(columns={'datetime': 'timestamp'})

        print("Calculating Final Metrics (XGBoost)...")
        summaries.append(frost_metrics(scored))

    # Final Output
    banner = "=" * 50
    print("\n" + banner)
    print("FINAL RESULTS FOR ALL HORIZONS (XGBoost - TUNED)")
    print(banner)
    final_summary_xgb = pd.concat(summaries).reset_index(drop=True)
    print(final_summary_xgb.to_string(index=False))

    return final_summary_xgb

if __name__ == "__main__":
    # Entry point: run the pipeline and keep the summary table at module scope.
    final_summary_xgb = run_pipeline_xgb_only()
