In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ==========================================
# 1. METRICS & CONFIGURATION
# ==========================================

def frost_metrics_regression_only(df: pd.DataFrame):
    """Compute MAE, RMSE and bias of ``yhat_temp`` against ``y_temp``.

    Rows where either column is NaN are ignored.

    Args:
        df: Frame holding the true values in ``y_temp`` and the predictions
            in ``yhat_temp``.

    Returns:
        dict with float ``mae``, ``rmse`` and ``bias`` (mean of
        prediction minus truth, so positive means over-prediction) and the
        int ``n_samples`` actually scored. When no valid pair remains the
        three metrics are NaN and ``n_samples`` is 0.
    """
    # Coerce to float so object-dtype columns (e.g. containing None) work too.
    y_true = df["y_temp"].to_numpy(dtype=float)
    y_pred = df["yhat_temp"].to_numpy(dtype=float)

    valid = ~(np.isnan(y_true) | np.isnan(y_pred))
    n_samples = int(valid.sum())
    if n_samples == 0:
        # Nothing to score: report NaN instead of failing on an empty array.
        nan = float("nan")
        return {"mae": nan, "rmse": nan, "bias": nan, "n_samples": 0}

    # All three metrics are derived from the same signed error vector.
    errors = y_pred[valid] - y_true[valid]
    mae = float(np.abs(errors).mean())
    rmse = float(np.sqrt(np.square(errors).mean()))
    bias = float(errors.mean())

    return {"mae": mae, "rmse": rmse, "bias": bias, "n_samples": n_samples}

# Run-wide settings read by the pipeline below.
CONFIG = dict(
    # Forecast horizons to evaluate (the pipeline reports them as hours).
    horizon=[3, 6, 12, 24],
    # Presumably the frost cut-off in degrees C; not read in this file.
    frost_threshold=0.0,
    # NOTE: both original CSV files must be reachable at these paths.
    train_file_path='/content/train_set_filled_w_Mean_cleaned.csv',
    test_file_path='/content/test_set_filled_w_Mean_cleaned.csv',
)

# --- Tuned hyperparameters (regression only) ---
# Best regressor settings found previously, keyed by forecast horizon.
# Horizons 3, 6 and 12 use the same values; horizon 24 has its own set.
_SHORT_HORIZON_REG_PARAMS = {
    'alpha': 0.05808,
    'colsample_bytree': 0.94647,
    'lambda': 0.60111,
    'learning_rate': 0.15161,
    'max_depth': 8,
    'n_estimators': 102,
    'subsample': 0.98796,
}

TUNED_REG_PARAMS = {
    # Every horizon gets its own copy so editing one entry never affects another.
    **{horizon: dict(_SHORT_HORIZON_REG_PARAMS) for horizon in (3, 6, 12)},
    24: {
        'alpha': 0.23089,
        'colsample_bytree': 0.69641,
        'lambda': 0.68326,
        'learning_rate': 0.13199,
        'max_depth': 9,
        'n_estimators': 84,
        'subsample': 0.96372,
    },
}


# ==========================================
# 2. DATA PREPARATION FUNCTIONS
# ==========================================

def load_file(full_path):
    """Read the CSV at ``full_path`` into a DataFrame.

    Raises:
        FileNotFoundError: If nothing exists at ``full_path``.
    """
    if os.path.exists(full_path):
        return pd.read_csv(full_path)
    raise FileNotFoundError(f"Could not find file at: {full_path}")

def prepare_base_features(df):
    """Build the features that do not depend on the forecast horizon.

    Works on a copy of ``df``: parses ``datetime`` (unparseable values become
    NaT), orders rows by station and time, adds a sine/cosine encoding of the
    hour of day, and adds per-station lagged copies of ``air_temp_c`` and
    ``dew_point_c``. Lags are counted in rows, not in elapsed time.

    Returns:
        The enriched frame with every row that contains any missing value
        removed (this includes the first rows of each station, whose lags
        are undefined).
    """
    frame = df.copy()
    frame['datetime'] = pd.to_datetime(frame['datetime'], errors='coerce')
    frame = frame.sort_values(['station_id', 'datetime'])

    # Hour of day mapped onto a circle so 23:00 and 00:00 end up adjacent.
    hour_angle = 2 * np.pi * frame['datetime'].dt.hour / 24
    frame['hour_sin'] = np.sin(hour_angle)
    frame['hour_cos'] = np.cos(hour_angle)

    # (column prefix, source column) pairs that receive lagged copies.
    lag_sources = (('temp', 'air_temp_c'), ('dew', 'dew_point_c'))
    for lag in (1, 3, 6):
        for prefix, column in lag_sources:
            # Shift inside each station so values never leak across stations.
            frame[f'{prefix}_lag_{lag}'] = frame.groupby('station_id')[column].shift(lag)

    return frame.dropna()

def generate_targets(df_base, h):
    """Attach the regression target ``y_temp`` for forecast horizon ``h``.

    For every row, ``y_temp`` is the minimum ``air_temp_c`` over a forward
    window of ``h`` consecutive rows of the same station, starting at the row
    itself. Rows whose window would run past the end of the station's series
    are dropped, so every remaining target covers the full horizon.

    Args:
        df_base: Feature frame containing ``station_id``, ``datetime`` and
            ``air_temp_c``. It is not modified.
        h: Window length in rows (one row per hour is assumed by the
            callers' use of "hours" — TODO confirm the sampling is regular).

    Returns:
        A copy of ``df_base`` sorted by station and time with the extra
        ``y_temp`` column, rows with any missing value removed and a fresh
        0..n-1 index.
    """
    df = df_base.copy()
    df = df.sort_values(['station_id', 'datetime'])
    indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=h)

    # NOTE(review): the forward window begins at the current row, so the
    # target includes the present reading, which the pipeline also feeds in
    # as the `air_temp_c` feature. Confirm this is the intended definition.
    #
    # min_periods=h (not 1): near the end of a station's series the window is
    # clipped, and a minimum over fewer than h rows is not an h-row target.
    # Those rows become NaN here and are removed by the dropna() below.
    df['y_temp'] = df.groupby('station_id')['air_temp_c'].transform(
        lambda x: x.rolling(window=indexer, min_periods=h).min()
    )
    # We only need y_temp for the regression generalization test
    return df.dropna().reset_index(drop=True)

# ==========================================
# 3. LOOCV PIPELINE
# ==========================================

def run_pipeline_leave_one_station_out():
    """Evaluate the tuned XGBoost regressor with leave-one-station-out validation.

    The train and test CSVs named in ``CONFIG`` are concatenated and turned
    into features. For every horizon in ``CONFIG['horizon']`` that has an
    entry in ``TUNED_REG_PARAMS``, each station is held out in turn: a fresh
    regressor is fitted on all other stations and used to predict the
    held-out one. The held-out predictions of all stations are pooled into
    one set of metrics per horizon. Progress and the final table are printed.

    Returns:
        A DataFrame with one row per evaluated horizon (``mae``, ``rmse``,
        ``bias``, ``n_samples``, ``horizon_h``, ``test_type``); it is empty
        when no horizon could be evaluated. ``None`` is returned when either
        input file is missing.

    Raises:
        ValueError: If fewer than two stations remain after feature
            preparation, since nothing would be left to train on.
    """
    # A. Load and combine data
    try:
        train_base = load_file(CONFIG['train_file_path'])
        test_base = load_file(CONFIG['test_file_path'])
    except FileNotFoundError as e:
        print(f"Error loading data: {e}")
        return None

    # Features are prepared once on the combined set; the split happens per
    # station inside the loop below.
    print("Combining train and test data...")
    full_data = pd.concat([train_base, test_base], ignore_index=True)
    full_processed_base = prepare_base_features(full_data)

    all_stations = full_processed_base['station_id'].unique()
    num_stations = len(all_stations)
    print(f"Found {num_stations} unique stations: {all_stations}")

    if num_stations < 2:
        raise ValueError(
            "Leave-one-station-out evaluation needs at least 2 stations, "
            f"found {num_stations}."
        )

    # Model inputs (the dew-point lags built earlier are not used here).
    features = ['air_temp_c', 'rel_hum_percent', 'dew_point_c', 'wind_speed_m_s',
                'hour_sin', 'hour_cos', 'temp_lag_1', 'temp_lag_3', 'temp_lag_6']

    all_generalization_results = []

    for h_current in CONFIG['horizon']:
        print(f"\n{'='*60}\nProcessing for horizon: {h_current} hours\n{'='*60}")

        reg_params = TUNED_REG_PARAMS.get(h_current)
        if not reg_params:
            # Say so explicitly instead of dropping the horizon silently.
            print(f"No tuned regressor parameters for horizon {h_current}; skipping.")
            continue

        # Targets depend on the horizon, so they are rebuilt per horizon.
        data_h_targets = generate_targets(full_processed_base, h_current)

        station_predictions = []

        # --- Leave-one-station-out loop ---
        for i, station_to_test in enumerate(all_stations):
            print(f"|--- Iteration {i+1}/{num_stations}: Leaving out Station {station_to_test} for testing.")

            is_held_out = data_h_targets['station_id'] == station_to_test
            train_set = data_h_targets[~is_held_out]
            test_set = data_h_targets[is_held_out]

            if train_set.empty or test_set.empty:
                # Nothing to fit on or nothing to score for this station.
                print(f"|--- Station {station_to_test} skipped: no usable rows for this horizon.")
                continue

            X_train, y_train = train_set[features], train_set['y_temp']
            X_test, y_test = test_set[features], test_set['y_temp']

            # 1. Fit a fresh regressor on every station except the held-out one.
            reg_xgb = xgb.XGBRegressor(**reg_params, n_jobs=-1, random_state=42)
            reg_xgb.fit(X_train, y_train)

            # 2. Predict on the station the model has never seen.
            y_pred = reg_xgb.predict(X_test)

            # 3. Keep the held-out predictions for the pooled metrics.
            result_df = pd.DataFrame({
                'station_id': station_to_test,
                'horizon_h': h_current,
                'y_temp': y_test.values,
                'yhat_temp': y_pred
            })
            station_predictions.append(result_df)

            metrics_single = frost_metrics_regression_only(result_df)
            print(f"|--- Station {station_to_test} Generalization MAE: {metrics_single['mae']:.4f}")

        if not station_predictions:
            # pd.concat would raise on an empty list.
            print(f"No station could be evaluated for horizon {h_current}; skipping.")
            continue

        # --- Pool all held-out predictions into one metric row per horizon ---
        combined_results = pd.concat(station_predictions, ignore_index=True)
        final_metrics = frost_metrics_regression_only(combined_results)
        final_metrics['horizon_h'] = h_current
        final_metrics['test_type'] = 'Generalization (LOOCV)'
        all_generalization_results.append(final_metrics)

    # Final output
    final_df = pd.DataFrame(all_generalization_results)

    print("\n" + "="*70)
    print("FINAL RESULTS: XGBOOST REGRESSION GENERALIZATION (LEAVE-ONE-STATION-OUT)")
    print("Test Type: Predicting on a fully UNSEEN Station for each iteration.")
    print("="*70)
    if final_df.empty:
        # Selecting the metric columns from an empty frame would raise KeyError.
        print("No horizons were evaluated.")
    else:
        print(final_df[['horizon_h', 'mae', 'rmse', 'bias', 'n_samples']].to_string(index=False))

    return final_df

# Run the evaluation only when this file is executed directly. The summary is
# bound at module level so it stays available for inspection after the run.
if __name__ == "__main__":
    final_summary_xgb_loocv = run_pipeline_leave_one_station_out()