In [None]:
# --- STANDALONE EVALUATION SCRIPT FOR STRONG RF BASELINE ---
# This script loads a pre-trained Random Forest model and evaluates it on the test set,
# calculating the full suite of summary statistics as requested.
# Includes robust data loading to handle inconsistent image dimensions.

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import joblib
from scipy.ndimage import zoom, uniform_filter
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skimage.metrics import structural_similarity as ssim

warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Resolve the project root: Google Drive when running inside Colab, otherwise
# the current working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable -> not running in Colab.
    PROJECT_PATH = Path('.') # For local execution
else:
    # Only the import is guarded: a failing mount must surface as an error
    # instead of silently switching to the local directory.
    drive.mount('/content/drive', force_remount=True)
    PROJECT_PATH = Path('/content/drive/My Drive/AR_Downscaling')

DATA_DIR = PROJECT_PATH / 'final_dataset_multi_variable'
MODEL_DIR = PROJECT_PATH / 'publication_experiments' / 'strong_baseline_rf'
# Results are written next to the trained model.
OUTPUT_DIR = MODEL_DIR

# Spatial size (rows, cols) every target field is cropped/padded to.
TARGET_SHAPE = (256, 256)
# Normalization statistics; denormalize() reads 'target_mean' and 'target_std'.
STATS = joblib.load(DATA_DIR / 'normalization_stats_multi_variable.joblib')

# --- FEATURE ENGINEERING and DATA PREPARATION (Corrected) ---

def extract_neighborhood_features(X_interp, window_size=5):
    """Build per-pixel features from the interpolated predictor grid.

    For every channel three features are produced: the raw value, the local
    mean and the local standard deviation inside a square spatial window.

    Args:
        X_interp: 4-D array indexed as (sample, channel, row, col).
        window_size: Side length of the square spatial window.

    Returns:
        float32 array of shape (num_samples * h * w, 3 * num_channels).
        Columns are laid out as [raw channels | local means | local stds];
        rows follow sample-major, then row-major pixel order.
    """
    print("Extracting spatial neighborhood features...")
    num_samples, num_channels, h, w = X_interp.shape
    num_features = num_channels * 3
    X_pixel_major = X_interp.transpose(0, 2, 3, 1).reshape(num_samples * h * w, num_channels)
    features = np.zeros((num_samples * h * w, num_features), dtype=np.float32)
    features[:, 0:num_channels] = X_pixel_major

    # X_interp[:, c] is (sample, row, col). A scalar `size` would make
    # uniform_filter average along the sample axis as well, mixing unrelated
    # samples into each neighbourhood. Size 1 on axis 0 keeps it purely spatial.
    # NOTE(review): a model trained on features computed with the old all-axes
    # window must be retrained with this definition before evaluation.
    spatial_window = (1, window_size, window_size)

    for c in tqdm(range(num_channels), desc="Calculating neighborhood stats"):
        channel_stack = X_interp[:, c]
        local_mean = uniform_filter(channel_stack, size=spatial_window)
        local_sq_mean = uniform_filter(channel_stack**2, size=spatial_window)
        # Var = E[x^2] - E[x]^2; clamp at 0 to absorb negative round-off.
        local_var = local_sq_mean - local_mean**2
        local_std = np.sqrt(np.maximum(local_var, 0))
        features[:, num_channels + c] = local_mean.flatten()
        features[:, (2 * num_channels) + c] = local_std.flatten()

    return features

def load_and_prepare_data(split='test'):
    """Load one dataset split, upsample the predictors and extract features.

    Every '*_predictor.npy' file in DATA_DIR / split is paired with the
    '*_target.npy' file of the same stem. Targets are centre-cropped (and
    zero-padded at the bottom/right if smaller) to TARGET_SHAPE; predictors
    are upsampled to TARGET_SHAPE with cubic spline interpolation.

    Args:
        split: Name of the sub-directory of DATA_DIR to load.

    Returns:
        Tuple (X_features, Y_high_res): the per-pixel feature matrix from
        extract_neighborhood_features and the target array of shape
        (num_samples, *TARGET_SHAPE).

    Raises:
        FileNotFoundError: If the split directory holds no predictor files.
        ValueError: If a predictor file's shape differs from the first one.
    """
    print(f"\n--- Preparing '{split}' data ---")
    split_dir = DATA_DIR / split
    predictor_files = sorted(split_dir.glob('*_predictor.npy'))
    if not predictor_files:
        # Fail with a clear message instead of an IndexError further down.
        raise FileNotFoundError(f"No '*_predictor.npy' files found in {split_dir}")

    num_samples = len(predictor_files)
    # The first file defines the expected (channels, rows, cols) layout.
    predictor_shape = np.load(predictor_files[0]).shape
    num_channels = predictor_shape[0]
    coarse_shape = predictor_shape[1:]
    X_coarse = np.zeros((num_samples, num_channels, *coarse_shape), dtype=np.float32)
    Y_high_res = np.zeros((num_samples, *TARGET_SHAPE), dtype=np.float32)
    th, tw = TARGET_SHAPE

    for i, pred_path in enumerate(tqdm(predictor_files, desc=f"Loading {split} files")):
        # Swap the suffix on the file name only, never on a directory name.
        targ_path = pred_path.with_name(pred_path.name.replace('_predictor.npy', '_target.npy'))

        predictor_data = np.load(pred_path)
        target_data = np.load(targ_path)

        if predictor_data.shape != predictor_shape:
            # A mismatching array could otherwise be broadcast silently.
            raise ValueError(
                f"{pred_path} has shape {predictor_data.shape}, expected {predictor_shape}"
            )

        # Centre-crop targets larger than TARGET_SHAPE.
        h, w = target_data.shape
        if h != th or w != tw:
            start_h = max(0, (h - th) // 2)
            start_w = max(0, (w - tw) // 2)
            target_data = target_data[start_h : start_h + th, start_w : start_w + tw]

        # Targets still smaller than TARGET_SHAPE are placed top-left in a zero canvas.
        if target_data.shape != TARGET_SHAPE:
            padded_target = np.zeros(TARGET_SHAPE, dtype=np.float32)
            padded_target[:target_data.shape[0], :target_data.shape[1]] = target_data
            target_data = padded_target

        X_coarse[i] = predictor_data
        Y_high_res[i] = target_data

    print("Interpolating coarse predictors...")
    X_interp = np.zeros((num_samples, num_channels, *TARGET_SHAPE), dtype=np.float32)
    zoom_factors = (TARGET_SHAPE[0] / coarse_shape[0], TARGET_SHAPE[1] / coarse_shape[1])
    for i in tqdm(range(num_samples), desc="Interpolating samples"):
        for c in range(num_channels):
            # order=3 -> cubic spline interpolation of each 2-D field.
            X_interp[i, c] = zoom(X_coarse[i, c], zoom_factors, order=3)

    X_features = extract_neighborhood_features(X_interp)
    return X_features, Y_high_res

# --- METRIC CALCULATION FUNCTIONS ---

def denormalize(data, stats):
    """Map normalized target values back to their original scale.

    Reads 'target_std' and 'target_mean' from `stats`; a 1e-8 epsilon is
    added to the standard deviation before scaling.
    """
    scale = stats['target_std'] + 1e-8
    offset = stats['target_mean']
    return data * scale + offset

def calculate_csi(pred, target, threshold=220.0):
    """Critical Success Index for the event "value <= threshold".

    CSI = hits / (hits + misses + false alarms). Returns 0.0 when neither
    field contains the event.
    """
    forecast_yes = pred <= threshold
    observed_yes = target <= threshold

    n_hits = np.sum(forecast_yes & observed_yes)
    n_misses = np.sum(~forecast_yes & observed_yes)
    n_false_alarms = np.sum(forecast_yes & ~observed_yes)

    denominator = n_hits + n_misses + n_false_alarms
    if denominator > 0:
        return n_hits / denominator
    return 0.0

def calculate_fss(pred, target, threshold=220.0, window_size=11):
    """Fractions Skill Score for the event "value <= threshold".

    Both fields are binarized, smoothed with a `window_size` uniform filter
    into neighbourhood event fractions, and scored as 1 - MSE / MSE_ref.
    Returns 1.0 when the reference term is zero (no event in either field).
    """
    forecast_mask = (pred <= threshold).astype(float)
    observed_mask = (target <= threshold).astype(float)

    forecast_frac = uniform_filter(forecast_mask, size=window_size)
    observed_frac = uniform_filter(observed_mask, size=window_size)

    mse = np.mean((forecast_frac - observed_frac) ** 2)
    reference = np.mean(forecast_frac ** 2) + np.mean(observed_frac ** 2)

    if reference > 0:
        return 1 - (mse / reference)
    return 1.0

# --- MAIN EVALUATION SCRIPT ---

def run_evaluation():
    """Evaluate the pre-trained Random Forest baseline on the test split.

    Checks that the model file exists, prepares the test features, predicts
    every pixel, computes per-sample metrics (RMSE, MAE, R2, correlation,
    SSIM, CSI, FSS) on denormalized fields, then prints and saves one row of
    summary statistics to 'strong_rf_final_detailed_results.csv' in OUTPUT_DIR.

    Returns:
        None. If the model file is missing a message is printed and the
        function returns early without touching the data.
    """
    # Fail fast: check the model before the expensive data preparation.
    model_path = MODEL_DIR / 'strong_baseline_rf.joblib'
    if not model_path.exists():
        print(f"FATAL: Trained RF model not found at {model_path}")
        print("Please run the training script first.")
        return

    # Prepare test data
    X_test_features, Y_test_high_res_norm = load_and_prepare_data('test')

    # Load the pre-trained Random Forest model
    print(f"\nLoading pre-trained RF model from {model_path}...")
    rf_model = joblib.load(model_path)

    # Generate predictions (one value per pixel), then restore the image layout
    print("Generating predictions with RF model...")
    Y_pred_flat = rf_model.predict(X_test_features)
    Y_pred_norm = Y_pred_flat.reshape(Y_test_high_res_norm.shape)

    # Calculate metrics for each sample
    print("\nCalculating metrics for each sample in the test set...")
    all_metrics = []
    sample_pairs = zip(Y_pred_norm, Y_test_high_res_norm)
    for pred_norm, true_norm in tqdm(sample_pairs, total=len(Y_pred_norm), desc="Evaluating samples"):
        pred_dn = denormalize(pred_norm, STATS)
        true_dn = denormalize(true_norm, STATS)

        # sklearn treats a 2-D array as (n_samples, n_outputs): r2_score would
        # average one R2 per image column, and a near-constant column can drive
        # it to huge negative values. Flatten so each pixel is one observation.
        true_flat = true_dn.ravel()
        pred_flat = pred_dn.ravel()

        data_range = true_dn.max() - true_dn.min()

        metrics = {
            'rmse': np.sqrt(mean_squared_error(true_flat, pred_flat)),
            'mae': mean_absolute_error(true_flat, pred_flat),
            'r2': r2_score(true_flat, pred_flat),
            'correlation': np.corrcoef(true_flat, pred_flat)[0, 1],
            # SSIM needs the 2-D fields; a constant truth field has no range to scale by.
            'ssim': ssim(true_dn, pred_dn, data_range=data_range, win_size=7) if data_range > 0 else 1.0,
            'csi': calculate_csi(pred_dn, true_dn),
            'fss': calculate_fss(pred_dn, true_dn)
        }
        all_metrics.append(metrics)

    # Compute final summary statistics in the requested format
    print("\nComputing final summary statistics...")
    metrics_df = pd.DataFrame(all_metrics)

    final_results = {
        'name': 'strong_rf_baseline',
        'category': 'operational_baseline',
        'description': 'Random Forest with Spatial Neighborhood Features',
        'status': 'success',
        'input_channels': 5
    }

    # One <metric>_<statistic> column per metric, taken from describe().
    for metric in metrics_df.columns:
        stats = metrics_df[metric].describe()
        final_results[f'{metric}_mean'] = stats['mean']
        final_results[f'{metric}_std'] = stats['std']
        final_results[f'{metric}_median'] = stats['50%']
        final_results[f'{metric}_min'] = stats['min']
        final_results[f'{metric}_max'] = stats['max']
        final_results[f'{metric}_count'] = int(stats['count'])

    final_df = pd.DataFrame([final_results])

    # Display and save results
    print("\n--- FINAL STRONG RF BASELINE RESULTS ---")
    for col in final_df.columns:
        print(f"{col:<20}: {final_df[col].iloc[0]}")

    output_csv_path = OUTPUT_DIR / 'strong_rf_final_detailed_results.csv'
    final_df.to_csv(output_csv_path, index=False)
    print(f"\n✅ Results saved to {output_csv_path}")

# Entry point: run the RF evaluation when this cell/script is executed directly.
if __name__ == "__main__":
    run_evaluation()

Mounted at /content/drive

--- Preparing 'test' data ---


Loading test files: 100%|██████████| 150/150 [00:04<00:00, 30.49it/s]


Interpolating coarse predictors...


Interpolating samples: 100%|██████████| 150/150 [00:05<00:00, 28.08it/s]


Extracting spatial neighborhood features...


Calculating neighborhood stats: 100%|██████████| 5/5 [00:03<00:00,  1.47it/s]



Loading pre-trained RF model from /content/drive/My Drive/AR_Downscaling/publication_experiments/strong_baseline_rf/strong_baseline_rf.joblib...
Generating predictions with RF model...


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    8.0s finished



Calculating metrics for each sample in the test set...


Evaluating samples: 100%|██████████| 150/150 [00:01<00:00, 117.93it/s]



Computing final summary statistics...

--- FINAL STRONG RF BASELINE RESULTS ---
name                : strong_rf_baseline
category            : operational_baseline
description         : Random Forest with Spatial Neighborhood Features
status              : success
input_channels      : 5
rmse_mean           : 365.6403597660159
rmse_std            : 15.540257149241755
rmse_median         : 363.37960629116475
rmse_min            : 337.79086138189666
rmse_max            : 425.8484042652887
rmse_count          : 150
mae_mean            : 147.4556671229599
mae_std             : 39.619337290937516
mae_median          : 139.56944490757704
mae_min             : 82.1775694017185
mae_max             : 282.2787620823265
mae_count           : 150
r2_mean             : -35.10850304799859
r2_std              : 105.74245078799508
r2_median           : -3.5071413276293097
r2_min              : -964.1057640718251
r2_max              : -0.05446853222253725
r2_count            : 150
correlation_mean    

In [None]:
# --- STANDALONE EVALUATION SCRIPT FOR MOS BASELINE ---
# This script loads a pre-trained MOS model and evaluates it on the test set,
# calculating a comprehensive suite of metrics and summary statistics.

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import joblib
from scipy.ndimage import zoom, uniform_filter
import warnings

# --- Imports for Evaluation ---
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skimage.metrics import structural_similarity as ssim

warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Resolve the project root: Google Drive when running inside Colab, otherwise
# the current working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable -> not running in Colab.
    PROJECT_PATH = Path('.') # For local execution
else:
    # Only the import is guarded: a failing mount must surface as an error
    # instead of silently switching to the local directory.
    drive.mount('/content/drive', force_remount=True)
    PROJECT_PATH = Path('/content/drive/My Drive/AR_Downscaling')

DATA_DIR = PROJECT_PATH / 'final_dataset_multi_variable'
MODEL_DIR = PROJECT_PATH / 'publication_experiments' / 'operational_baseline_mos'
# Results are written next to the trained model.
OUTPUT_DIR = MODEL_DIR

# Define the target high resolution
TARGET_SHAPE = (256, 256)
# Load normalization stats to denormalize for physical thresholds
STATS = joblib.load(DATA_DIR / 'normalization_stats_multi_variable.joblib')

# --- DATA LOADING AND PREPARATION FUNCTIONS ---

def load_data(split='test'):
    """Load the coarse predictors and high-resolution targets of one split.

    Every '*_predictor.npy' file in DATA_DIR / split is paired with the
    '*_target.npy' file of the same stem. Targets are centre-cropped (and
    zero-padded at the bottom/right if smaller) to TARGET_SHAPE.

    Args:
        split: Name of the sub-directory of DATA_DIR to load.

    Returns:
        Tuple (X_coarse, Y_high_res) of float32 arrays with shapes
        (num_samples, channels, coarse_rows, coarse_cols) and
        (num_samples, *TARGET_SHAPE).

    Raises:
        FileNotFoundError: If the split directory holds no predictor files.
        ValueError: If a predictor file's shape differs from the first one.
    """
    print(f"Loading '{split}' data...")
    split_dir = DATA_DIR / split
    predictor_files = sorted(split_dir.glob('*_predictor.npy'))
    if not predictor_files:
        # Fail with a clear message instead of an IndexError further down.
        raise FileNotFoundError(f"No '*_predictor.npy' files found in {split_dir}")

    num_samples = len(predictor_files)
    # The first file defines the expected (channels, rows, cols) layout.
    predictor_shape = np.load(predictor_files[0]).shape
    num_channels = predictor_shape[0]
    coarse_shape = predictor_shape[1:]

    X_coarse = np.zeros((num_samples, num_channels, *coarse_shape), dtype=np.float32)
    Y_high_res = np.zeros((num_samples, *TARGET_SHAPE), dtype=np.float32)
    th, tw = TARGET_SHAPE

    for i, pred_path in enumerate(tqdm(predictor_files, desc=f"Loading {split} files")):
        # Swap the suffix on the file name only, never on a directory name.
        targ_path = pred_path.with_name(pred_path.name.replace('_predictor.npy', '_target.npy'))
        predictor_data = np.load(pred_path)
        target_data = np.load(targ_path)

        if predictor_data.shape != predictor_shape:
            # A mismatching array could otherwise be broadcast silently.
            raise ValueError(
                f"{pred_path} has shape {predictor_data.shape}, expected {predictor_shape}"
            )

        # Centre-crop targets larger than TARGET_SHAPE.
        h, w = target_data.shape
        if h != th or w != tw:
            start_h = max(0, (h - th) // 2)
            start_w = max(0, (w - tw) // 2)
            target_data = target_data[start_h : start_h + th, start_w : start_w + tw]

        # Targets still smaller than TARGET_SHAPE are placed top-left in a zero canvas.
        if target_data.shape != TARGET_SHAPE:
            padded_target = np.zeros(TARGET_SHAPE, dtype=np.float32)
            padded_target[:target_data.shape[0], :target_data.shape[1]] = target_data
            target_data = padded_target

        X_coarse[i] = predictor_data
        Y_high_res[i] = target_data

    return X_coarse, Y_high_res

def interpolate_predictors(X_coarse, target_shape):
    """Upsample every channel of every sample to `target_shape`.

    Args:
        X_coarse: 4-D array indexed as (sample, channel, row, col).
        target_shape: (rows, cols) of the high-resolution grid.

    Returns:
        float32 array of shape (num_samples, num_channels, *target_shape),
        filled field by field with cubic spline (order=3) interpolation.
    """
    print("Interpolating predictors to high resolution...")
    n_samples, n_channels, coarse_h, coarse_w = X_coarse.shape
    # Per-axis scale factors for a single 2-D field.
    scale = (target_shape[0] / coarse_h, target_shape[1] / coarse_w)
    upsampled = np.zeros((n_samples, n_channels, *target_shape), dtype=np.float32)

    for sample_idx in tqdm(range(n_samples), desc="Interpolating samples"):
        for channel_idx, coarse_field in enumerate(X_coarse[sample_idx]):
            upsampled[sample_idx, channel_idx] = zoom(coarse_field, scale, order=3)

    return upsampled

def predict_with_mos(mos_model_grid, X_high_res_test):
    """Predict every pixel with the model stored for that pixel.

    Args:
        mos_model_grid: Container indexed as [row, col] whose entries expose
            a `predict` method.
        X_high_res_test: 4-D array indexed as (sample, channel, row, col).

    Returns:
        float32 array of shape (num_samples, rows, cols).
    """
    print("Generating predictions with MOS model...")
    n_samples, n_channels, height, width = X_high_res_test.shape
    n_pixels = height * width

    # Pixel-major layout: per_pixel_inputs[p] is the (sample, channel) matrix of pixel p.
    per_pixel_inputs = X_high_res_test.transpose(2, 3, 0, 1).reshape(n_pixels, n_samples, n_channels)
    per_pixel_preds = np.zeros((n_pixels, n_samples), dtype=np.float32)

    for flat_idx in tqdm(range(n_pixels), desc="Predicting pixel-wise"):
        row, col = divmod(flat_idx, width)
        pixel_model = mos_model_grid[row, col]
        per_pixel_preds[flat_idx] = pixel_model.predict(per_pixel_inputs[flat_idx])

    # Back to (sample, row, col).
    return per_pixel_preds.reshape(height, width, n_samples).transpose(2, 0, 1)

def denormalize(data, stats):
    """Undo target normalization using stats['target_std'] and stats['target_mean'].

    A 1e-8 epsilon is added to the standard deviation before scaling.
    """
    std_with_eps = stats['target_std'] + 1e-8
    rescaled = data * std_with_eps
    return rescaled + stats['target_mean']

def calculate_csi(pred, target, threshold=220.0):
    """Return the Critical Success Index for the event "value <= threshold".

    CSI = hits / (hits + misses + false alarms); 0.0 is returned when the
    event occurs in neither field.
    """
    pred_event, target_event = (pred <= threshold), (target <= threshold)
    # Contingency counts: both yes / observed only / predicted only.
    hits = np.sum(pred_event & target_event)
    misses = np.sum(~pred_event & target_event)
    false_alarms = np.sum(pred_event & ~target_event)
    total = hits + misses + false_alarms
    if not total > 0:
        return 0.0
    return hits / total

def calculate_fss(pred, target, threshold=220.0, window_size=11):
    """Return the Fractions Skill Score for the event "value <= threshold".

    Each field is binarized and smoothed with a `window_size` uniform filter
    into neighbourhood event fractions; the score is 1 - MSE / MSE_ref.
    1.0 is returned when the reference term is zero.
    """
    def neighbourhood_fraction(field):
        # Fraction of event pixels inside the window around each pixel.
        return uniform_filter((field <= threshold).astype(float), size=window_size)

    pred_fractions = neighbourhood_fraction(pred)
    target_fractions = neighbourhood_fraction(target)

    mse_fractions = np.mean((pred_fractions - target_fractions) ** 2)
    mse_fractions_ref = np.mean(pred_fractions ** 2) + np.mean(target_fractions ** 2)

    if not mse_fractions_ref > 0:
        return 1.0
    return 1 - (mse_fractions / mse_fractions_ref)


# --- MAIN EVALUATION FUNCTION ---

def run_evaluation():
    """Evaluate the pre-trained MOS baseline on the test split.

    Checks that the model file exists, loads and upsamples the test
    predictors, predicts every pixel with its own model, computes per-sample
    metrics (RMSE, MAE, R2, correlation, SSIM, CSI, FSS) on denormalized
    fields, then prints and saves one row of summary statistics to
    'mos_final_detailed_results.csv' in OUTPUT_DIR.

    Returns:
        None. If the model file is missing a message is printed and the
        function returns early without touching the data.
    """
    # Fail fast: check the model before loading and interpolating the data.
    model_path = MODEL_DIR / 'mos_model_grid.joblib'
    if not model_path.exists():
        print(f"FATAL: Trained model not found at {model_path}")
        print("Please run the training script first.")
        return

    # Load test data
    X_test_coarse, Y_test_high_res_norm = load_data('test')

    # Interpolate predictors
    X_test_interp = interpolate_predictors(X_test_coarse, TARGET_SHAPE)

    # Load the pre-trained MOS model
    print(f"Loading pre-trained MOS model from {model_path}...")
    mos_model_grid = joblib.load(model_path)

    # Generate predictions
    Y_pred_norm = predict_with_mos(mos_model_grid, X_test_interp)

    # Calculate metrics for each sample
    print("Calculating metrics for each sample in the test set...")
    all_metrics = []
    sample_pairs = zip(Y_pred_norm, Y_test_high_res_norm)
    for pred_norm, true_norm in tqdm(sample_pairs, total=len(Y_pred_norm), desc="Evaluating samples"):
        pred_dn = denormalize(pred_norm, STATS)
        true_dn = denormalize(true_norm, STATS)

        # sklearn treats a 2-D array as (n_samples, n_outputs): r2_score would
        # average one R2 per image column, and a near-constant column can drive
        # it to huge negative values. Flatten so each pixel is one observation.
        true_flat = true_dn.ravel()
        pred_flat = pred_dn.ravel()

        data_range = true_dn.max() - true_dn.min()

        metrics = {
            'rmse': np.sqrt(mean_squared_error(true_flat, pred_flat)),
            'mae': mean_absolute_error(true_flat, pred_flat),
            'r2': r2_score(true_flat, pred_flat),
            'correlation': np.corrcoef(true_flat, pred_flat)[0, 1],
            # SSIM needs the 2-D fields; a constant truth field has no range to scale by.
            'ssim': ssim(true_dn, pred_dn, data_range=data_range, win_size=7) if data_range > 0 else 1.0,
            'csi': calculate_csi(pred_dn, true_dn),
            'fss': calculate_fss(pred_dn, true_dn)
        }
        all_metrics.append(metrics)

    # Compute final summary statistics and format the output
    print("Computing final summary statistics...")
    metrics_df = pd.DataFrame(all_metrics)

    final_results = {
        'name': 'mos_baseline',
        'category': 'operational_baseline',
        'description': 'Model Output Statistics (Pixel-wise Linear Regression)',
        'status': 'success',
        'input_channels': 5
    }

    # One <metric>_<statistic> column per metric, taken from describe().
    for metric in metrics_df.columns:
        stats = metrics_df[metric].describe()
        final_results[f'{metric}_mean'] = stats['mean']
        final_results[f'{metric}_std'] = stats['std']
        final_results[f'{metric}_median'] = stats['50%']
        final_results[f'{metric}_min'] = stats['min']
        final_results[f'{metric}_max'] = stats['max']
        final_results[f'{metric}_count'] = int(stats['count'])

    final_df = pd.DataFrame([final_results])

    # Display and save results
    print("\n--- FINAL MOS BASELINE RESULTS ---")
    # Print in a more readable format
    for col in final_df.columns:
        print(f"{col:<20}: {final_df[col].iloc[0]}")

    output_csv_path = OUTPUT_DIR / 'mos_final_detailed_results.csv'
    final_df.to_csv(output_csv_path, index=False)
    print(f"\n✅ Results saved to {output_csv_path}")

# Entry point: run the MOS evaluation when this cell/script is executed directly.
if __name__ == "__main__":
    run_evaluation()

Mounted at /content/drive
Loading 'test' data...


Loading test files: 100%|██████████| 150/150 [00:01<00:00, 107.87it/s]


Interpolating predictors to high resolution...


Interpolating samples: 100%|██████████| 150/150 [00:05<00:00, 25.91it/s]


Loading pre-trained MOS model from /content/drive/My Drive/AR_Downscaling/publication_experiments/operational_baseline_mos/mos_model_grid.joblib...
Generating predictions with MOS model...


Predicting pixel-wise: 100%|██████████| 65536/65536 [00:07<00:00, 8991.57it/s]


Calculating metrics for each sample in the test set...


Evaluating samples: 100%|██████████| 150/150 [00:01<00:00, 113.83it/s]


Computing final summary statistics...

--- FINAL MOS BASELINE RESULTS ---
name                : mos_baseline
category            : operational_baseline
description         : Model Output Statistics (Pixel-wise Linear Regression)
status              : success
input_channels      : 5
rmse_mean           : 94.04356138872107
rmse_std            : 33.55237571799318
rmse_median         : 88.28466658731905
rmse_min            : 26.42124666609479
rmse_max            : 194.78690279379668
rmse_count          : 150
mae_mean            : 79.12677005767823
mae_std             : 31.31413909865684
mae_median          : 72.21223831176758
mae_min             : 20.08230209350586
mae_max             : 182.78323364257812
mae_count           : 150
r2_mean             : -16.378360295395055
r2_std              : 56.833652539966046
r2_median           : -1.671580970287323
r2_min              : -549.1316528320312
r2_max              : 0.1081676185131073
r2_count            : 150
correlation_mean    : 0.9785047