In [None]:
#!/usr/bin/env python3
"""
Fixed Support Vector Regression Baseline Implementation
Uses pixel-wise prediction approach for spatial field reconstruction
"""

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import joblib
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from skimage.metrics import structural_similarity as ssim
from scipy.ndimage import uniform_filter
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Resolve the project root: the mounted Google Drive folder when running in
# Colab, otherwise the current working directory.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    PROJECT_PATH = Path('/content/drive/My Drive/AR_Downscaling')
except Exception:
    # Best-effort fallback (import or mount failed). `Exception` rather than a
    # bare `except` so KeyboardInterrupt / SystemExit still propagate.
    PROJECT_PATH = Path('.')

# Input dataset directory and the directory where this experiment writes its
# model and result files (created on import if it does not exist yet).
DATA_DIR = PROJECT_PATH / 'final_dataset_multi_variable'
OUTPUT_DIR = PROJECT_PATH / 'publication_experiments' / 'svr_baseline'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def load_sample_data(n_samples=None, pixel_stride=5):
    """Load training data for pixel-wise SVR training.

    Every ``*_predictor.npy`` file under ``DATA_DIR / 'train'`` is paired with
    the ``*_target.npy`` file of the same stem. The predictor's first axis is
    treated as the feature (channel) axis; all remaining axes are flattened
    into pixels, and every ``pixel_stride``-th pixel is kept.

    Args:
        n_samples: Maximum number of predictor files to load (in sorted
            order). ``None`` loads all of them.
        pixel_stride: Keep every ``pixel_stride``-th pixel of each sample.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_pixels, n_channels)`` and
        ``y`` has shape ``(n_pixels,)``.

    Raises:
        ValueError: If ``pixel_stride`` is smaller than 1, or a predictor and
            its target do not contain the same number of pixels.
        FileNotFoundError: If no predictor files are selected.
    """
    if pixel_stride < 1:
        raise ValueError(f"pixel_stride must be >= 1, got {pixel_stride}")

    # Load training data
    train_dir = DATA_DIR / 'train'
    predictor_files = sorted(train_dir.glob('*_predictor.npy'))

    if n_samples is not None:
        predictor_files = predictor_files[:n_samples]

    if not predictor_files:
        # Fail with a clear message instead of an opaque np.vstack error below.
        raise FileNotFoundError(
            f"No '*_predictor.npy' training files selected from {train_dir}"
        )

    print(f"📂 Loading {len(predictor_files)} samples for pixel-wise SVR training...")

    X_data = []
    y_data = []

    for pred_file in tqdm(predictor_files, desc="Loading training samples"):
        # Load predictor and its matching target. Only the file name is
        # rewritten, so directory names are never touched by the replace.
        pred_data = np.load(pred_file)
        target_file = pred_file.with_name(
            pred_file.name.replace('_predictor.npy', '_target.npy')
        )
        target_data = np.load(target_file)

        # One row per pixel, one column per predictor channel. The channel
        # count is taken from the data rather than hard-coded.
        pred_reshaped = pred_data.reshape(pred_data.shape[0], -1).T
        target_reshaped = target_data.reshape(-1)

        if len(pred_reshaped) != len(target_reshaped):
            # Without this check a size mismatch would silently misalign
            # features and targets (or fail with an IndexError).
            raise ValueError(
                f"Pixel count mismatch for {pred_file.name}: "
                f"{len(pred_reshaped)} predictor pixels vs "
                f"{len(target_reshaped)} target pixels"
            )

        # Subsample pixels for efficiency. Fancy indexing copies the selected
        # rows, so the full-size arrays are not kept alive by the lists.
        pixel_indices = np.arange(0, len(target_reshaped), pixel_stride)

        X_data.append(pred_reshaped[pixel_indices])
        y_data.append(target_reshaped[pixel_indices])

    X_combined = np.vstack(X_data)
    y_combined = np.hstack(y_data)

    print(f"Combined data shape: X={X_combined.shape}, y={y_combined.shape}")
    return X_combined, y_combined

def train_svr_model(max_pixels=100000, random_state=42):
    """Train the pixel-wise RBF SVR baseline with a small grid search.

    Loads all training samples via ``load_sample_data``, draws a random
    subset of pixels, runs a 3-fold ``GridSearchCV`` over ``C``, ``epsilon``
    and ``gamma``, and saves the best estimator to
    ``OUTPUT_DIR / 'svr_baseline.joblib'``.

    Args:
        max_pixels: Upper bound on the number of pixel samples used for
            fitting.
        random_state: Seed for the pixel subsampling so runs are
            reproducible. Pass ``None`` for a non-deterministic draw.

    Returns:
        Tuple ``(best_estimator, stats)`` where ``stats`` is the object loaded
        from ``normalization_stats_multi_variable.joblib``.
    """

    print("🎯 Training Support Vector Regression Baseline (Pixel-wise)")
    print("=" * 60)

    # Load the normalization stats first: a missing or corrupt stats file then
    # fails immediately instead of after the expensive grid search.
    stats_file = DATA_DIR / 'normalization_stats_multi_variable.joblib'
    stats = joblib.load(stats_file)

    # Load training data from all available training samples
    X_train, y_train = load_sample_data(n_samples=None)

    # Subsample pixels for SVR efficiency, using a seeded generator so the
    # selected subset (and therefore the trained model) is reproducible.
    n_train = min(max_pixels, len(X_train))
    rng = np.random.default_rng(random_state)
    indices = rng.choice(len(X_train), n_train, replace=False)
    X_train_sub = X_train[indices]
    y_train_sub = y_train[indices]

    # The image count is not hard-coded in the message: it depends on the
    # dataset actually found on disk.
    print(f"Training SVR on {len(X_train_sub)} pixel samples drawn from {len(X_train)} loaded training pixels...")

    # Simplified hyperparameter grid for faster training
    param_grid = {
        'C': [0.1, 1.0, 10.0],
        'epsilon': [0.1, 0.2],
        'gamma': ['scale', 0.01]
    }

    # Train SVR
    print("🔍 Training SVR with grid search...")
    # NOTE(review): max_iter=1000 caps the solver iterations, so fits may stop
    # before converging; the file silences warnings globally, so a convergence
    # warning would not be visible. Confirm this cap is intended.
    svr = SVR(kernel='rbf', max_iter=1000)

    grid_search = GridSearchCV(
        svr,
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_sub, y_train_sub)

    print(f"✅ Best parameters: {grid_search.best_params_}")
    # The scorer is negated MSE, so flip the sign to report a positive MSE.
    print(f"✅ Best CV score: {-grid_search.best_score_:.4f}")

    # Save the model
    model_path = OUTPUT_DIR / 'svr_baseline.joblib'
    joblib.dump(grid_search.best_estimator_, model_path)

    print(f"💾 Model saved to {model_path}")

    return grid_search.best_estimator_, stats

def predict_spatial_field(svr_model, predictor_data):
    """Run a pixel-wise model over a 3-D predictor stack.

    ``predictor_data`` is unpacked as ``(channels, height, width)``. Each
    pixel becomes one row of per-channel features, the rows are passed to
    ``svr_model.predict`` in a single call, and the flat result is folded
    back into a ``(height, width)`` field.
    """
    channels, height, width = predictor_data.shape

    # One row per pixel, one column per channel.
    pixel_features = predictor_data.reshape(channels, -1).T

    flat_prediction = svr_model.predict(pixel_features)
    return flat_prediction.reshape(height, width)

def calculate_csi(pred, target, threshold=220.0):
    """Return the Critical Success Index for values at or below ``threshold``.

    An "event" is any element ``<= threshold``. The score is
    ``hits / (hits + misses + false_alarms)``; when there are no events in
    either field the function returns ``0.0``.
    """
    forecast_event = pred <= threshold
    observed_event = target <= threshold

    hits = np.sum(forecast_event & observed_event)
    misses = np.sum(observed_event & ~forecast_event)
    false_alarms = np.sum(forecast_event & ~observed_event)

    denominator = hits + misses + false_alarms
    if denominator > 0:
        return hits / denominator
    return 0.0

def calculate_fss(pred, target, threshold=220.0, window_size=11):
    """Return the Fractions Skill Score for values at or below ``threshold``.

    Both fields are binarised (``<= threshold``), smoothed with a
    ``window_size`` uniform filter to obtain neighbourhood fractions, and
    compared as ``1 - MSE / reference`` where the reference is the sum of the
    mean squared fractions. A zero reference yields ``1.0``.
    """
    forecast_frac, observed_frac = (
        uniform_filter((field <= threshold).astype(float), size=window_size)
        for field in (pred, target)
    )

    fraction_mse = np.mean((forecast_frac - observed_frac) ** 2)
    reference_mse = np.mean(forecast_frac ** 2) + np.mean(observed_frac ** 2)

    if reference_mse > 0:
        return 1 - (fraction_mse / reference_mse)
    return 1.0

def denormalize(data, stats):
    """Map normalized values back using ``stats``.

    Reads ``target_std`` (default ``1.0``) and ``target_mean`` (default
    ``0.0``) via ``stats.get`` and returns
    ``data * (target_std + 1e-8) + target_mean``.
    """
    scale = stats.get('target_std', 1.0) + 1e-8
    offset = stats.get('target_mean', 0.0)
    return data * scale + offset

def _compute_sample_metrics(pred_dn, target_dn):
    """Compute the per-sample metric dict for one prediction/target pair.

    Both arguments are the denormalized 2-D fields. Returns a dict with the
    keys ``rmse``, ``mae``, ``r2``, ``correlation``, ``ssim``, ``csi``,
    ``fss`` and ``csi_210`` / ``csi_220`` / ``csi_230``.
    """
    # Flatten once and reuse; ravel avoids repeated copies of the same data.
    target_flat = target_dn.ravel()
    pred_flat = pred_dn.ravel()

    data_range = target_dn.max() - target_dn.min()

    # A NaN correlation (e.g. from a constant field) or a failure inside
    # corrcoef is reported as 0.0.
    try:
        correlation = np.corrcoef(target_flat, pred_flat)[0, 1]
    except (ValueError, FloatingPointError):
        correlation = 0.0
    else:
        if np.isnan(correlation):
            correlation = 0.0

    metrics = {
        'rmse': np.sqrt(mean_squared_error(target_flat, pred_flat)),
        'mae': mean_absolute_error(target_flat, pred_flat),
        'r2': r2_score(target_flat, pred_flat),
        'correlation': correlation,
        # SSIM is skipped (reported as 1.0) when the target has zero range.
        'ssim': ssim(target_dn, pred_dn, data_range=data_range, win_size=7) if data_range > 0 else 1.0,
        'csi': calculate_csi(pred_dn, target_dn),
        'fss': calculate_fss(pred_dn, target_dn)
    }

    for temp in [210, 220, 230]:
        metrics[f'csi_{temp}'] = calculate_csi(pred_dn, target_dn, threshold=temp)

    return metrics


def evaluate_svr_model():
    """Evaluate the saved SVR model on the test set.

    Loads ``OUTPUT_DIR / 'svr_baseline.joblib'`` and the normalization stats,
    predicts every ``*_predictor.npy`` sample under ``DATA_DIR / 'test'``,
    denormalizes prediction and target, and computes per-sample metrics.
    Summary statistics (mean, std, median, min, max, count) of each metric
    are written to ``OUTPUT_DIR / 'svr_detailed_results.csv'`` and printed.

    Samples that raise during processing are reported and skipped.

    Returns:
        The summary dict, or ``None`` if the model file is missing or no
        sample could be evaluated.
    """

    print("\n📊 Evaluating SVR on test set...")

    # Load model and stats
    model_path = OUTPUT_DIR / 'svr_baseline.joblib'
    if not model_path.exists():
        print("❌ SVR model not found. Please train first.")
        return None

    svr_model = joblib.load(model_path)
    stats_file = DATA_DIR / 'normalization_stats_multi_variable.joblib'
    stats = joblib.load(stats_file)

    # Load test data
    test_dir = DATA_DIR / 'test'
    predictor_files = sorted(test_dir.glob('*_predictor.npy'))

    print(f"Evaluating on {len(predictor_files)} test samples...")

    all_metrics = []

    for pred_file in tqdm(predictor_files, desc="Evaluating samples"):
        # Deliberately broad: one bad sample must not abort the whole
        # evaluation; the failure is reported and the sample is skipped.
        try:
            pred_data = np.load(pred_file)
            target_file = Path(str(pred_file).replace('_predictor.npy', '_target.npy'))
            target_data = np.load(target_file)

            # Predict spatial field
            predicted_field = predict_spatial_field(svr_model, pred_data)

            # Denormalize for physical metrics
            pred_dn = denormalize(predicted_field, stats)
            target_dn = denormalize(target_data, stats)

            all_metrics.append(_compute_sample_metrics(pred_dn, target_dn))

        except Exception as e:
            print(f"⚠️ Error processing {pred_file.name}: {str(e)}")
            continue

    if not all_metrics:
        print("❌ No successful evaluations completed")
        return None

    # Compute summary statistics
    print(f"\n📈 Computing summary statistics from {len(all_metrics)} successful evaluations...")
    metrics_df = pd.DataFrame(all_metrics)

    final_results = {
        'name': 'svr_baseline',
        'category': 'traditional_ml',
        'description': 'Support Vector Regression with RBF Kernel',
        'status': 'success',
        'input_channels': 5
    }

    for metric in metrics_df.columns:
        stats_summary = metrics_df[metric].describe()
        final_results[f'{metric}_mean'] = stats_summary['mean']
        final_results[f'{metric}_std'] = stats_summary['std']
        final_results[f'{metric}_median'] = stats_summary['50%']
        final_results[f'{metric}_min'] = stats_summary['min']
        final_results[f'{metric}_max'] = stats_summary['max']
        final_results[f'{metric}_count'] = int(stats_summary['count'])

    # Save results
    final_df = pd.DataFrame([final_results])
    results_path = OUTPUT_DIR / 'svr_detailed_results.csv'
    final_df.to_csv(results_path, index=False)

    # Display results
    print("\n--- FINAL SVR BASELINE RESULTS ---")
    for col in final_df.columns:
        print(f"{col:<20}: {final_df[col].iloc[0]}")

    print(f"\n✅ Results saved to {results_path}")

    return final_results

def run_complete_svr_experiment():
    """Train the SVR baseline, evaluate it, and print a summary.

    Returns the results dict produced by ``evaluate_svr_model``. Returns
    ``None`` when training or evaluation raises; the exception is printed,
    not propagated.
    """
    rule = "=" * 70

    print("🚀 SUPPORT VECTOR REGRESSION BASELINE EXPERIMENT")
    print(rule)

    # Stage 1: training. The returned model and stats are not used here;
    # evaluation reloads the saved model from disk.
    try:
        _model, _stats = train_svr_model()
        print("✅ SVR training completed successfully!")
    except Exception as exc:
        print(f"❌ SVR training failed: {str(exc)}")
        return None

    # Stage 2: evaluation.
    try:
        results = evaluate_svr_model()
        print(
            "✅ SVR evaluation completed successfully!"
            if results
            else "⚠️ SVR evaluation completed with issues"
        )
    except Exception as exc:
        print(f"❌ SVR evaluation failed: {str(exc)}")
        return None

    print("\n" + rule)
    print("🎉 SVR BASELINE EXPERIMENT COMPLETED!")
    print(rule)

    if results:
        print("\n📊 KEY RESULTS:")
        headline_metrics = (
            ("SSIM", "ssim"),
            ("Correlation", "correlation"),
            ("CSI", "csi"),
            ("FSS", "fss"),
        )
        for label, key in headline_metrics:
            print(f"  {label}: {results[f'{key}_mean']:.3f} ± {results[f'{key}_std']:.3f}")

        # Reference values the measured means are printed against.
        paper_claims = (
            ("SSIM", "ssim", 0.887),
            ("Correlation", "correlation", 0.921),
            ("CSI", "csi", 0.012),
        )
        print("\n🔍 VALIDATION AGAINST PAPER CLAIMS:")
        for label, key, expected in paper_claims:
            print(f"  {label}: Expected {expected:.3f}, Got {results[f'{key}_mean']:.3f}")

    print(f"\n📁 All outputs saved to: {OUTPUT_DIR}")

    return results

if __name__ == "__main__":
    # Script entry point: run training + evaluation, then report whether a
    # results dict came back.
    results = run_complete_svr_experiment()

    print(
        "\n🎊 SVR BASELINE READY FOR COMPREHENSIVE EVALUATION!"
        if results
        else "\n⚠️ SVR experiment completed with issues. Check logs above."
    )

Mounted at /content/drive
🚀 SUPPORT VECTOR REGRESSION BASELINE EXPERIMENT
🎯 Training Support Vector Regression Baseline (Pixel-wise)
📂 Loading 1200 samples for pixel-wise SVR training...


Loading training samples: 100%|██████████| 1200/1200 [04:43<00:00,  4.24it/s]


Combined data shape: X=(21145200, 5), y=(21145200,)
Training SVR on 100000 pixel samples from ALL 1200 training images...
🔍 Training SVR with grid search...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
✅ Best parameters: {'C': 1.0, 'epsilon': 0.2, 'gamma': 0.01}
✅ Best CV score: 158.1168
💾 Model saved to /content/drive/My Drive/AR_Downscaling/publication_experiments/svr_baseline/svr_baseline.joblib
✅ SVR training completed successfully!

📊 Evaluating SVR on test set...
Evaluating on 150 test samples...


Evaluating samples: 100%|██████████| 150/150 [18:47<00:00,  7.52s/it]



📈 Computing summary statistics from 150 successful evaluations...

--- FINAL SVR BASELINE RESULTS ---
name                : svr_baseline
category            : traditional_ml
description         : Support Vector Regression with RBF Kernel
status              : success
input_channels      : 5
rmse_mean           : 128.3721335476796
rmse_std            : 48.10959965138197
rmse_median         : 114.99524073242266
rmse_min            : 32.32572389960285
rmse_max            : 262.47591615840906
rmse_count          : 150
mae_mean            : 116.58302261899223
mae_std             : 51.105274346056056
mae_median          : 102.54400018602169
mae_min             : 26.403133483944593
mae_max             : 261.9155447538069
mae_count           : 150
r2_mean             : -19.577696883728734
r2_std              : 61.85530263241797
r2_median           : -0.7232468872234958
r2_min              : -550.2601274040269
r2_max              : 0.02154379543104068
r2_count            : 150
correlation_mean