In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, make_scorer
from scipy.stats import loguniform
import re

# ---------------------------------------------------
# 1. Load Data
# ---------------------------------------------------
# Both workbooks are read straight from their Google Drive download URLs.
labeled_path = "https://drive.google.com/uc?id=15WUl8ilQyE2MRXjnO-V2x7rJXNMxI7qI"
unlabeled_path = "https://drive.google.com/uc?id=1J57A0_EdtHW4yZU1Kus6ymekxM4DzZrp"

index_col, target_col = "index", "label"

# "Unnamed: 0" is the name pandas gives a column without a header; it is
# renamed so it can be dropped from the features and reused in the output.
# Labeled rows containing any missing value are discarded.
df_labeled = (
    pd.read_excel(labeled_path, engine="openpyxl")
      .rename(columns={"Unnamed: 0": index_col})
      .dropna()
)
# NOTE(review): the unlabeled sheet is NOT passed through dropna(); a missing
# feature value there would surface later at prediction time — confirm the
# sheet is complete.
df_unlabeled = (
    pd.read_excel(unlabeled_path, engine="openpyxl")
      .rename(columns={"Unnamed: 0": index_col})
)

X = df_labeled.drop(columns=[index_col, target_col])
# A label column that contained NaN is read as float64 and stays float64
# after dropna(); np.bincount refuses float input, so cast to int once here.
y = df_labeled[target_col].astype(int)
X_unlabeled = df_unlabeled.drop(columns=[index_col])

print("Data shape:", X.shape)
print("Class distribution:", np.bincount(y))

# ---------------------------------------------------
# 2. Preprocessing
# ---------------------------------------------------
# Column groups: one-hot encoded, standardized, and passed through unchanged.
cat_cols = ["x2", "x3", "x4"]
num_cols = ["x15", "x16", "x17", "x18", "x19", "x20", "x21"]

# Every feature that is neither categorical nor numerical is passed through.
_typed_cols = set(cat_cols) | set(num_cols)
bin_cols = [col for col in X.columns if col not in _typed_cols]

# handle_unknown="ignore" lets the encoder accept categories at transform
# time that were not present when it was fitted.
_column_steps = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols),
    ("bin", "passthrough", bin_cols),
]
preprocess = ColumnTransformer(transformers=_column_steps)

# ---------------------------------------------------
# 3. Balanced Error Rate
# ---------------------------------------------------
def balanced_error_rate(y_true, y_pred):
    """Return the mean per-class error rate (1 - recall, averaged over classes).

    Rows of the confusion matrix correspond to true classes. A class with no
    true samples yields a 0/0 division, which becomes NaN and is skipped by
    the NaN-aware mean.
    """
    matrix = confusion_matrix(y_true, y_pred)
    correct_per_class = np.diag(matrix)
    support_per_class = matrix.sum(axis=1)
    # Silence the divide warnings for empty rows; the NaN result is handled below.
    with np.errstate(divide="ignore", invalid="ignore"):
        recall_per_class = correct_per_class / support_per_class
        error_per_class = 1.0 - recall_per_class
    return np.nanmean(error_per_class)


# Lower BER is better, so the scorer negates it for model selection.
ber_scorer = make_scorer(balanced_error_rate, greater_is_better=False)

# ---------------------------------------------------
# 4. PROPER NESTED CROSS-VALIDATION
# ---------------------------------------------------
# Outer folds estimate generalization error; inside each outer training fold
# a randomized search with its own CV picks the hyperparameters, so the outer
# test fold never influences model selection.
print("\n" + "="*60)
print("PROPER NESTED CROSS-VALIDATION")
print("="*60)

n_outer_splits = 5
n_inner_splits = 4

# Pipeline: preprocessing -> RFE driven by a linear SVM -> RBF SVM classifier.
rfe_svm_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("selector", RFE(
        estimator=SVC(kernel="linear", class_weight="balanced", random_state=42),
        step=1
    )),
    ("clf", SVC(
        kernel="rbf",
        class_weight="balanced",
        probability=True,  # needed for predict_proba / threshold tuning
        cache_size=1000,
        random_state=42
    ))
])

# Search space: discrete feature counts, log-uniform C and gamma.
param_dist = {
    "selector__n_features_to_select": [8, 10, 12, 15, 18],
    "clf__C": loguniform(1e0, 1e3),
    "clf__gamma": loguniform(1e-4, 1e-1),
}

# OUTER CV: For final evaluation
outer_cv = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=42)

# Per-fold results. outer_probas is positionally aligned with X / y: slot i
# holds the out-of-fold positive-class probability of row i (NaN until the
# fold containing row i has been evaluated).
outer_scores = []
outer_probas = np.full(len(y), np.nan)
outer_best_params = []

print(f"Running {n_outer_splits}-fold outer CV (each fold has inner {n_inner_splits}-fold CV for hyperparam tuning)...")

for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), 1):
    print(f"\n--- Outer Fold {fold}/{n_outer_splits} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # INNER CV: Hyperparameter tuning on training fold.
    # The fixed random_state means every outer fold samples the same
    # candidate parameter settings.
    inner_search = RandomizedSearchCV(
        rfe_svm_pipeline,
        param_distributions=param_dist,
        n_iter=10,
        scoring=ber_scorer,
        cv=n_inner_splits,  # Inner CV
        n_jobs=-1,
        verbose=0,
        random_state=42
    )

    inner_search.fit(X_train, y_train)

    # Best pipeline found by the inner search, plus its parameters.
    best_inner_model = inner_search.best_estimator_
    outer_best_params.append(inner_search.best_params_)

    # Evaluate on the outer test fold using the model's own predict().
    y_pred = best_inner_model.predict(X_test)
    fold_score = balanced_error_rate(y_test, y_pred)
    outer_scores.append(fold_score)

    # Positive-class probabilities for threshold tuning later; fall back to
    # the hard predictions if the model exposes no predict_proba.
    if hasattr(best_inner_model, 'predict_proba'):
        y_proba = best_inner_model.predict_proba(X_test)[:, 1]
    else:
        y_proba = y_pred.astype(float)

    # test_idx are positions into X / y, so one fancy-index assignment keeps
    # the probabilities aligned with the labels.
    outer_probas[test_idx] = y_proba

# Convert scores to regular floats for better printing
outer_scores_clean = [float(score) for score in outer_scores]
print(f"\nOuter CV scores: {outer_scores_clean}")
print(f"Mean Outer CV BER: {np.mean(outer_scores_clean):.4f}")
print(f"Std Outer CV BER: {np.std(outer_scores_clean):.4f}")

# Keep only the positions that actually received an out-of-fold probability,
# and pick the matching labels with the SAME mask so that probabilities and
# labels stay aligned even if some entries are missing.
_oof_all = np.asarray(outer_probas, dtype=float)
_has_proba = ~np.isnan(_oof_all)
outer_probas_array = _oof_all[_has_proba]
_y_oof = np.asarray(y)[:_oof_all.size][_has_proba]

# ---------------------------------------------------
# 5. THRESHOLD OPTIMIZATION ON OUTER CV PROBABILITIES
# ---------------------------------------------------
# NOTE: the threshold is chosen on the same pooled out-of-fold predictions
# that the reported BER is computed on, so "BER at optimal threshold" is an
# optimistically biased estimate (it is a minimum over 81 candidates).
print("\n" + "="*60)
print("THRESHOLD OPTIMIZATION ON OUTER CV PROBABILITIES")
print("="*60)

# Candidate thresholds 0.10, 0.11, ..., 0.90.
thresholds = np.linspace(0.1, 0.9, 81)
best_ber_nested = 1.0
best_thresh_nested = 0.5

# Strict "<" keeps the lowest threshold among ties; 0.5 is retained only if
# no candidate achieves a BER below 1.0.
for t in thresholds:
    preds = (outer_probas_array >= t).astype(int)
    ber = balanced_error_rate(_y_oof, preds)
    if ber < best_ber_nested:
        best_ber_nested = ber
        best_thresh_nested = t

print(f"Optimal threshold from NESTED CV: {best_thresh_nested:.3f}")
print(f"BER at optimal threshold: {best_ber_nested:.4f}")

# ---------------------------------------------------
# 6. FINAL TRAINING WITH BEST HYPERPARAMETERS
# ---------------------------------------------------
# Summarize the per-fold winning hyperparameters two ways (most frequent
# combination and per-parameter median), then refit on all labeled data
# using the medians.
print("\n" + "="*60)
print("FINAL MODEL TRAINING")
print("="*60)

# defaultdict is still imported in case later cells rely on it.
from collections import Counter, defaultdict

# Per-fold values of each tuned parameter. The .get defaults only apply if a
# key is absent from a fold's best_params_.
n_features_values = [int(p.get('selector__n_features_to_select', 15)) for p in outer_best_params]
C_values = [float(p.get('clf__C', 10.0)) for p in outer_best_params]
gamma_values = [float(p.get('clf__gamma', 0.01)) for p in outer_best_params]

# Count identical combinations; floats are rounded so that tiny
# representation differences do not split otherwise equal settings.
param_freq = Counter(
    (n_feat, round(c_val, 2), round(g_val, 6))
    for n_feat, c_val, g_val in zip(n_features_values, C_values, gamma_values)
)

# most_common(1) returns the first-encountered combination among ties.
most_common_key, most_common_count = param_freq.most_common(1)[0]
n_features_common, C_common, gamma_common = most_common_key

print(f"Most common parameter combination (across {len(outer_best_params)} folds):")
print(f"  n_features_to_select: {n_features_common}")
print(f"  C: {C_common}")
print(f"  gamma: {gamma_common}")
print(f"  Frequency: {most_common_count}/{len(outer_best_params)}")

# Alternative: per-parameter medians.
# NOTE(review): the medians are taken independently, so the resulting
# (n_features, C, gamma) triple is not guaranteed to be a combination that
# was ever evaluated together.
n_features_final = int(np.median(n_features_values))
C_final = float(np.median(C_values))
gamma_final = float(np.median(gamma_values))

print(f"\nMedian parameter values (alternative approach):")
print(f"  n_features_to_select: {n_features_final}")
print(f"  C: {C_final:.2f}")
print(f"  gamma: {gamma_final:.6f}")

# Choose which approach to use (here we'll use median)
print(f"\nUsing MEDIAN parameter values for final model...")

# Same architecture as the search pipeline, with the chosen values fixed.
final_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("selector", RFE(
        estimator=SVC(kernel="linear", class_weight="balanced", random_state=42),
        n_features_to_select=n_features_final,
        step=1
    )),
    ("clf", SVC(
        kernel="rbf",
        class_weight="balanced",
        probability=True,
        cache_size=1000,
        random_state=42,
        C=C_final,
        gamma=gamma_final
    ))
])

print("Training final model on all labeled data...")
final_pipeline.fit(X, y)

# ---------------------------------------------------
# 7. MAKE PREDICTIONS
# ---------------------------------------------------
# Score the unlabeled rows with the final pipeline and binarize the
# positive-class probability at the threshold selected from the outer folds.
print("Making predictions on unlabeled data...")
final_proba = final_pipeline.predict_proba(X_unlabeled)[:, 1]
final_preds = (final_proba >= best_thresh_nested).astype(int)

# Count and share of each predicted class.
_n_predictions = len(final_preds)
print(f"\nPrediction distribution:")
for _label_value in (0, 1):
    _n_label = sum(final_preds == _label_value)
    print(f"  Class {_label_value}: {_n_label} ({_n_label/_n_predictions*100:.1f}%)")

# Write the row identifier and predicted label, without the DataFrame index.
filename = "ProjectPredictions2025_SVM_RFE_Nested_CV.csv"
pred_df = df_unlabeled[[index_col]].assign(**{target_col: final_preds})
pred_df.to_csv(filename, index=False)

print(f"\nSaved Nested CV predictions to {filename}")

# ---------------------------------------------------
# 8. COMPARE WITH YOUR ORIGINAL RESULTS
# ---------------------------------------------------
# The three "original" figures are hard-coded results from earlier runs.
# best_ber_nested is the BER at the threshold selected on the pooled
# outer-fold probabilities.
_reference_cv_ber = 0.3454  # earlier CV BER at threshold 0.28
_similarity_margin = 0.01

_banner = "="*60
print("\n" + _banner)
print("COMPARISON WITH PREVIOUS RESULTS")
print(_banner)
print("Your original validation BER:           0.3330 (optimistic)")
print("Your original CV BER (threshold 0.5):  0.3459")
print("Your original CV BER (threshold 0.28): 0.3454")
print(f"Nested CV BER (proper evaluation):      {best_ber_nested:.4f}")

# Pick the verdict: strictly better, within the margin, or otherwise different.
if best_ber_nested < _reference_cv_ber:
    _verdict_lines = (
        f"\n✓ Nested CV shows TRUE performance: {best_ber_nested:.4f}",
        "  Submit Nested CV predictions",
    )
elif abs(best_ber_nested - _reference_cv_ber) < _similarity_margin:
    _verdict_lines = (
        "\n≈ Similar results",
        "  Either predictions are fine",
    )
else:
    _verdict_lines = (
        "\n⚠ Nested CV gives different result",
        "  Consider which evaluation is more trustworthy",
    )

for _line in _verdict_lines:
    print(_line)

print(_banner)

# ---------------------------------------------------
# 9. ADDITIONAL MODEL INFO (for report)
# ---------------------------------------------------
# Assemble the whole summary first, then emit it with a single print; the
# text written to stdout is the same as printing line by line.
_rule = "="*60
_report_lines = [
    "\n" + _rule,
    "MODEL INFORMATION FOR REPORT",
    _rule,
    "Model: SVM with RBF kernel and RFE feature selection",
    "Preprocessing:",
    "  - Categorical columns (x2, x3, x4): OneHotEncoder",
    "  - Numerical columns (x15-x21): StandardScaler",
    "  - Binary columns: Pass-through",
    f"Feature selection: RFE with linear SVM, selecting {n_features_final} features",
    "Hyperparameters:",
    f"  - C (regularization): {C_final:.2f}",
    f"  - gamma (kernel coefficient): {gamma_final:.6f}",
    f"  - Probability threshold: {best_thresh_nested:.3f}",
    "Cross-validation: 5-fold outer CV, 4-fold inner CV",
    f"Final BER estimate: {best_ber_nested:.4f}",
]
print("\n".join(_report_lines))

Data shape: (10000, 21)
Class distribution: [7503 2497]

PROPER NESTED CROSS-VALIDATION
Running 5-fold outer CV (each fold has inner 4-fold CV for hyperparam tuning)...

--- Outer Fold 1/5 ---

--- Outer Fold 2/5 ---

--- Outer Fold 3/5 ---

--- Outer Fold 4/5 ---

--- Outer Fold 5/5 ---

Outer CV scores: [0.3550231709254619, 0.3433589363937736, 0.36064066841210735, 0.334, 0.339]
Mean Outer CV BER: 0.3464
Std Outer CV BER: 0.0099

THRESHOLD OPTIMIZATION ON OUTER CV PROBABILITIES
Optimal threshold from NESTED CV: 0.280
BER at optimal threshold: 0.3461

FINAL MODEL TRAINING
Most common parameter combination (across 5 folds):
  n_features_to_select: 18
  C: 63.58
  gamma: 0.013311
  Frequency: 3/5

Median parameter values (alternative approach):
  n_features_to_select: 18
  C: 63.58
  gamma: 0.013311

Using MEDIAN parameter values for final model...
Training final model on all labeled data...
Making predictions on unlabeled data...

Prediction distribution:
  Class 0: 6569 (65.7%)
  Class