In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.metrics import (roc_curve, auc, roc_auc_score, 
                             precision_recall_fscore_support)
from sklearn.model_selection import (train_test_split, GridSearchCV, 
                                     KFold, StratifiedKFold, cross_val_score, 
                                     cross_val_predict)
from sklearn.preprocessing import label_binarize, LabelEncoder, LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import (SelectKBest, f_classif, VarianceThreshold, 
                                       SelectFromModel)
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
import itertools

from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import copy


# One-vs-One (OvO) and One-vs-Rest (OvR) AUC evaluation function

In [18]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import itertools

def split_classes(X, y):
    """Build one two-class subset of (X, y) for every unordered pair of labels.

    Parameters
    ----------
    X : container that supports boolean-mask row selection via ``X[mask]``.
    y : array-like of labels supporting element-wise ``==`` and mask indexing.

    Returns
    -------
    dict
        Maps ``(c1, c2)`` to ``(X_pair, y_pair)``, where both elements contain
        only the samples whose label is ``c1`` or ``c2``. Pairs are generated
        from ``np.unique(y)``, so labels inside each key are in sorted order.
    """
    pairwise_subsets = {}
    for first, second in itertools.combinations(np.unique(y), 2):
        # Rows belonging to either class of the current pair.
        in_pair = (y == first) | (y == second)
        pairwise_subsets[(first, second)] = (X[in_pair], y[in_pair])
    return pairwise_subsets

def ovo_and_ova_multiclass_auc(X, y, base_clf, p_grid, random_state):
    """Estimate nested-CV ROC AUCs under One-vs-Rest and One-vs-One schemes.

    Both schemes use the same nested cross-validation layout: a 5-fold
    stratified inner loop inside ``GridSearchCV`` for hyper-parameter
    selection, and a 5-fold stratified outer loop inside
    ``cross_val_predict`` that produces out-of-fold class probabilities.
    All AUCs are computed from those out-of-fold probabilities.

    Parameters
    ----------
    X : feature matrix. It must support boolean-mask row selection
        (``X[mask]``), because the One-vs-One part goes through
        ``split_classes``.
    y : array-like of class labels; encoded internally with ``LabelEncoder``.
    base_clf : estimator exposing ``predict_proba``. It is wrapped in
        ``OneVsRestClassifier`` for OvR and used directly for each OvO pair.
    p_grid : dict
        Parameter grid addressed *through* ``OneVsRestClassifier``, i.e. keys
        start with ``"estimator__"``. That leading prefix is removed to
        obtain the grid used for the bare ``base_clf`` in the OvO part.
    random_state : seed passed to both ``StratifiedKFold`` splitters.

    Returns
    -------
    dict
        ``"<class> vs Rest"`` for every class, ``"OvR Macro AUC"``,
        ``"OvR Micro AUC"``, ``"<c1> vs <c2>"`` for every class pair,
        ``"OvO Macro AUC"`` (unweighted mean of the pairwise AUCs) and
        ``"OvO Micro AUC"`` (AUC of all pairwise out-of-fold decisions
        pooled into a single ROC curve).
    """
    results = {}
    le = LabelEncoder()
    y = le.fit_transform(y)
    class_names = le.classes_

    # Stratified K-Folds: inner loop tunes, outer loop produces predictions.
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    ####################
    # One-vs-Rest Classification
    ####################
    print("Performing One vs Rest classification")
    ovr_clf = GridSearchCV(
        estimator=OneVsRestClassifier(base_clf),
        param_grid=p_grid,
        cv=inner_cv,
        scoring="roc_auc_ovr"
    )
    y_score = cross_val_predict(ovr_clf, X, y, cv=outer_cv, method="predict_proba")

    # Calculate AUC for each class against the indicator matrix of y.
    y_bin = LabelBinarizer().fit_transform(y)
    ovr_auc = roc_auc_score(y_bin, y_score, multi_class="ovr", average=None)
    for idx, auc_val in enumerate(ovr_auc):
        print(f"AUC for class '{class_names[idx]}': {auc_val:.4f}")
        results[f"{class_names[idx]} vs Rest"] = auc_val

    # Calculate macro and micro AUC for OvR
    macro_ovr_auc = roc_auc_score(y_bin, y_score, multi_class="ovr", average="macro")
    micro_ovr_auc = roc_auc_score(y_bin, y_score, multi_class="ovr", average="micro")
    results["OvR Macro AUC"] = macro_ovr_auc
    results["OvR Micro AUC"] = micro_ovr_auc
    print(f"Macro AUC (OvR): {macro_ovr_auc:.4f}")
    print(f"Micro AUC (OvR): {micro_ovr_auc:.4f}")

    ####################
    # One-vs-One Classification
    ####################
    print("Performing One vs One classification")

    # Strip only the *leading* OneVsRestClassifier prefix. A blanket
    # str.replace would also eat nested "estimator__" segments, e.g.
    # "estimator__feature_selection__estimator__max_depth".
    ovr_prefix = "estimator__"
    ovo_grid = {
        (key[len(ovr_prefix):] if key.startswith(ovr_prefix) else key): values
        for key, values in p_grid.items()
    }

    ovo_auc = {}
    pooled_true = []   # binary targets of every pair, for the micro average
    pooled_score = []  # matching out-of-fold scores for the second class

    for (c1, c2), (X_subset, y_subset) in split_classes(X, y).items():
        ovo_clf = GridSearchCV(
            estimator=base_clf,
            param_grid=ovo_grid,
            cv=inner_cv,
            scoring="roc_auc"
        )
        pair_score = cross_val_predict(
            ovo_clf, X_subset, y_subset, cv=outer_cv, method="predict_proba"
        )
        # split_classes yields c1 < c2, so c2 is the positive class and its
        # probability is the second column of the pairwise prediction.
        y_binary = (y_subset == c2).astype(int)
        fpr, tpr, _ = roc_curve(y_binary, pair_score[:, 1])
        auc_val = auc(fpr, tpr)

        # Decode labels
        c1_name, c2_name = le.inverse_transform([c1, c2])
        results[f"{c1_name} vs {c2_name}"] = auc_val
        ovo_auc[(c1, c2)] = auc_val

        pooled_true.append(y_binary)
        pooled_score.append(pair_score[:, 1])

    # Macro: unweighted average of the AUC over all class pairs.
    macro_ovo_auc = np.mean(list(ovo_auc.values()))
    # Micro: roc_auc_score only implements average="micro" for
    # multi_class="ovr" and raises ValueError for "ovo". Pool every pairwise
    # out-of-fold decision into one ROC curve instead; this also keeps the
    # figure based on the same tuned, nested-CV predictions as the rest.
    micro_ovo_auc = roc_auc_score(
        np.concatenate(pooled_true), np.concatenate(pooled_score)
    )
    results["OvO Macro AUC"] = macro_ovo_auc
    results["OvO Micro AUC"] = micro_ovo_auc
    print(f"Macro AUC (OvO): {macro_ovo_auc:.4f}")
    print(f"Micro AUC (OvO): {micro_ovo_auc:.4f}")

    return results


In [16]:
from sklearn.datasets import make_classification
import pandas as pd

# Synthetic 3-class problem with far more features than samples; most of
# the 1000 columns are neither informative nor redundant.
dataset_params = dict(
    n_samples=150,      # rows
    n_features=1000,    # total columns
    n_informative=20,   # columns that carry class signal
    n_redundant=200,    # linear combinations of the informative columns
    n_classes=3,        # target classes
    random_state=42,    # fixed for reproducibility
)
features_array, target_array = make_classification(**dataset_params)

# Wrap the raw arrays in pandas containers for the downstream cells.
X = pd.DataFrame(features_array)
y = pd.Series(target_array, name="target")

In [19]:
# Nested-CV AUCs for every (seed, k) combination:
# seed_results[seed][k] -> dict returned by ovo_and_ova_multiclass_auc.
seed_results = {}

seeds = [1,2,3]
ks = [10,100]

# Parameter grid for the RandomForestClassifier in the 'classification' step.
# It does not depend on seed or k, so it is built once. Keys are addressed
# through OneVsRestClassifier ("estimator__") and then the pipeline step.
p_grid = {
    "estimator__classification__n_estimators": [100],          # Number of trees in the forest
    "estimator__classification__max_features": ["sqrt"],       # Feature selection strategy
    "estimator__classification__criterion": ["entropy"],       # Split criterion
    "estimator__classification__min_samples_leaf": [3],        # Minimum samples per leaf
}

for seed in seeds:
    ks_results = {}
    for k in ks:

        print(f"CV for seed {seed} and {k} features")

        # Final classifier of the pipeline.
        rf = RandomForestClassifier(n_estimators=100, random_state=seed)

        # Importance-based feature selector with its own forest, so the
        # selector and the classifier do not share one estimator object.
        # threshold=-np.inf disables the default importance cut-off, so that
        # exactly k features are kept (max_features alone is only an upper bound).
        selector = SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=seed),
            max_features=k,
            threshold=-np.inf,
        )

        # Feature selection happens inside the pipeline, so it is re-fitted
        # on the training part of every CV fold.
        pipeline = Pipeline(steps=[
            ('feature_selection', selector),
            ('classification', rf)
        ])

        results = ovo_and_ova_multiclass_auc(X,y,pipeline, p_grid, random_state=seed)

        print(results)

        ks_results[k] = results

    # ks_results is a fresh dict for each seed, so no copy is needed.
    seed_results[seed] = ks_results
 


CV for seed 1 and 10 features
Performing One vs Rest classification
AUC for class '0': 0.7946
AUC for class '1': 0.7396
AUC for class '2': 0.7322
Macro AUC (OvR): 0.7555
Micro AUC (OvR): 0.7548
Performing One vs One classification


In [14]:
# Collapse seed_results[seed][k] -> metrics into a single mapping keyed by
# the (seed, k) tuple, preserving the original iteration order.
flat_results = {}
for seed_key, per_k in seed_results.items():
    for k_key, metrics in per_k.items():
        flat_results[(seed_key, k_key)] = metrics

# Tuple keys become two-level columns; transpose so that each (seed, k)
# combination is a row and each metric is a column.
df = pd.DataFrame(flat_results).T

# Name the two index levels for clarity
df.index.names = ['Seed', 'Features (k)']

# Display the DataFrame
print(df)

                   0 vs Rest  1 vs Rest  2 vs Rest  0 vs 1  0 vs 2  1 vs 2
Seed Features (k)                                                         
1    2                   1.0     0.9858     0.9858     1.0     1.0  0.9736
     3                   1.0     0.9858     0.9858     1.0     1.0  0.9736
2    2                   1.0     0.9894     0.9894     1.0     1.0  0.9768
     3                   1.0     0.9894     0.9894     1.0     1.0  0.9768
3    2                   1.0     0.9787     0.9687     1.0     1.0  0.9512
     3                   1.0     0.9787     0.9687     1.0     1.0  0.9512
