In [2]:
# Re-import necessary packages after code execution state reset
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, confusion_matrix
from sklearn.utils import resample
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

# Load the data
df = pd.read_csv("Cancer2025exam.csv")
X = df.drop(columns=["V1"])
y = df["V1"]

In [3]:


# Define a custom preprocessing wrapper to allow user-defined preprocessing
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, preprocessing_fn=None):
        self.preprocessing_fn = preprocessing_fn

    def fit(self, X, y=None):
        return self  # nothing to fit here

    def transform(self, X):
        if self.preprocessing_fn:
            return self.preprocessing_fn(X)
        return X

# Define a wrapper to enable class-balanced resampling inside the CV folds
class ResampleWrapper(BaseEstimator):
    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y):
        # Equal sampling: downsample all classes to match the smallest class
        df = pd.DataFrame(X)
        df['label'] = y.values if hasattr(y, 'values') else y

        min_class_size = df['label'].value_counts().min()
        balanced_df = pd.concat([
            resample(group, replace=False, n_samples=min_class_size, random_state=42)
            for _, group in df.groupby('label')
        ])
        X_balanced = balanced_df.drop(columns='label').values
        y_balanced = balanced_df['label'].values

        self.classifier_ = clone(self.classifier)
        self.classifier_.fit(X_balanced, y_balanced)
        return self

    def predict(self, X):
        return self.classifier_.predict(X)

# Define scorers for accuracy, sensitivity (macro recall), and specificity (macro TN rate)
def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    TN = []
    FP = []
    for i in range(len(cm)):
        tn = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])
        fp = cm[:, i].sum() - cm[i, i]
        TN.append(tn)
        FP.append(fp)
    specificity = np.mean([tn / (tn + fp) if (tn + fp) > 0 else 0 for tn, fp in zip(TN, FP)])
    return specificity

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'sensitivity_macro': make_scorer(recall_score, average='macro'),
    'specificity_macro': make_scorer(specificity_score)
}

# Set up a reusable pipeline factory
def create_pipeline(classifier, preprocessing_fn=None):
    return Pipeline([
        ('preprocess', CustomPreprocessor(preprocessing_fn)),
        ('scale', StandardScaler()),
        ('resample_clf', ResampleWrapper(classifier))
    ])




In [5]:
%pip install ace_tools


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\Nils\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
# Example usage (user provides a classifier, e.g., RandomForestClassifier())
clf = RandomForestClassifier(random_state=42)

# Create pipeline
pipeline = create_pipeline(clf)

# Cross-validate
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)

# Summarize
results_df = pd.DataFrame(cv_results)[['test_accuracy', 'test_sensitivity_macro', 'test_specificity_macro']]
import ace_tools as tools; tools.display_dataframe_to_user(name="CV Performance Summary", dataframe=results_df)

ModuleNotFoundError: No module named 'ace_tools'

In [7]:
restart channel = True

SyntaxError: invalid syntax (192839687.py, line 1)