In [1]:
import pmlb

from cleanlab_runner import ClassificationEvaluator, RegressionEvaluator, IssueHandler

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Benchmark tables are pulled from PMLB. Each table below maps the short alias
# used throughout this notebook to the dataset name that pmlb.fetch_data expects.
_classification_sources = {
    "iris": 'iris',
    "wine": 'wine_recognition',
    "breast_cancer": 'breast_cancer',
    "spambase": 'spambase',
    "adult": 'adult',
    "connect_4": 'connect_4',
    # "cars": 'cars',  # currently excluded from the benchmark
}

_regression_sources = {
    "cpu": '197_cpu_act',
    "esl": '1027_ESL',
    "satellite": '294_satellite_image',
    "mv": '344_mv',
}

# Fetch every table once, in declaration order, keyed by its alias.
classification_datasets = {
    alias: pmlb.fetch_data(pmlb_name)
    for alias, pmlb_name in _classification_sources.items()
}

regression_datasets = {
    alias: pmlb.fetch_data(pmlb_name)
    for alias, pmlb_name in _regression_sources.items()
}

In [4]:
# Estimators to benchmark, keyed by the label printed while the loops run.
# Estimators that accept a seed are pinned to random_state=42 so reruns
# produce the same fits.
classification_models = dict(
    logistic_regression=LogisticRegression(max_iter=1000),
    random_forest=RandomForestClassifier(n_estimators=100, random_state=42),
    knn=KNeighborsClassifier(n_neighbors=5),
    decision_tree=DecisionTreeClassifier(random_state=42),
)

regression_models = dict(
    linear_regression=LinearRegression(),
    # ridge_regression=Ridge(alpha=1.0),   # disabled
    lasso_regression=Lasso(alpha=0.1),
    random_forest=RandomForestRegressor(n_estimators=100, random_state=42),
    # svr=SVR(kernel='rbf'),               # disabled
)

In [13]:
import pandas as pd
import os

# CSV that the evaluator's log_results() writes to. This cell then patches the
# most recent row with the before/after comparison columns.
log_path = "training_log.csv"

# For every (dataset, classifier) pair: score the model on the raw data, score
# it again on the data after IssueHandler removed flagged rows, and record both
# scores plus their difference in the log.
for ds_name, ds in classification_datasets.items():
    for model_name, model in classification_models.items():
        print(f"Evaluating {model_name} on {ds_name}")

        # 1. Original dataset evaluation (on a copy, so the shared table in
        #    classification_datasets is never handed out directly).
        evaluator_orig = ClassificationEvaluator(ds_name, ds.copy(), model, task='classification')
        evaluator_orig.train()
        evaluator_orig.evaluate()
        original_metric = round(evaluator_orig.metric, 4)

        # 2. Cleaned dataset evaluation + logging.
        #    The cleaner also gets its own copy: the same `ds` object is reused
        #    for every model in the inner loop, so any in-place change made by
        #    IssueHandler would otherwise leak into later iterations.
        # NOTE(review): clean() does not receive the model, so its result looks
        # invariant across the inner loop; it could be computed once per
        # dataset — confirm IssueHandler is deterministic before hoisting.
        cleaned, issues = IssueHandler(ds.copy(), task='classification', method='remove').clean()
        evaluator_clean = ClassificationEvaluator(ds_name, cleaned, model, task='classification')
        evaluator_clean.train()
        evaluator_clean.evaluate()
        evaluator_clean.log_results(path=log_path)
        cleaned_metric = round(evaluator_clean.metric, 4)

        # 3. Compute improvement (positive = score went up after cleaning).
        improvement = round(cleaned_metric - original_metric, 4)

        # 4. Update the last row in the log.
        #    Presumably log_results() appended exactly one row for this run —
        #    the patch below relies on that; verify against cleanlab_runner.
        df = pd.read_csv(log_path)
        if not df.empty:
            last_row = df.index[-1]
            comparison = {
                'original_metric': original_metric,
                'cleaned_metric': cleaned_metric,
                'improvement': improvement,
            }
            # Scalar .loc writes create the column on first use.
            for column, value in comparison.items():
                df.loc[last_row, column] = value
            df.to_csv(log_path, index=False)


Evaluating logistic_regression on iris
[Classification] iris - LogisticRegression: Accuracy=1.0000 | CV=0.9667±0.0094
Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Finding non_iid issues ...
Finding class_imbalance issues ...

Audit complete. 9 issues found in the dataset.
[Classification] iris - LogisticRegression: Accuracy=1.0000 | CV=1.0000±0.0000
Evaluating random_forest on iris
[Classification] iris - RandomForestClassifier: Accuracy=1.0000 | CV=0.9667±0.0094
Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Finding non_iid issues ...
Finding class_imbalance issues ...

Audit complete. 9 issues found in the dataset.
[Classification] iris - RandomForestClassifier: Accuracy=1.0000 | CV=1.0000±0.0000
Evaluating knn on iris
[Classification] iris - KNeighborsClassifier: Accuracy=1.0000 | CV=0.9667±0.0189
Finding label issues ...
Finding outlier issues ...
Fitting OOD es

In [14]:
# CSV that the evaluator's log_results() writes to. This cell then patches the
# most recent row with the before/after comparison columns.
log_path = "training_log.csv"

# For every (dataset, regressor) pair: score the model on the raw data, score
# it again on the data after IssueHandler removed flagged rows, and record both
# scores plus the improvement in the log.
for ds_name, ds in regression_datasets.items():
    for model_name, model in regression_models.items():
        print(f"Evaluating {model_name} on {ds_name}")

        # 1. Original dataset evaluation (on a copy, so the shared table in
        #    regression_datasets is never handed out directly).
        evaluator_orig = RegressionEvaluator(ds_name, ds.copy(), model, task='regression')
        evaluator_orig.train()
        evaluator_orig.evaluate()
        original_metric = round(evaluator_orig.metric, 4)

        # 2. Cleaned dataset evaluation + logging.
        #    The cleaner also gets its own copy: the same `ds` object is reused
        #    for every model in the inner loop, so any in-place change made by
        #    IssueHandler would otherwise leak into later iterations.
        cleaned, issues = IssueHandler(ds.copy(), task='regression', method='remove').clean()
        evaluator_clean = RegressionEvaluator(ds_name, cleaned, model, task='regression')
        evaluator_clean.train()
        evaluator_clean.evaluate()
        evaluator_clean.log_results(path=log_path)
        cleaned_metric = round(evaluator_clean.metric, 4)

        # 3. Compute improvement.
        #    The evaluator reports its metric as RMSE in the run output, i.e.
        #    an error where lower is better, so the gain is original - cleaned.
        #    This keeps "positive = better" consistent with the classification
        #    rows that share the same log column.
        #    NOTE(review): confirm RegressionEvaluator.metric is the RMSE value.
        improvement = round(original_metric - cleaned_metric, 4)

        # 4. Update the last row in the log.
        #    Presumably log_results() appended exactly one row for this run —
        #    the patch below relies on that; verify against cleanlab_runner.
        df = pd.read_csv(log_path)
        if not df.empty:
            last_row = df.index[-1]
            comparison = {
                'original_metric': original_metric,
                'cleaned_metric': cleaned_metric,
                'improvement': improvement,
            }
            # Scalar .loc writes create the column on first use.
            for column, value in comparison.items():
                df.loc[last_row, column] = value
            df.to_csv(log_path, index=False)



Evaluating linear_regression on cpu
[Regression] cpu - LinearRegression: RMSE=8.8765 | CV=9.7359±2.6888
[Regression] cpu - LinearRegression: RMSE=0.9780 | CV=0.9671±0.2731
Evaluating lasso_regression on cpu
[Regression] cpu - Lasso: RMSE=8.8762 | CV=9.7375±2.6745
[Regression] cpu - Lasso: RMSE=0.9869 | CV=0.9911±0.3120
Evaluating random_forest on cpu
[Regression] cpu - RandomForestRegressor: RMSE=2.4607 | CV=2.4984±0.5599
[Regression] cpu - RandomForestRegressor: RMSE=3.1241 | CV=2.4444±1.4733
Evaluating linear_regression on esl
[Regression] esl - LinearRegression: RMSE=0.5269 | CV=0.5333±0.0461
[Regression] esl - LinearRegression: RMSE=0.5040 | CV=0.4975±0.1712
Evaluating lasso_regression on esl
[Regression] esl - Lasso: RMSE=0.5442 | CV=0.5463±0.1302
[Regression] esl - Lasso: RMSE=0.5259 | CV=0.5106±0.2087
Evaluating random_forest on esl
[Regression] esl - RandomForestRegressor: RMSE=0.6371 | CV=0.5832±0.2065
[Regression] esl - RandomForestRegressor: RMSE=0.5194 | CV=0.5471±0.2111
Ev

In [15]:
# Reload the log from disk so the table shows the comparison columns that the
# benchmark loops wrote back to the CSV.
training_log = pd.read_csv(log_path)
# Bare last expression: rendered as this cell's output.
training_log

Unnamed: 0,dataset,task,model,metric,cv_mean,cv_std,train_time,original_metric,cleaned_metric,improvement
0,iris,classification,LogisticRegression,1.0,1.0,0.0,0.021519,1.0,1.0,0.0
1,iris,classification,RandomForestClassifier,1.0,1.0,0.0,0.121836,1.0,1.0,0.0
2,iris,classification,KNeighborsClassifier,1.0,1.0,0.0,0.002115,1.0,1.0,0.0
3,iris,classification,DecisionTreeClassifier,1.0,1.0,0.0,0.003124,1.0,1.0,0.0
4,wine,classification,LogisticRegression,0.942857,0.959064,0.036049,0.161757,0.9722,0.9429,-0.0293
5,wine,classification,RandomForestClassifier,0.971429,0.982456,0.014325,0.132591,1.0,0.9714,-0.0286
6,wine,classification,KNeighborsClassifier,0.714286,0.684211,0.074432,0.002457,0.7222,0.7143,-0.0079
7,wine,classification,DecisionTreeClassifier,0.971429,0.888889,0.057892,0.004205,0.9444,0.9714,0.027
8,breast_cancer,classification,LogisticRegression,0.939394,0.926936,0.025478,0.008861,0.7931,0.9394,0.1463
9,breast_cancer,classification,RandomForestClassifier,0.939394,0.945342,0.025554,0.18549,0.7069,0.9394,0.2325
