Demo of current progress with OpenDataVal

In [None]:
# Imports
import numpy as np
import torch
from matplotlib import pyplot as plt

Global state

In [None]:
from datetime import datetime
from opendataval.util import set_random_state
# All torch models in this notebook are moved onto this device via `.to(device)`.
device = torch.device("cpu")
# Seed once; the returned state object is handed to the fetcher and every evaluator below.
random_state = set_random_state(10)
# Timestamp used as a sub-directory name for saved outputs. Hour and minute are
# separated by "-" rather than ":" because ":" is not allowed in Windows path names.
date = datetime.now().strftime("%m-%d_%H-%M")

Loading Data

In [None]:
from opendataval.dataloader import DataFetcher, mix_labels

dataset_name = "iris"
noise_rate = 0.1

# Configure the fetcher step by step; each call's return value is what the next
# step is invoked on.
# NOTE(review): the positional `False` is presumably a force-download flag and
# (80, 30, 10) presumably train/valid/test counts — confirm against DataFetcher.
fetcher = DataFetcher(dataset_name, "../data_files/", False, random_state)
fetcher = fetcher.split_dataset_by_count(80, 30, 10)
fetcher = fetcher.noisify(mix_labels, noise_rate=noise_rate)

# Dataset dimensions; covar_dim and label_dim size the models defined further down.
num_points = fetcher.num_points
covar_dim = fetcher.covar_dim[0]
label_dim = fetcher.label_dim[0]

Setting up the models and default arguments

Import models

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from opendataval.model import ClassifierSkLearnWrapper, ClassifierUnweightedSkLearnWrapper, RegressionSkLearnWrapper
from opendataval.model.logistic_regression import LogisticRegression as LR
from opendataval.model.mlp import ClassifierMLP

In [None]:
# Registry of candidate prediction models, looked up by string name when a model is selected.
models = {
    # Wrappers for sklearn models, makes the api more cohesive
    'sklogreg': ClassifierSkLearnWrapper(LogisticRegression(), label_dim),
    # Keyed by the string 'logreg' (not the sklearn LogisticRegression class) so that
    # selecting model_name = "logreg" finds this entry.
    'logreg': LR(covar_dim, label_dim).to(device),
    'mlp': ClassifierMLP(covar_dim, label_dim, layers=3, hidden_dim=15).to(device),
    # The number of neighbours is set to label_dim here.
    'skknn': ClassifierUnweightedSkLearnWrapper(KNeighborsClassifier(label_dim), label_dim)
}

Selecting your metrics and model

In [None]:
model_name = "mlp"
metric_name = "accuracy"

# Only the "mlp" and "logreg" choices are given explicit training settings;
# every other model is fit with no extra keyword arguments.
if model_name in ("mlp", "logreg"):
    train_kwargs = {"epochs": 10, "batch_size": 20}
else:
    train_kwargs = {}

pred_model = models[model_name]

Baseline model performance

In [None]:
from opendataval.experiment.api import metrics_dict
# Resolve the scoring function by name.
metric = metrics_dict[metric_name]

# Train on a clone so `pred_model` itself is left as selected
# (presumably clone() returns an independent copy — confirm).
model = pred_model.clone()
# Keep the first two and last two entries of `datapoints`; `*_` absorbs the ones between.
x_train, y_train, *_, x_test, y_test = fetcher.datapoints
model.fit(x_train, y_train, **train_kwargs)

# Score of the fitted model on the test data; shown as the cell's output.
metric(y_test, model.predict(x_test))

Available data evaluators

Import data evaluators

In [None]:
from opendataval.dataval.influence import InfluenceFunctionEval
from opendataval.dataval.dvrl import DVRL
from opendataval.dataval.margcontrib import LeaveOneOut
from opendataval.dataval.oob import DataOob
from opendataval.dataval.knnshap import KNNShapley
from opendataval.dataval.margcontrib import DataShapley
from opendataval.dataval.margcontrib import BetaShapley
from opendataval.dataval.margcontrib.banzhaf import DataBanzhaf, DataBanzhafMargContrib
from opendataval.dataval.ame import BaggingEvaluator, AME

In [None]:
# Small-budget evaluator set, used for quick testing and run throughs.
dummy_eval = [
    InfluenceFunctionEval(10, random_state=random_state),
    DataOob(10, random_state=random_state),
    DVRL(10, rl_epochs=10, random_state=random_state),
    LeaveOneOut(random_state=random_state),
    AME(10, random_state=random_state),
    DataBanzhaf(num_models=10, random_state=random_state),
    # The next three evaluators are all given cache_name="cache_dummy".
    DataBanzhafMargContrib(
        99,
        max_mc_epochs=2,
        models_per_iteration=1,
        cache_name="cache_dummy",
        random_state=random_state,
    ),
    BetaShapley(
        99,
        max_mc_epochs=2,
        models_per_iteration=1,
        cache_name="cache_dummy",
        random_state=random_state,
    ),
    # Library defaults for everything except the cache name and random state.
    DataShapley(
        cache_name="cache_dummy",
        random_state=random_state,
    ),
    # Same settings as the first two above, but with its own cache name.
    DataShapley(
        99,
        max_mc_epochs=2,
        models_per_iteration=1,
        cache_name="cache_preset_other",
        random_state=random_state,
    ),
]

# Evaluator set for an actual run through of the experiments; will take a long time.
data_evaluators = [
    InfluenceFunctionEval(2000, random_state=random_state),
    DataOob(random_state=random_state),
    DVRL(rl_epochs=2000, random_state=random_state),
    LeaveOneOut(random_state=random_state),
    AME(random_state=random_state),
    DataBanzhaf(10000, random_state=random_state),
    # The three evaluators below use identical gr_threshold, mc_epochs and cache_name.
    DataBanzhafMargContrib(
        gr_threshold=1.05,
        mc_epochs=500,
        cache_name="cached",
        random_state=random_state,
    ),
    BetaShapley(
        gr_threshold=1.05,
        mc_epochs=500,
        cache_name="cached",
        random_state=random_state,
    ),
    DataShapley(
        gr_threshold=1.05,
        mc_epochs=500,
        cache_name="cached",
        random_state=random_state,
    ),
]

Setting up the Evaluator Mediator

In [None]:
from opendataval.experiment import ExperimentMediator
# Build the mediator from the fetcher, model, training kwargs and metric name, then
# run the quick `dummy_eval` evaluators; `exper_med` is whatever compute_data_values returns.
exper_med = (
    ExperimentMediator(fetcher, pred_model, train_kwargs, metric_name)
    .compute_data_values(dummy_eval)
)

Plotting and getting results

In [None]:
from opendataval.experiment.exper_methods import (
    discover_corrupted_sample,
    noisy_detection,
    remove_high_low,
    increasing_bin_removal,
    save_dataval
)

# Saving the results
# `{noise_rate=}` is the f-string "=" specifier, so the directory name contains the
# literal text "noise_rate=<value>"; `date` adds a per-run sub-directory.
output_dir = f"../tmp/{dataset_name}_{noise_rate=}/{date}/"
# Register the directory with the mediator (presumably where save_output=True
# calls below write their files — confirm).
exper_med.set_output_directory(output_dir)

#### Discover corrupted sample

In [None]:
# Pass a fresh 15x15 figure to the mediator; the call returns a (df_resp, fig) pair.
df_resp, fig = exper_med.plot(
    discover_corrupted_sample,
    plt.figure(figsize=(15, 15)),
    col=2,
    save_output=True,
)

#### Noisy sample F1 score

In [None]:
# Run the noisy_detection experiment through the mediator; the returned value is
# displayed as the cell output. save_output=True presumably also writes it to the
# configured output directory — confirm.
exper_med.evaluate(noisy_detection, save_output=True)

#### Remove high/low and evaluate

In [None]:
# Pass a fresh 15x15 figure to the mediator; the call returns a (df_resp, fig) pair.
df_resp, fig = exper_med.plot(
    remove_high_low,
    plt.figure(figsize=(15, 15)),
    include_train=True,
    col=2,
    save_output=True,
)

#### Increasing Bin Removal

In [None]:
# Pass a fresh 15x15 figure to the mediator; the call returns a (df_resp, fig) pair.
df_resp, fig = exper_med.plot(
    increasing_bin_removal,
    plt.figure(figsize=(15, 15)),
    include_train=True,
    col=2,
    save_output=True,
)

#### Saves data values

In [None]:
# Run save_dataval through the mediator; the returned value is displayed as the cell
# output. save_output=True presumably also writes it to the configured output
# directory — confirm.
exper_med.evaluate(save_dataval, save_output=True)

In [None]:
import os

from opendataval.util import load_mediator_output

# `output_dir` already ends with "/", so build the path with os.path.join rather
# than inserting a second "/" (which yielded ".../<date>//discover_corrupted_sample.csv").
load_mediator_output(os.path.join(output_dir, "discover_corrupted_sample.csv"))