Demo of current progress with Dataoob

In [None]:
# Imports
import numpy as np
import torch
from matplotlib import pyplot as plt

Global state

In [None]:
from datetime import datetime
from dataoob.util import set_random_state
# Torch device handed to every model / wrapper built in the `models` cell.
device = torch.device("cpu")
# Fixed seed (10); the returned object is passed as `random_state` to the fetcher
# and to every data evaluator below.
# NOTE(review): presumably this also seeds the global numpy/torch RNGs — confirm in dataoob.util.
random_state = set_random_state(10)
# "month-day_hour:minute" timestamp of this run; used later to name the output directory.
date = datetime.now().strftime("%m-%d_%H:%M")

Loading Data

In [None]:
from dataoob.dataloader.fetcher import DataFetcher, mix_labels
from dataoob.evaluator import ExperimentMediator, DataFetcherArgs

# Experiment configuration: the dataset to fetch and the noise rate handed to `noisify`.
dataset_name = "iris"
noise_rate = 0.1

# Build the fetcher one step at a time; each call's return value feeds the next step.
fetcher = DataFetcher(dataset_name, False, random_state)
fetcher = fetcher.split_dataset(80, 30)
fetcher = fetcher.noisify(mix_labels, noise_rate=noise_rate)

# Sizes read off the training split; covar_dim and label_dim are used to build the models.
num_points = len(fetcher.x_train)
covar_dim = len(fetcher.x_train[0])
label_dim = fetcher.y_train.shape[1]

Setting up the models and default arguments

Import models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from dataoob.model import ClassifierSkLearnWrapper, ClassifierUnweightedSkLearnWrapper
from dataoob.model.logistic_regression import LogisticRegression as LR
from dataoob.model.mlp import MLP

In [None]:
# Candidate prediction models, keyed by a short name (inserted in the same order as before).
models = {}
# Wrappers for sklearn models, makes the api more cohesive
models['sklogreg'] = ClassifierSkLearnWrapper(LogisticRegression(), label_dim, device=device)
models['logreg'] = LR(covar_dim, label_dim).to(device)
models['ann'] = MLP(covar_dim, label_dim, layers=3, hidden_dim=15).to(device)
# KNeighborsClassifier's first positional argument is n_neighbors, so label_dim neighbours are used.
models['skknn'] = ClassifierUnweightedSkLearnWrapper(KNeighborsClassifier(label_dim), label_dim, device=device)

Selecting your metrics and model

In [None]:
from dataoob.evaluator import DataEvaluatorArgs
model_name = "ann"
metric_name = "accuracy"

# Only the "ann" and "logreg" entries get epoch / batch-size training arguments;
# every other model is trained with no extra keyword arguments.
if model_name in ("ann", "logreg"):
    train_kwargs = {"epochs": 10, "batch_size": 20}
else:
    train_kwargs = {}

# Bundle the chosen model, metric name and training arguments together.
de_args = DataEvaluatorArgs(
    pred_model=models[model_name],
    metric_name=metric_name,
    train_kwargs=train_kwargs,
)

Baseline model performance

In [None]:
from dataoob.evaluator.api import metrics_dict
# Fit a clone of the selected model on the training split.
model = models[model_name].clone()
x_train, y_train, x_valid, y_valid, *_ = fetcher.datapoints
model.fit(x_train, y_train, **train_kwargs)
metric = metrics_dict[metric_name]

# Score the validation predictions; the bare last expression is shown as the cell output.
y_valid_pred = model.predict(x_valid)
metric(y_valid, y_valid_pred)

Available data evaluators

Import data evaluators

In [None]:
from dataoob.dataval.influence import InfluenceFunctionEval
from dataoob.dataval.dvrl import DVRL
from dataoob.dataval.margcontrib import LeaveOneOut
from dataoob.dataval.oob import DataOob
from dataoob.dataval.knnshap import KNNShapley
from dataoob.dataval.margcontrib import DataShapley
from dataoob.dataval.margcontrib import BetaShapley
from dataoob.dataval.margcontrib.banzhaf import DataBanzhaf, DataBanzhafMargContrib
from dataoob.dataval.ame import BaggingEvaluator, AME

In [None]:
# Light-weight counterpart of `data_evaluators`: the same evaluator families configured with
# small numeric settings (10 / 100) so a full pass finishes quickly.
# NOTE(review): this list is not passed to ExperimentMediator in the visible cells — swap it in
# manually for a smoke test.
dummy_eval = [  # Used for quick testing and run throughs
    InfluenceFunctionEval(10, random_state=random_state),
    DataOob(10, random_state=random_state),
    DVRL(10, rl_epochs=10, random_state=random_state),
    LeaveOneOut(random_state=random_state),
    AME(10, random_state=random_state),
    DataBanzhaf(samples=10, random_state=random_state),
    # The next two entries use the same model_name ("t"); the last one uses "r".
    # NOTE(review): presumably model_name keys a shared cache of marginal contributions — confirm.
    BetaShapley(100, min_samples=99, model_name="t", random_state=random_state),
    # NOTE(review): no explicit sample limits here, unlike its neighbours — confirm this is intended
    # for a "quick" run.
    DataShapley(model_name="t", random_state=random_state),
    DataShapley(100, min_samples=99, model_name="r", random_state=random_state),
]

# Full-size evaluator list; this is the one handed to ExperimentMediator below.
# Every evaluator receives the same `random_state` object created in the global-state cell.
data_evaluators = [  # actual run through of experiments, will take a long time
    InfluenceFunctionEval(2000, random_state=random_state),
    DataOob(random_state=random_state),
    DVRL(rl_epochs=2000, random_state=random_state),
    LeaveOneOut(random_state=random_state),
    AME(random_state=random_state),
    DataBanzhaf(10000, random_state=random_state),
    # The three evaluators below share gr_threshold, min_samples and model_name="t".
    # NOTE(review): presumably the shared model_name lets them reuse one set of marginal
    # contributions — confirm against dataoob.dataval.margcontrib.
    DataBanzhafMargContrib(gr_threshold=1.05, min_samples=500, model_name="t", random_state=random_state),
    BetaShapley(gr_threshold=1.05, min_samples=500, model_name="t", random_state=random_state),
    DataShapley(gr_threshold=1.05, min_samples=500, model_name="t", random_state=random_state),
]

Setting up the Evaluator Mediator

In [None]:
# Hand the fetcher, the full evaluator list and the model / training / metric settings
# to a single mediator that drives the experiments below.
exper_med = ExperimentMediator(
    fetcher,
    data_evaluators,
    de_args.pred_model,
    de_args.train_kwargs,
    de_args.metric_name,
)

Plotting and getting results

In [None]:
# Imports
from dataoob.evaluator.exper_methods import (
    discover_corrupted_sample, noisy_detection, remove_high_low
)
import os
# `date` is formatted with "%H:%M"; ':' is not a legal character in Windows file names,
# so replace it before the timestamp becomes a directory name.
safe_date = date.replace(":", "-")
# Results are grouped by dataset name and noise rate, then by run timestamp.
output_dir = f"tmp/{dataset_name}{noise_rate=}/{safe_date}/"
# exist_ok=True creates the directory only when it is missing, with no separate
# existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Canvas the experiment plots are drawn on.
fig = plt.figure(figsize=(15, 15))
# `plot` returns the results table together with the figure; unpack it (as the
# remove_high_low cell does) so `df_resp` is a DataFrame rather than a tuple.
df_resp, fig = exper_med.plot(discover_corrupted_sample, fig, col=2)
# Persist the table next to the other outputs of this run.
df_resp.to_csv(f"{output_dir}/discover_corrupted_sample.csv")

In [None]:
# Run the noisy_detection experiment, save its table, then show it as the cell output.
df_resp = exper_med.evaluate(noisy_detection)
noisy_detection_csv = f"{output_dir}/noisy_detection.csv"
df_resp.to_csv(noisy_detection_csv)
df_resp

In [None]:
# Canvas the experiment plots are drawn on.
fig = plt.figure(figsize=(15, 15))
# Pass the figure in (as the discover_corrupted_sample cell does) so the plots are drawn on
# this 15x15 canvas instead of leaving it unused.
df_resp, fig = exper_med.plot(remove_high_low, fig, include_train=True, col=2)
# Persist the table next to the other outputs of this run.
df_resp.to_csv(f"{output_dir}/remove_high_low.csv")

In [None]:
import pandas as pd
def read_saved_csv(file_path: str):
    """Load a saved results CSV back into a DataFrame.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first two columns used as a MultiIndex.
    """
    multi_index_cols = [0, 1]
    saved_df = pd.read_csv(file_path, index_col=multi_index_cols)
    return saved_df
# Reload the table saved by the discover_corrupted_sample cell and display it as the cell output.
saved_csv_path = f"{output_dir}/discover_corrupted_sample.csv"
read_saved_csv(saved_csv_path)