# Classification Demo 

## Set up global variables and random_state

In [None]:
import numpy as np
import torch
from matplotlib import pyplot as plt
from datetime import datetime
from opendataval.util import set_random_state

# All torch work in this notebook is pinned to the CPU device.
device = torch.device("cpu")
# set_random_state(10) is called once; its return value is reused as the
# `random_state` argument by the cells below.
random_state = set_random_state(10)
# Timestamp embedded in the results directory name. "-" separates hour and
# minute because ":" is not a legal character in Windows file names and would
# make directory creation fail there.
date = datetime.now().strftime("%m-%d_%H-%M")

## Choose Experiment parameters

In [None]:
# Names this demo can choose from; one dataset and one model are picked by index.
classification_datasets = ["iris", "adult", "digits", "gaussian_classifier"]
classification_models = ["logreg", "mlpclass", "skmlp", "sklogreg", "skknn"]

dataset_name = classification_datasets[1]  # "adult"
model_name = classification_models[4]  # "skknn"

# Counts handed to the experiment setup for the train / valid / test splits.
train_count = 100
valid_count = 50
test_count = 50

# Noise configuration, forwarded to the experiment setup as `noise_kwargs`.
noise_rate = 0.1
noise_kwargs = {"noise_rate": noise_rate}

# Models whose name contains "sk" get no training kwargs; every other model
# is given an epoch count and a batch size.
if "sk" in model_name:
    train_kwargs = {}
else:
    train_kwargs = {"epochs": 20, "batch_size": 50}
metric_name = "accuracy"

## Get ExperimentMediator without specifying DataEvaluators

In [None]:
from opendataval.experiment import ExperimentMediator

# Build the mediator from the parameters chosen above. No DataEvaluators are
# supplied at this stage; they are attached in a later cell.
exper_med = ExperimentMediator.model_factory_setup(
    # Prediction model, how it is trained, and how it is scored.
    model_name=model_name,
    train_kwargs=train_kwargs,
    metric_name=metric_name,
    device=device,
    # Dataset, split sizes, and noise settings.
    dataset_name=dataset_name,
    force_download=False,
    train_count=train_count,
    valid_count=valid_count,
    test_count=test_count,
    noise_kwargs=noise_kwargs,
    # Shared seed/state produced by set_random_state.
    random_state=random_state,
)

## Data Evaluators

#### Lots of imports for the many Data Evaluators

In [None]:
from opendataval.dataval.ame import AME
from opendataval.dataval.dvrl import DVRL
from opendataval.dataval.influence import InfluenceFunctionEval
from opendataval.dataval.knnshap import KNNShapley
from opendataval.dataval.oob import DataOob
from opendataval.dataval.margcontrib import LeaveOneOut
from opendataval.dataval.margcontrib import BetaShapley, DataShapley
from opendataval.dataval.margcontrib.banzhaf import DataBanzhaf

#### Set up a series of data evaluators

In [None]:
# Evaluators to compare. Every one receives the shared `random_state`.
data_evaluators = [
    AME(num_models=1500, random_state=random_state),
    # First positional argument is 2000 (presumably the number of models
    # fitted -- confirm against DataOob's signature).
    DataOob(2000, random_state=random_state),
    # DVRL is the only evaluator here that is handed the torch device.
    DVRL(rl_epochs=4000, random_state=random_state, device=device),
    InfluenceFunctionEval(5000, random_state=random_state),
    DataBanzhaf(5000, random_state=random_state),
    # Both Shapley variants use identical settings, including the same
    # cache_name (presumably so cached results can be shared -- confirm).
    BetaShapley(
        gr_threshold=1.05,
        min_samples=500,
        cache_name="cached",
        random_state=random_state,
    ),
    DataShapley(
        gr_threshold=1.05,
        min_samples=500,
        cache_name="cached",
        random_state=random_state,
    ),
]

In [None]:
# Hand the evaluator list to the mediator; the call's return value replaces
# `exper_med` and is what the experiment cells below operate on.
exper_med = exper_med.compute_data_values(
    data_evaluators=data_evaluators,
)

## Running experiments on the data values

In [None]:
from opendataval.experiment.exper_methods import (
    discover_corrupted_sample,
    noisy_detection,
    remove_high_low,
    increasing_bin_removal,
    save_dataval
)

# Saving the results
import os

# Results of this run are written under tmp/<dataset><noise_rate=...>/<timestamp>/.
output_dir = f"tmp/{dataset_name}{noise_rate=}/{date}/"
# exist_ok=True creates the whole directory tree when missing and is a no-op
# when it already exists, which avoids the check-then-create race of
# os.path.exists followed by os.makedirs.
os.makedirs(output_dir, exist_ok=True)

#### Discover corrupted sample

In [None]:
# Draw the discover_corrupted_sample experiment onto a fresh figure
# (figsize=(15, 15)); col=2 is forwarded to the mediator's plot call.
fig = plt.figure(figsize=(15, 15))
df_resp = exper_med.plot(
    discover_corrupted_sample,
    fig,
    col=2,
)
# Element 0 of the returned value is what gets written to CSV.
df_resp[0].to_csv(f"{output_dir}/discover_corrupted_sample.csv")

#### Noisy detection F1 scores

In [None]:
# Run the noisy_detection experiment through the mediator and save the
# result as CSV in the run's output directory.
df_resp = exper_med.evaluate(noisy_detection)
df_resp.to_csv(f"{output_dir}/noisy_detection.csv")
# Bare expression as the last statement of the cell so the notebook displays it.
df_resp

#### Remove High Low

In [None]:
# Draw the remove_high_low experiment onto a fresh figure (figsize=(15, 15));
# include_train=True and col=2 are forwarded to the mediator's plot call.
fig = plt.figure(figsize=(15, 15))
df_resp = exper_med.plot(
    remove_high_low,
    fig,
    include_train=True,
    col=2,
)
# Element 0 of the returned value is what gets written to CSV.
df_resp[0].to_csv(f"{output_dir}/remove_high_low.csv")

#### Increasing Bin removal

In [None]:
# Draw the increasing_bin_removal experiment onto a fresh figure
# (figsize=(15, 15)); include_train=True and col=2 are forwarded to the
# mediator's plot call.
fig = plt.figure(figsize=(15, 15))
df_resp = exper_med.plot(
    increasing_bin_removal,
    fig,
    include_train=True,
    col=2,
)
# Element 0 of the returned value is what gets written to CSV.
df_resp[0].to_csv(f"{output_dir}/increasing_bin_removal.csv")

#### Save data values

In [None]:
# Run save_dataval through the mediator and save the result as CSV in the
# run's output directory.
df_resp = exper_med.evaluate(save_dataval)
df_resp.to_csv(f"{output_dir}/save_dataval.csv")
# Bare expression as the last statement of the cell so the notebook displays it.
df_resp