# Classification Demo 

## Set up global variables and random_state

In [None]:
import numpy as np
import torch
from matplotlib import pyplot as plt
from datetime import datetime
from dataoob.util import set_random_state

# All torch models and the RL-based evaluator in this notebook are placed on this device.
device = torch.device("cpu")
# The value returned here is passed as `random_state` to the fetcher and to every
# data evaluator below (presumably seeding them for reproducibility — see dataoob.util).
random_state = set_random_state(10)
# Timestamp that becomes part of the results directory name. Hour and minute are
# separated by "-" because ":" is not a legal character in Windows file paths.
date = datetime.now().strftime("%m-%d_%H-%M")

## Set up data loader

#### Pick noise rate and dataset

In [None]:
# Value forwarded as `noise_rate` to the label-noising step of the data fetcher.
noise_rate = 0.1

# Names of the classification datasets this demo can be run against.
classification_datasets = [
    "iris",
    "adult",
    "digits",
    "gaussian_classifier",
]

# Change the index to run the demo on a different dataset.
dataset_name = classification_datasets[0]

In [None]:
from dataoob.dataloader import DataFetcher, mix_labels

# Build the fetcher step by step; each line rebinds `fetcher` to whatever the
# previous call returned, exactly as the chained form would.
fetcher = DataFetcher(dataset_name, False, random_state)
# NOTE(review): presumably the train / valid / test split sizes — confirm against DataFetcher.
fetcher = fetcher.split_dataset(100, 50, 50)
fetcher = fetcher.noisify(mix_labels, noise_rate=noise_rate)

num_points = len(fetcher.x_train)

# Per-sample shape of the covariates; a 1-D array is treated as one value per sample.
if fetcher.x_train.ndim == 1:
    covar_dim = (1,)
else:
    covar_dim = fetcher.x_train[0].shape

# Per-sample shape of the labels, using the same rule.
if fetcher.y_train.ndim == 1:
    label_dim = (1,)
else:
    label_dim = fetcher.y_train[0].shape

## Set up models

In [None]:
# Import examples of appropriate Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from dataoob.model import ClassifierSkLearnWrapper, ClassifierUnweightedSkLearnWrapper
from dataoob.model.logistic_regression import LogisticRegression as LR, BinaryLogisticRegression as BLR
from dataoob.model.ann import ClassifierMLP, BinaryMLP

In [None]:
# Every entry is constructed eagerly, so all six models are instantiated here
# even though only one of them is selected in the next cell.
catalog = {
    # Wrappers for sklearn models; they give the estimators the same API as the other entries.
    "sklogreg": ClassifierSkLearnWrapper(
        LogisticRegression(),
        *label_dim,
        device=device,
    ),
    # NOTE(review): label_dim is unpacked as the first positional argument of
    # KNeighborsClassifier, which scikit-learn names n_neighbors — confirm this is intended.
    "skknn": ClassifierUnweightedSkLearnWrapper(
        KNeighborsClassifier(*label_dim),
        *label_dim,
        device=device,
    ),

    # dataoob torch models sized from the covariate and label dimensions.
    "logreg": LR(*covar_dim, *label_dim).to(device),
    "ann": ClassifierMLP(
        *covar_dim,
        *label_dim,
        layers=5,
        hidden_dim=25,
    ).to(device),

    # Binary counterparts; these are constructed from the covariate dimension only.
    "binlogreg": BLR(*covar_dim).to(device),
    "binann": BinaryMLP(
        *covar_dim,
        layers=5,
        hidden_dim=25,
    ).to(device),
}

#### Choose a model from the catalog

In [None]:
# Key into `catalog` selecting the prediction model used for the rest of the notebook.
model_name = "ann"
# Key into `metrics_dict` selecting how predictions are scored.
metric_name = "accuracy"
# Only "ann" and "logreg" receive explicit training hyperparameters; any other
# catalog entry is fitted with no extra keyword arguments.
# NOTE(review): "binlogreg" and "binann" are not in this tuple — confirm whether
# they should be trained with the same epochs / batch_size.
train_kwargs = {"epochs": 20, "batch_size": 10} if model_name in ("ann", "logreg") else {}

pred_model = catalog[model_name]

#### Baseline performance

In [None]:
from dataoob.evaluator.api import metrics_dict
# Scoring function selected by `metric_name`.
metric = metrics_dict[metric_name]

# Fit a clone so the catalog's `pred_model` itself is left untouched.
model = pred_model.clone()
# Only the first four arrays are needed here; anything after them is discarded.
x_train, y_train, x_valid, y_valid, *_ = fetcher.datapoints
model.fit(x_train, y_train, **train_kwargs)

# Bare expression on the last line so the notebook displays the validation score.
metric(y_valid, model.predict(x_valid))

## Data Evaluators

#### Lots of imports for the many Data Evaluators

In [None]:
from dataoob.dataval.ame import AME
from dataoob.dataval.dvrl import DVRL
from dataoob.dataval.influence import InfluenceFunctionEval
from dataoob.dataval.knnshap import KNNShapley
from dataoob.dataval.oob import DataOob
from dataoob.dataval.margcontrib import LeaveOneOut
from dataoob.dataval.margcontrib import BetaShapley, DataShapley
from dataoob.dataval.margcontrib.banzhaf import DataBanzhaf

#### Set up a series of data evaluators

In [None]:
# Evaluators compared in the experiments below. All of them receive the same
# `random_state`; the order of this list is preserved.
data_evaluators = [
    AME(
        num_models=1500,
        random_state=random_state,
    ),
    DataOob(
        random_state=random_state,
    ),  # 1000 samples
    DVRL(
        rl_epochs=3000,
        random_state=random_state,
        device=device,
    ),  # RL requires torch device
    # NOTE(review): the meaning of the positional 5000 is not visible here — confirm
    # against the InfluenceFunctionEval / DataBanzhaf signatures.
    InfluenceFunctionEval(
        5000,
        random_state=random_state,
    ),
    DataBanzhaf(
        5000,
        random_state=random_state,
    ),
    # Both Shapley variants are given the same cache_name.
    BetaShapley(
        gr_threshold=1.05,
        min_samples=500,
        cache_name="cached",
        random_state=random_state,
    ),
    DataShapley(
        gr_threshold=1.05,
        min_samples=500,
        cache_name="cached",
        random_state=random_state,
    ),
]

In [None]:
from dataoob.evaluator import ExperimentMediator
# Bundle the data, the model configuration and the evaluators into one object;
# every experiment cell below goes through `exper_med`.
exper_med = ExperimentMediator(
    fetcher=fetcher,
    pred_model=pred_model,
    train_kwargs=train_kwargs,
    metric_name=metric_name,
    data_evaluators=data_evaluators,
)

## Running experiments on the data values

In [None]:
from dataoob.evaluator.exper_methods import (
    discover_corrupted_sample, 
    noisy_detection, 
    remove_high_low, 
    point_removal
)

# Saving the results
import os

# One directory per dataset / noise rate / run timestamp. `{noise_rate=}` expands
# to "noise_rate=<value>", so the folder name records the setting used.
output_dir = f"tmp/{dataset_name}{noise_rate=}/{date}/"
# exist_ok=True makes re-running this cell safe and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

#### Discover corrupted sample

In [None]:
fig = plt.figure(figsize=(15, 15))
# plot() returns a pair whose first element is the results table; unpack it so
# `df_resp` holds the table itself rather than the whole tuple.
df_resp, fig = exper_med.plot(discover_corrupted_sample, fig, col=2)
df_resp.to_csv(f"{output_dir}/discover_corrupted_sample.csv")

#### Noisy detection F1 scores

In [None]:
# Destination for the noisy-detection results (F1 scores, per the section heading).
csv_path = f"{output_dir}/noisy_detection.csv"

df_resp = exper_med.evaluate(noisy_detection)
df_resp.to_csv(csv_path)

# Bare expression on the last line so the notebook renders the table.
df_resp

#### Removing high values and low values

In [None]:
fig = plt.figure(figsize=(15, 15))
# Pass the figure to plot() (second positional argument, as in the
# discover_corrupted_sample cell); otherwise the 15x15 canvas created above is never used.
df_resp, fig = exper_med.plot(remove_high_low, fig, include_train=True, col=2)
df_resp.to_csv(f"{output_dir}/remove_high_low.csv")

#### Remove descending values

In [None]:
fig = plt.figure(figsize=(15, 15))
# Pass the figure to plot() (second positional argument, as in the
# discover_corrupted_sample cell); otherwise the 15x15 canvas created above is never used.
# NOTE(review): percentile=.05 is set here but not in the ascending run — confirm
# whether both removals are meant to use the same step size.
df_resp, fig = exper_med.plot(
    point_removal,
    fig,
    include_train=True,
    col=2,
    percentile=.05,
    order="descending",
)
df_resp.to_csv(f"{output_dir}/descending_remove.csv")

#### Remove ascending values

In [None]:
fig = plt.figure(figsize=(15, 15))
# Pass the figure to plot() (second positional argument, as in the
# discover_corrupted_sample cell); otherwise the 15x15 canvas created above is never used.
df_resp, fig = exper_med.plot(point_removal, fig, include_train=True, col=2, order="ascending")
df_resp.to_csv(f"{output_dir}/ascending_remove.csv")