# Validation Embeddings

## Set up global variables and random_state

In [1]:
import numpy as np
import torch
from matplotlib import pyplot as plt
from datetime import datetime
from opendataval.util import set_random_state

# Prefer Apple's Metal backend (the backend this notebook was written for),
# then CUDA, then the CPU, so that `device` always names a backend that is
# actually available on the machine running the notebook.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed once with 10; the returned object is handed to every data evaluator
# further down so they all share the same random state.
random_state = set_random_state(10)
# Month-day_hour:minute stamp, used later as the last component of the
# output directory for this run.
date = datetime.now().strftime("%m-%d_%H:%M")

Initial random seed is: 10.


## Choose the dataset and load its embeddings
#### Loading them here prevents recomputing the embeddings on subsequent loads

In [2]:
from opendataval.dataloader import mix_labels, DataFetcher
# Dataset names this notebook can be pointed at.
embedding_datasets = [
    "imagenet-val",
    "imagenet-val-embeddings",
    "cifar10-val",
    "cifar10-val-embeddings",
    "cifar10-val-embeddings",  # NOTE(review): duplicate of the entry above - confirm which name was intended
    "cifar100-val-embeddings",
]

# Index 1 selects "imagenet-val-embeddings".
dataset_name = embedding_datasets[1]

# Created in its own cell so the fetch is not repeated when later cells are re-run.
# NOTE(review): the positional `False, 10` presumably mean force_download and
# random_state - confirm against DataFetcher's signature.
fetcher = DataFetcher(dataset_name, "../data_files/", False, 10)

## Finish setting up the fetcher 

In [None]:
# Number of data points requested for each split.
train_count = 20000
valid_count = 2000
test_count = 5000

# Label-noise configuration: `mix_labels` is the noise function and it is
# called with a noise rate of 0.
noise_rate = 0
add_noise = mix_labels
noise_kwargs = dict(noise_rate=noise_rate)

# Split first, then apply the noise function; each call's return value is
# rebound to `fetcher`, exactly as the chained form would do.
fetcher = fetcher.split_dataset_by_count(train_count, valid_count, test_count)
fetcher = fetcher.noisify(add_noise, **noise_kwargs)


## Get ExperimentMediator without specifying DataEvaluators

In [None]:
from opendataval.model import ModelFactory

# Model names accepted by ModelFactory in this notebook; index 0 is used.
classification_models = ["LogisticRegression", 'ClassifierMLP', "skmlp", "sklogreg", "skknn"]
model_name = classification_models[0]
# Build the model on the notebook-wide `device` rather than repeating the
# backend name as a string literal, so the two can never disagree.
model = ModelFactory(model_name, fetcher, device).clone()
train_kwargs = {"epochs": 10, "batch_size": 250, "lr": 0.001}
metric_name = "accuracy"

# Alternative: construct the MLP directly instead of going through the factory.
# model = ClassifierMLP(2048, 1000, 5, 250).to(device)

# `fetcher.datapoints` is ordered (x_train, y_train, x_valid, y_valid, x_test,
# y_test). The validation pair is therefore the 3rd/4th element; taking the
# last two elements would silently score the model on the test split instead.
x_train, y_train, x_valid, y_valid, *_ = fetcher.datapoints
model.fit(x_train, y_train, **train_kwargs)

# Sanity check: fraction of validation points whose predicted class (argmax of
# the model output, moved to the CPU) matches the argmax of the label row.
valid_accuracy = (model.predict(x_valid).cpu().argmax(1) == y_valid.argmax(1)).float().mean()
print(valid_accuracy)

In [None]:
from opendataval.util import ParamSweep
def _acc(a, b):
    return (a.argmax(1) == b.argmax(1)).float().mean()
# Candidate values for each training keyword; passed to `sweep` as keyword
# arguments in this order.
sweep_grid = {
    "epochs": [10, 25],
    "batch_size": [250, 1000],
    "lr": [0.01, 0.001],
}
# Left as a bare expression so the notebook displays whatever `sweep` returns.
ParamSweep(model, _acc, fetcher=fetcher, samples=10).sweep(**sweep_grid)

## Setting up the mediator

In [None]:
from opendataval.experiment import ExperimentMediator
# Bundle the fetcher, the prediction model, its training keyword arguments and
# the metric name into one mediator; every cell below works through this object.
exper_med = ExperimentMediator(fetcher, model, train_kwargs, metric_name)

## Data Evaluators

#### Lots of imports for the many Data Evaluators

In [None]:
from opendataval.dataval.ame import AME
from opendataval.dataval.influence import InfluenceFunctionEval
from opendataval.dataval.oob import DataOob
from opendataval.dataval.dvrl import DVRL
from opendataval.dataval import LeaveOneOut
from opendataval.dataval import BetaShapley, DataShapley
from opendataval.dataval import DataBanzhaf

#### Set up a series of data evaluators

In [None]:
# BetaShapley and DataShapley are configured identically, including the same
# cache name, so their settings are written once and unpacked into both.
shapley_kwargs = {
    "gr_threshold": 1.05,
    "mc_epochs": 500,
    "cache_name": "cached",
    "random_state": random_state,
}

# Every evaluator receives the same `random_state` object.
data_evaluators = [
    AME(num_models=1500, random_state=random_state),
    DataOob(random_state=random_state),  # 1000 samples
    # DVRL(rl_epochs=3000, random_state=random_state, device=device),  # DVRL requires tensor inputs
    LeaveOneOut(random_state=random_state),
    InfluenceFunctionEval(5000, random_state=random_state),
    DataBanzhaf(5000, random_state=random_state),
    BetaShapley(**shapley_kwargs),
    DataShapley(**shapley_kwargs),
]

In [None]:
# Hand the evaluator list to the mediator and keep the object it returns under
# the same name, so the experiment cells below operate on the computed values.
exper_med = exper_med.compute_data_values(data_evaluators=data_evaluators)

## Running experiments on the data values

In [None]:
from opendataval.experiment.exper_methods import (
    discover_corrupted_sample,
    noisy_detection,
    remove_high_low,
    increasing_bin_removal,
    save_dataval
)

# Saving the results
# The directory is ../tmp/<dataset>_noise_rate=<value>/<timestamp>/ : the
# `{noise_rate=}` f-string form expands to the variable name, an equals sign
# and its value (e.g. "noise_rate=0").
output_dir = f"../tmp/{dataset_name}_{noise_rate=}/{date}/"
exper_med.set_output_directory(output_dir)
# Bare expression: echoes the chosen path as the notebook cell's output.
output_dir

#### Discover corrupted sample

In [None]:
# Create a 15x15 figure and let the mediator draw the
# `discover_corrupted_sample` experiment onto it, two columns wide.
fig = plt.figure(figsize=(15, 15))
# `plot` returns a (results, figure) pair; `save_output=True` asks the mediator
# to persist the output (presumably under the output directory set above - confirm).
df_resp, fig = exper_med.plot(discover_corrupted_sample, fig, col=2, save_output=True)

#### Noisy Detection

In [None]:
# Run the `noisy_detection` experiment through the mediator and request that
# the output be saved; as a bare expression, the result is shown by the notebook.
exper_med.evaluate(noisy_detection, save_output=True)

#### Removing high values and low values

In [None]:
# Fresh 15x15 figure for the `remove_high_low` experiment, two columns wide.
fig = plt.figure(figsize=(15, 15))
# `include_train=True` is forwarded through `plot`
# (presumably on to the experiment function - confirm);
# the call returns a (results, figure) pair and requests that output be saved.
df_resp, fig = exper_med.plot(remove_high_low, fig, include_train=True, col=2, save_output=True)

#### Increasing bin removal

In [None]:
# Fresh 15x15 figure for the `increasing_bin_removal` experiment, two columns wide.
fig = plt.figure(figsize=(15, 15))
# Same call shape as the `remove_high_low` plot: `include_train=True` is passed
# along, a (results, figure) pair comes back, and output saving is requested.
df_resp, fig = exper_med.plot(increasing_bin_removal, fig, include_train=True, col=2, save_output=True)

#### Save data values

In [None]:
# Run `save_dataval` through the mediator with output saving requested; as a
# bare expression, whatever `evaluate` returns is displayed by the notebook.
exper_med.evaluate(save_dataval, save_output=True)