# Validation Embeddings

## Set up global variables and random_state

In [None]:
import numpy as np
import torch
from matplotlib import pyplot as plt
from datetime import datetime
from opendataval.util import set_random_state

# Prefer Apple's Metal backend (the notebook's original target), but fall back
# to CUDA and then the CPU so the notebook still runs on machines where MPS
# is not available instead of failing at the first model call.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Fixed seed value (10); set_random_state presumably seeds the libraries used
# below so runs are repeatable — confirm against the opendataval docs.
random_state = set_random_state(10)

# Month-day_hour:minute stamp, captured once when this cell runs.
date = format(datetime.now(), "%m-%d_%H:%M")

## Choose a dataset and load its embeddings
#### Prevents recomputing embeddings on subsequent loads

In [None]:
from opendataval.dataloader import mix_labels, DataFetcher
# Dataset identifiers this notebook can switch between by index.
# NOTE(review): "cifar10-val-embeddings" appears twice (indices 3 and 4);
# one of them was possibly meant to be a cifar100 entry — confirm.
embedding_datasets = [
    "imagenet-val",
    "imagenet-val-embeddings",
    "cifar10-val",
    "cifar10-val-embeddings",
    "cifar10-val-embeddings",
    "cifar100-val-embeddings",
]

# Index 1 selects "imagenet-val-embeddings".
dataset_name = embedding_datasets[1]

# Created in its own cell, ahead of the split cell; per the original note this
# is so the embeddings do not have to be fetched again on later loads.
fetcher = DataFetcher(dataset_name, "../data_files/", False, 10)

## Finish setting up the fetcher 

In [None]:
# Number of data points requested for the train / validation / test splits.
train_count = 25000
valid_count = 2000
test_count = 5000

# Label-noise settings. They are only defined here; this cell does not apply
# them to the fetcher.
add_noise_func = mix_labels
noise_rate = 0
noise_kwargs = {"noise_rate": noise_rate}

# Split the fetcher created in the previous cell using the counts above.
fetcher = fetcher.split_dataset_by_count(train_count, valid_count, test_count)


## Eval function

In [None]:
def acc(a, b) -> float:
    """Return the fraction of rows where ``a`` and ``b`` share the same argmax.

    Both arguments are moved to the CPU and reduced with ``argmax`` along
    dimension 1, so each is expected to be a 2-D tensor with one row per
    sample. The result is a plain Python float.
    """
    predicted = a.cpu().argmax(1)
    expected = b.cpu().argmax(1)
    matches = (predicted == expected).float()
    return matches.mean().item()

## Use a logistic regression model as the downstream model

In [None]:
from opendataval.model import ModelFactory

# Model names that can be selected by index below.
classification_models = [
    "LogisticRegression",
    "ClassifierMLP",
    "skmlp",
    "sklogreg",
    "skknn",
]
model_name = classification_models[0]  # "LogisticRegression"

# pred_model is kept untouched as a template; later cells clone it for each
# training run, and `model` is the first such clone.
pred_model = ModelFactory(model_name, fetcher, device)
model = pred_model.clone()

train_kwargs = dict(epochs=10, batch_size=250, lr=0.001)
metric_name = "accuracy"

# Alternative model kept from the original notebook:
# model = ClassifierMLP(2048, 1000, 5, 250).to("mps")

# Only the first four entries of fetcher.datapoints (train and validation
# covariates/labels) are needed here; the rest are discarded.
(x_train, y_train, x_valid, y_valid, *_) = fetcher.datapoints
model.fit(x_train, y_train, **train_kwargs)

print("Accuracy: = ")
print(acc(model.predict(x_valid), y_valid))

In [None]:
from opendataval.util import ParamSweep

# Sweep the training hyperparameters of pred_model, scoring each setting with
# acc. The candidate values are 3 epoch counts x 3 batch sizes x 2 learning
# rates. NOTE(review): the exact meaning of samples=10 is defined by
# ParamSweep — confirm against the opendataval docs.
output = ParamSweep(
    pred_model,
    acc,
    fetcher=fetcher,
    samples=10,
).sweep(epochs=[5, 10, 20], batch_size=[100, 250, 1000], lr=[0.01, 0.001])

# Bare expression so the notebook displays the sweep result.
output

## Reduce input size

In [None]:
from opendataval.util import MeanStdTime
import time
import tqdm
# Number of independent fit/evaluate repetitions per experiment setting.
REPEAT_COUNT = 10

# Training hyperparameters used by the experiment below. Original note:
# "Fastest 5 iteration" — presumably the quickest 5-epoch setting from the
# sweep; confirm.
best_sweep_kwargs = dict(epochs=5, batch_size=100, lr=0.001)
def change_train_size(train_size: int):
    """Measure validation accuracy when training on ``train_size`` points.

    The module-level ``fetcher`` is re-split with the requested training size
    (validation and test counts stay at ``valid_count`` / ``test_count``).
    A fresh clone of ``pred_model`` is then fitted and scored with ``acc``
    ``REPEAT_COUNT`` times using ``best_sweep_kwargs``.

    Args:
        train_size: Number of training points to request from the fetcher.

    Returns:
        A ``MeanStdTime`` built from the per-run accuracies and the total
        wall-clock time of the repetition loop. A summary line is also
        printed.
    """
    # Per the original author's note, this call always resets the shared
    # fetcher, so it is safe to call repeatedly with different sizes.
    resplit = fetcher.split_dataset_by_count(train_size, valid_count, test_count)
    x_train, y_train, x_valid, y_valid, *_ = resplit.datapoints

    def _fit_and_score() -> float:
        # One repetition: train a fresh clone and score it on validation data.
        candidate = pred_model.clone()
        candidate.fit(x_train, y_train, **best_sweep_kwargs)
        return acc(candidate.predict(x_valid).cpu(), y_valid)

    # Only the repetition loop is timed; the split above is excluded.
    began = time.perf_counter()
    perf_list = [_fit_and_score() for _ in tqdm.trange(REPEAT_COUNT)]
    elapsed = time.perf_counter() - began

    mean_std = MeanStdTime(perf_list, elapsed)
    print(f"{train_size=} | {mean_std}")
    return mean_std


## Change noise rate

In [None]:
# Run the training-size experiment at three sizes, each 10x smaller than the
# previous one. 25000 matches the train_count configured earlier. Each call
# prints its own summary line; the notebook additionally displays the value
# returned by the last call.
change_train_size(25000) # Default
change_train_size(2500) 
change_train_size(250) 

In [None]:
from opendataval.dataloader.noisify import add_gauss_noise
from opendataval.util import MeanStdTime
import time
import tqdm
# Fewer repetitions than the training-size experiment, to save time.
REPEAT_COUNT = 5

# Training hyperparameters used for every run of the noise-rate experiment.
best_sweep_kwargs = dict(epochs=5, batch_size=250, lr=0.001)

def change_noise_rate(noise_rate: float):
    """Measure validation accuracy after noisifying with ``mix_labels``.

    A brand-new ``DataFetcher`` is built for ``dataset_name`` on every call,
    split with the module-level counts, and noisified with ``mix_labels`` at
    the requested rate. A fresh clone of ``pred_model`` is then fitted and
    scored with ``acc`` ``REPEAT_COUNT`` times using ``best_sweep_kwargs``.

    Args:
        noise_rate: Value forwarded to ``noisify`` as ``noise_rate``.

    Returns:
        A ``MeanStdTime`` built from the per-run accuracies and the total
        wall-clock time of the repetition loop. A summary line is also
        printed.
    """
    # A new fetcher per call means each noise rate starts from the original
    # data rather than stacking on a previous call's noise.
    noisy_fetcher = DataFetcher(dataset_name, "../data_files/", False, 10)
    noisy_fetcher = noisy_fetcher.split_dataset_by_count(
        train_count, valid_count, test_count
    )
    noisy_fetcher = noisy_fetcher.noisify(mix_labels, noise_rate=noise_rate)
    x_train, y_train, x_valid, y_valid, *_ = noisy_fetcher.datapoints

    def _fit_and_score() -> float:
        # One repetition: train a fresh clone and score it on validation data.
        candidate = pred_model.clone()
        candidate.fit(x_train, y_train, **best_sweep_kwargs)
        return acc(candidate.predict(x_valid).cpu(), y_valid)

    # Only the repetition loop is timed; data preparation is excluded.
    began = time.perf_counter()
    perf_list = [_fit_and_score() for _ in tqdm.trange(REPEAT_COUNT)]
    elapsed = time.perf_counter() - began

    mean_std = MeanStdTime(perf_list, elapsed)
    print(f"{noise_rate=} | {mean_std}")
    return mean_std


In [None]:
# Run the noise-rate experiment from 0 to 0.4 in steps of 0.1. 0 matches the
# noise_rate configured earlier. Each call prints its own summary line; the
# notebook additionally displays the value returned by the last call.
change_noise_rate(0.) # Default
change_noise_rate(.1)
change_noise_rate(.2) 
change_noise_rate(.3) 
change_noise_rate(.4)