# NLP Demo 

## Set up global variables and random_state

In [None]:
import numpy as np
import torch
from matplotlib import pyplot as plt
from datetime import datetime
from opendataval.util import set_random_state

# Pick the best available accelerator instead of hard-coding Apple's "mps"
# backend, which is unusable on machines without Metal support.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed the run; the returned value is handed to the mediator and to every
# data evaluator as `random_state`.
random_state = set_random_state(10)

# Timestamp used in the per-run output directory name.
# NOTE(review): the ":" in this format is not a valid path character on Windows.
date = datetime.now().strftime("%m-%d_%H:%M")

## Choose Experiment parameters

In [None]:
from opendataval.dataloader import mix_labels
# Label noise: `add_noise` and `noise_kwargs` are passed to the mediator setup.
noise_rate = 0.1
add_noise = mix_labels
noise_kwargs = {"noise_rate": noise_rate}

# Dataset selection and split sizes.
nlp_datasets = ["bbc", "imdb"]
dataset_name = nlp_datasets[0]  # "bbc"; switch to index 1 for "imdb"
train_count, valid_count, test_count = 100, 50, 50

# Model, training and evaluation settings.
nlp_models = ["BertClassifier"]
model_name = nlp_models[0]
train_kwargs = {"epochs": 2, "batch_size": 50}
metric_name = "accuracy"

## Get ExperimentMediator without specifying DataEvaluators

In [None]:
from opendataval.experiment import ExperimentMediator

# Build the mediator from the parameters chosen earlier in the notebook.
# No DataEvaluators are attached at this point.
exper_med = ExperimentMediator.model_factory_setup(
    # Data: dataset, cache location and split sizes.
    dataset_name=dataset_name,
    cache_dir="../data_files/",  # one level up: this notebook sits inside the demo directory
    force_download=False,
    train_count=train_count,
    valid_count=valid_count,
    test_count=test_count,
    # Label noise.
    add_noise=add_noise,
    noise_kwargs=noise_kwargs,
    # Model, training and evaluation.
    model_name=model_name,
    train_kwargs=train_kwargs,
    device=device,
    metric_name=metric_name,
    # Reproducibility.
    random_state=random_state,
)

## Data Evaluators

#### Lots of imports for the many Data Evaluators

In [None]:
from opendataval.dataval.ame import AME
from opendataval.dataval.influence import InfluenceFunctionEval
from opendataval.dataval.oob import DataOob
from opendataval.dataval import LeaveOneOut
from opendataval.dataval import BetaShapley, DataShapley
from opendataval.dataval import DataBanzhaf

#### Set up a series of data evaluators

In [None]:
# Evaluators to compare; every one receives the same `random_state`.
# Not included: DVRL(rl_epochs=3000, random_state=random_state, device=device),
# because DVRL requires tensor inputs.
data_evaluators = [
    AME(num_models=1500, random_state=random_state),
    DataOob(random_state=random_state),  # 1000 samples
    LeaveOneOut(random_state=random_state),
    InfluenceFunctionEval(5000, random_state=random_state),
    DataBanzhaf(5000, random_state=random_state),
    # NOTE(review): both Shapley variants use cache_name="cached", presumably
    # so they can share cached computations -- confirm against the library.
    BetaShapley(
        gr_threshold=1.05,
        mc_epochs=500,
        cache_name="cached",
        random_state=random_state,
    ),
    DataShapley(
        gr_threshold=1.05,
        mc_epochs=500,
        cache_name="cached",
        random_state=random_state,
    ),
]

In [None]:
# Hand the evaluator list to the mediator; the call's return value is rebound
# to `exper_med`, which every experiment cell below uses.
# NOTE(review): presumably this is where each evaluator is trained, so it is
# likely the slowest cell in the notebook -- confirm.
exper_med = exper_med.compute_data_values(data_evaluators=data_evaluators)

## Running experiments on the data values

In [None]:
from opendataval.experiment.exper_methods import (
    discover_corrupted_sample,
    noisy_detection,
    remove_high_low,
    increasing_bin_removal,
    save_dataval
)

# Saving the results
# The path has the form ../tmp/<dataset>_noise_rate=<rate>/<timestamp>/ ;
# the `{noise_rate=}` specifier renders the variable name and its value.
output_dir = f"../tmp/{dataset_name}_{noise_rate=}/{date}/"
exper_med.set_output_directory(output_dir)
output_dir  # bare expression: shown as the cell output in a notebook

#### Discover corrupted sample

In [None]:
# Create a 15x15-inch figure and let the mediator draw the
# `discover_corrupted_sample` experiment onto it; `plot` returns a result
# object together with the figure.
# NOTE(review): `col=2` presumably sets the number of subplot columns and
# `df_resp` is presumably a DataFrame -- confirm against the library.
fig = plt.figure(figsize=(15, 15))
df_resp, fig = exper_med.plot(discover_corrupted_sample, fig, col=2, save_output=True)

#### Noisy Detection

In [None]:
# Run the `noisy_detection` experiment through the mediator. As the last
# expression of the cell, its return value is displayed by the notebook.
# NOTE(review): `save_output=True` presumably writes the result to the
# configured output directory -- confirm.
exper_med.evaluate(noisy_detection, save_output=True)

#### Removing high values and low values

In [None]:
# Create a fresh 15x15-inch figure and plot the `remove_high_low` experiment
# onto it; `plot` returns a result object together with the figure.
# NOTE(review): `include_train=True` is forwarded through `plot`, presumably
# to the experiment function, and `col=2` presumably sets the number of
# subplot columns -- confirm.
fig = plt.figure(figsize=(15, 15))
df_resp, fig = exper_med.plot(remove_high_low, fig, include_train=True, col=2, save_output=True)

#### Increasing bin removal

In [None]:
# Create a fresh 15x15-inch figure and plot the `increasing_bin_removal`
# experiment onto it; `plot` returns a result object together with the figure.
# NOTE(review): `include_train=True` is forwarded through `plot`, presumably
# to the experiment function, and `col=2` presumably sets the number of
# subplot columns -- confirm.
fig = plt.figure(figsize=(15, 15))
df_resp, fig = exper_med.plot(increasing_bin_removal, fig, include_train=True, col=2, save_output=True)

#### Save data values

In [None]:
# Run `save_dataval` through the mediator with output saving enabled. As the
# last expression of the cell, its return value is displayed by the notebook.
# NOTE(review): presumably this persists the computed data values to the
# configured output directory -- confirm.
exper_med.evaluate(save_dataval, save_output=True)