# NLP Demo 

## Set up global variables and random_state

In [1]:
import numpy as np
import torch
from matplotlib import pyplot as plt
from datetime import datetime
from dataoob.util import set_random_state

# Pick the best available accelerator instead of hard-coding Apple's "mps",
# so the notebook also runs on CUDA or CPU-only machines.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Seed value 10; the returned object is passed to the fetcher and the data
# evaluators in later cells.
random_state = set_random_state(10)
# Run timestamp, later embedded in the output directory path. "-" separates
# hour and minute because ":" is not a legal character in Windows file names.
date = datetime.now().strftime("%m-%d_%H-%M")

Initial random seed is: 10.


## Set up data loader

#### Pick data set

In [2]:
import dataoob.dataloader.datasets.nlpsets  # Must import as NLP data sets aren't imported automatically
# NLP data sets offered by this demo; the first entry is the one used below.
nlp_datasets = [
    "bbc",
    "imdb",
]
dataset_name = next(iter(nlp_datasets))

In [3]:
from dataoob.dataloader import DataFetcher

# Build the fetcher and split it in a single chained expression.
# NOTE(review): the positional `False` is presumably a force-download flag and
# (100, 50, 50) presumably the train/valid/test sizes — confirm against DataFetcher.
# No noise functions for NLP yet, so none are applied here.
fetcher = DataFetcher(dataset_name, False, random_state).split_dataset(100, 50, 50)
num_points = len(fetcher.x_train)

# 1-D labels are treated as a single output dimension; otherwise use the
# shape of one label entry.
if fetcher.y_train.ndim == 1:
    label_dim = (1,)
else:
    label_dim = fetcher.y_train[0].shape

## Set up models

In [4]:
# Import the model used in this NLP demo
# TODO think of more NLP models
from dataoob.model.bert import BertClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Name -> prediction model registry. Only one NLP model so far; it is moved to
# the selected device at construction time.
catalog = dict(
    bert=BertClassifier("distilbert-base-uncased", *label_dim).to(device=device),
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Choose a model from the catalog

In [12]:
# Demo configuration: catalog key, evaluation metric key, and the keyword
# arguments forwarded to `fit`.
model_name, metric_name = "bert", "accuracy"
train_kwargs = dict(epochs=2, batch_size=50)

# Look the chosen model up in the catalog.
pred_model = catalog[model_name]

#### Baseline performance

In [13]:
from dataoob.evaluator.api import metrics_dict
# Fit a clone of the chosen model on the training split.
model = pred_model.clone()
# Only the first two and last two datapoints are needed here; the middle
# entries are discarded.
x_train, y_train, *_, x_test, y_test = fetcher.datapoints
model.fit(x_train, y_train, **train_kwargs)

# Score test predictions (moved to CPU first); the bare last expression is the
# cell's displayed output.
metric = metrics_dict[metric_name]
metric(y_test, model.predict(x_test).cpu())

100%|██████████| 2/2 [00:02<00:00,  1.07s/it]


0.9599999785423279

## Data Evaluators

#### Lots of imports for the many Data Evaluators

In [14]:
from dataoob.dataval.ame import AME
from dataoob.dataval.dvrl import DVRL
from dataoob.dataval.influence import InfluenceFunctionEval
from dataoob.dataval.knnshap import KNNShapley
from dataoob.dataval.oob import DataOob
from dataoob.dataval import LeaveOneOut
from dataoob.dataval import BetaShapley, DataShapley
from dataoob.dataval import DataBanzhaf

#### Set up a series of data evaluators

In [15]:
# Keyword arguments common to the two Shapley evaluators; both use the same
# cache name.
shapley_kwargs = dict(
    gr_threshold=1.05,
    min_samples=500,
    cache_name="cached",
    random_state=random_state,
)

# Every evaluator receives the same `random_state`.
data_evaluators = [
    AME(num_models=1500, random_state=random_state),
    DataOob(random_state=random_state),  # 1000 samples
    # DVRL is left out because it requires tensor inputs:
    # DVRL(rl_epochs=3000, random_state=random_state, device=device),
    LeaveOneOut(random_state=random_state),
    InfluenceFunctionEval(5000, random_state=random_state),
    DataBanzhaf(5000, random_state=random_state),
    BetaShapley(**shapley_kwargs),
    DataShapley(**shapley_kwargs),
]

In [17]:
from dataoob.evaluator import ExperimentMediator
# Gather everything the mediator needs: the data, the evaluators to compare,
# the prediction model with its fit arguments, and the metric name.
mediator_kwargs = dict(
    fetcher=fetcher,
    data_evaluators=data_evaluators,
    pred_model=pred_model,
    train_kwargs=train_kwargs,
    metric_name=metric_name,
)
exper_med = ExperimentMediator(**mediator_kwargs)

100%|██████████| 2/2 [00:01<00:00,  1.15it/s]
100%|██████████| 2/2 [00:00<00:00,  4.87it/s]it]
100%|██████████| 2/2 [00:00<00:00,  5.13it/s]it]
100%|██████████| 2/2 [00:00<00:00,  5.02it/s]/s]
100%|██████████| 2/2 [00:00<00:00,  4.86it/s]/s]
100%|██████████| 2/2 [00:00<00:00,  4.98it/s]/s]
100%|██████████| 2/2 [00:00<00:00,  4.92it/s]/s]
100%|██████████| 2/2 [00:00<00:00, 12.06it/s]/s]
100%|██████████| 2/2 [00:00<00:00, 11.76it/s]/s]
100%|██████████| 2/2 [00:00<00:00, 12.14it/s]/s]
100%|██████████| 2/2 [00:00<00:00, 10.14it/s]t/s]
100%|██████████| 2/2 [00:00<00:00,  9.89it/s]t/s]
100%|██████████| 2/2 [00:00<00:00, 11.92it/s]t/s]
100%|██████████| 2/2 [00:00<00:00,  4.93it/s]t/s]
100%|██████████| 2/2 [00:00<00:00, 11.42it/s]t/s]
100%|██████████| 2/2 [00:00<00:00,  3.17it/s]t/s]
100%|██████████| 2/2 [00:00<00:00, 12.27it/s]t/s]
100%|██████████| 2/2 [00:00<00:00,  4.24it/s]t/s]
100%|██████████| 2/2 [00:00<00:00, 10.86it/s]t/s]
100%|██████████| 2/2 [00:00<00:00,  4.82it/s]t/s]
100%|████████

KeyboardInterrupt: 

## Running experiments on the data values

In [None]:
from dataoob.evaluator.exper_methods import remove_high_low, increasing_bin_removal

# Saving the results
import os

# One directory per data set and run timestamp. Built without a trailing
# separator so the later f"{output_dir}/<name>.csv" paths contain no "//".
output_dir = os.path.join("tmp", dataset_name, date)
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

### Removing high values and low values

In [None]:
# Create a 15x15 figure, then run the `remove_high_low` experiment through the
# mediator's plotting helper.
# NOTE(review): `fig` is rebound by the next line and the figure created here
# is never passed to `exper_med.plot` explicitly — presumably `plot` draws on
# the current matplotlib figure; confirm.
fig = plt.figure(figsize=(15, 15))
df_resp, fig = exper_med.plot(remove_high_low, include_train=True, col=2)
# Write the first value returned by `plot` to CSV inside this run's output directory.
df_resp.to_csv(f"{output_dir}/remove_high_low.csv")

### Increasing bin removal

In [None]:
# Create a 15x15 figure, then run the `increasing_bin_removal` experiment
# through the mediator's plotting helper.
# NOTE(review): `fig` is rebound by the next line and the figure created here
# is never passed to `exper_med.plot` explicitly — presumably `plot` draws on
# the current matplotlib figure; confirm.
fig = plt.figure(figsize=(15, 15))
df_resp, fig = exper_med.plot(increasing_bin_removal, include_train=True, col=2)
# Write the first value returned by `plot` to CSV inside this run's output directory.
df_resp.to_csv(f"{output_dir}/increasing_bin_removal.csv")