In [1]:
import warnings
# Install a blanket "ignore" filter: no message, category, or module restriction
# is passed, so it applies to every warning raised after this cell runs.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
from jatic_toolbox import (
    load_dataset,
    load_model,
    list_metrics,
    load_metric,
    evaluate,
)

## Load Data

In [3]:
 # Load test split of CIFAR-10 from TorchVision to evaluate pretrained models.

from torchvision.transforms.functional import to_tensor

# CIFAR-10 test split, requested from the TorchVision provider and cached under
# the user's home directory.
data = load_dataset(
    task="image-classification",
    provider="torchvision",
    dataset_name="CIFAR10",
    split="test",
    download=True,
    root="~/.cache/torchvision/datasets",
)


def _image_to_tensor(example):
    """Return the example with its image passed through ``to_tensor``; the label is kept as is."""
    return {"image": to_tensor(example["image"]), "label": example["label"]}


data.set_transform(_image_to_tensor)

Files already downloaded and verified


In [4]:
 # Get mapping from integers to class names
 
# The "label" feature carries the list of class names; per the note above, the
# list is used as the integer -> name lookup.
label_feature = data.features["label"]
int2name = label_feature["names"]
int2name

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

## Load Model

In [5]:
# Load a ViT (vision transformer) image classifier from the HuggingFace provider.

# Checkpoint identifier; the name indicates a ViT fine-tuned on CIFAR-10.
vit_checkpoint = "aaraki/vit-base-patch16-224-in21k-finetuned-cifar10"

model = load_model(
    task="image-classification",
    provider="huggingface",
    model_name=vit_checkpoint,
)

In [9]:
# Verify model interface works with data

# HuggingFace models come with a preprocessor that turns a list of images into
# the batched input the model expects (the printed shape shows the batch layout).
# The result is stored as `batch` rather than `input` so the builtin `input()`
# is not shadowed for the rest of the kernel session.
batch = model.preprocessor([data[0]["image"]])
print(batch["image"].shape)

# Run a single forward pass and show the raw model output (logits).
output = model(batch)
print(output)

torch.Size([1, 3, 224, 224])
ImageClassifierOutput(loss=None, logits=tensor([[-0.2819, -0.5155, -0.4322,  3.3778, -0.4935,  0.0070, -0.2550, -0.5460,
         -0.2329, -0.5659]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## List and Load Metrics

In [10]:
# Show the first 20 metric names available from the TorchMetrics provider

torchmetrics_names = list_metrics(provider="torchmetrics")
torchmetrics_names[:20]

['Accuracy',
 'AUROC',
 'AveragePrecision',
 'BLEUScore',
 'CalibrationError',
 'CatMetric',
 'CharErrorRate',
 'CHRFScore',
 'ConcordanceCorrCoef',
 'CohenKappa',
 'ConfusionMatrix',
 'CosineSimilarity',
 'CramersV',
 'Dice',
 'TweedieDevianceScore',
 'ErrorRelativeGlobalDimensionlessSynthesis',
 'ExactMatch',
 'ExplainedVariance',
 'ExtendedEditDistance',
 'F1Score']

In [11]:
# Show the first 20 metric names available from the TorchEval provider

torcheval_names = list_metrics(provider="torcheval")
torcheval_names[:20]

['AUC',
 'BinaryAccuracy',
 'BinaryAUPRC',
 'BinaryAUROC',
 'BinaryBinnedAUROC',
 'BinaryBinnedPrecisionRecallCurve',
 'BinaryConfusionMatrix',
 'BinaryF1Score',
 'BinaryNormalizedEntropy',
 'BinaryPrecision',
 'BinaryPrecisionRecallCurve',
 'BinaryRecall',
 'BinaryRecallAtFixedPrecision',
 'BLEUScore',
 'Cat',
 'ClickThroughRate',
 'HitRate',
 'Max',
 'Mean',
 'MeanSquaredError']

In [12]:
# Count number of metrics from each provider

# Each count is the length of the full (unsliced) list returned by `list_metrics`
# for that provider.
print(f'Number of metrics in TorchMetrics: {len(list_metrics(provider="torchmetrics"))}')
print(f'Number of metrics in TorchEval: {len(list_metrics(provider="torcheval"))}')

Number of metrics in TorchMetrics: 79
Number of metrics in TorchEval: 50


In [13]:
# Build the metric collection: the same 10-class accuracy from two providers,
# each configured with average="none" to get class-specific results.

metrics = {
    # TorchEval implementation
    "accuracy_te": load_metric(
        provider="torcheval",
        metric_name="MulticlassAccuracy",
        num_classes=10,
        average="none",
    ),
    # TorchMetrics implementation
    "accuracy_tm": load_metric(
        provider="torchmetrics",
        metric_name="Accuracy",
        task="multiclass",
        num_classes=10,
        average="none",
    ),
}

In [14]:
 # Helper function for converting/displaying metrics dictionaries as dataframes 

import pandas as pd

def metrics_to_df(names, output, metric_names=("accuracy_te", "accuracy_tm")):
    """ Converts a per-class metrics result dictionary to a dataframe.

    Parameters
    ----------
    names : iterable of str
        Class names; row ``i`` of the result is labelled with the ``i``-th name.
    output : mapping
        Maps each metric name to an indexable collection of per-class values.
        Every element must provide ``.item()`` (e.g. a tensor or NumPy array).
    metric_names : sequence of str, optional
        Keys of ``output`` to include as columns, in order. Defaults to the two
        accuracy metrics configured in this notebook.

    Returns
    -------
    pandas.DataFrame
        A ``classes`` column followed by one column per entry of ``metric_names``.

    Raises
    ------
    KeyError
        If a requested metric name is missing from ``output``.
    """
    classes = list(names)
    # Take the class count from the names instead of hard-coding 10, so the
    # helper also works for datasets with a different number of classes.
    num_classes = len(classes)

    results = {"classes": classes}
    for key in metric_names:
        per_class = output[key]
        results[key] = [per_class[i].item() for i in range(num_classes)]

    return pd.DataFrame.from_dict(results)

## Evaluate Model

In [15]:
 # Create evaluator for image classification task

# Only the task name is given here; the returned evaluator is called later in
# the notebook with the model, the data subset, and the metric collection.
evaluator = evaluate(task="image-classification")

In [16]:
 # Select subset of test data (for purpose of demo)

import torch

SEED = 1234         # fixed seed so the same subset is drawn on every run
SUBSET_SIZE = 1024  # number of test examples kept for the demo

# Seed the global RNG, shuffle all dataset positions, and keep the leading slice.
torch.manual_seed(SEED)
shuffled_positions = torch.randperm(len(data))
indices = shuffled_positions[:SUBSET_SIZE]
data_subset = torch.utils.data.Subset(data, indices)

len(data_subset)

1024

In [17]:
 # Run evaluation
 
 # Reset metrics
# Call reset() on every configured metric before evaluating, so re-running this
# cell presumably does not carry over state from an earlier run.
for metric_obj in metrics.values():
    metric_obj.reset()

# Evaluate the model on the demo subset in batches of 32.
# NOTE(review): device=0 presumably selects the first GPU - confirm against the
# evaluator's API before running on a CPU-only machine.
output = evaluator(
    model,
    data_subset,
    metric=metrics,
    batch_size=32,
    device=0,
)

  0%|          | 0/32 [00:00<?, ?it/s]

In [18]:
# Print metrics

# Build the per-class table (one row per class name) from the evaluation output.
df = metrics_to_df(int2name, output)

# NOTE: the values printed below are unweighted means of the per-class
# accuracies (the metrics were configured with average="none"), not an accuracy
# computed over all samples at once.
print(f"Model accuracy (TorchEval) = {df.accuracy_te.mean():0.3f}")
print(f"Model accuracy (TorchMetrics) = {df.accuracy_tm.mean():0.3f}")

# Display the per-class table rounded to three decimals.
df.round(3)

Model accuracy (TorchEval) = 0.978
Model accuracy (TorchMetrics) = 0.978


Unnamed: 0,classes,accuracy_te,accuracy_tm
0,airplane,0.973,0.973
1,automobile,1.0,1.0
2,bird,0.991,0.991
3,cat,1.0,1.0
4,deer,0.989,0.989
5,dog,0.917,0.917
6,frog,0.966,0.966
7,horse,0.991,0.991
8,ship,0.991,0.991
9,truck,0.963,0.963
