# Evaluate

Here is a good description of the module: https://huggingface.co/docs/evaluate/a_quick_tour

In [1]:
import evaluate

2024-05-21 10:37:34.235428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 10:37:34.235511: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 10:37:34.324996: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-21 10:37:34.478314: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# List the available evaluation modules, leaving out community-contributed ones.
# NOTE(review): the recorded output is an empty list; confirm that the Hub was
# reachable when this cell was executed.
official_modules = evaluate.list_evaluation_modules(include_community=False)
official_modules

[]

## Load

In [3]:
# Load the "accuracy" evaluation module; the following cells inspect and use it.
acc = evaluate.load(path="accuracy")

In [4]:
# The whole documentation of the module can be shown in one go with:
# print(acc.__doc__)
# The same information is also available piece by piece through its members.

# Print the general description first, then the usage (inputs) description.
for doc_text in (acc.description, acc.inputs_description):
    print(doc_text)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative


Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2,

## Usage

In [5]:
# --- Feeding the metric one (reference, prediction) pair at a time ---

acc = evaluate.load("accuracy")
ref = [0, 1, 2, 0, 1, 2]
pre = [0, 1, 1, 2, 1, 0]

for single_ref, single_pred in zip(ref, pre):
    print(f"{single_ref} {single_pred}")
    acc.add(prediction=single_pred, reference=single_ref)

# compute() without arguments scores everything that was added above.
pairwise_result = acc.compute()
print(pairwise_result)


# --- Feeding the metric whole batches at a time ---

acc = evaluate.load("accuracy")
refs = [[0, 1, 2, 0, 1, 2], [0, 1, 2, 0, 1, 2]]
pres = [[0, 1, 1, 2, 1, 0], [0, 1, 1, 2, 1, 0]]

for ref_batch, pred_batch in zip(refs, pres):
    print(f"{ref_batch} {pred_batch}")
    acc.add_batch(predictions=pred_batch, references=ref_batch)

batched_result = acc.compute()
print(batched_result)

# Note: "add" accepts either "reference" or "references" as the keyword name,
# whereas "add_batch" only accepts "references".

0 0
1 1
2 1
0 2
1 1
2 0
{'accuracy': 0.5}
[0, 1, 2, 0, 1, 2] [0, 1, 1, 2, 1, 0]
[0, 1, 2, 0, 1, 2] [0, 1, 1, 2, 1, 0]
{'accuracy': 0.5}


In [6]:
# Bundle several criteria so that one compute() call reports all of them.
metric_names = ["accuracy", "recall", "f1", "precision"]
metrics = evaluate.combine(metric_names)

In [7]:
# Score one set of labels with every metric bundled in `metrics`.
ref = [0, 1, 1, 0, 1, 1]
pre = [0, 1, 1, 1, 1, 0]

combined_scores = metrics.compute(predictions=pre, references=ref)
combined_scores

{'accuracy': 0.6666666666666666, 'recall': 0.75, 'f1': 0.75, 'precision': 0.75}

In [None]:
# evaluate also provides a way to visualize results for comparison

# from evaluate.visualization import radar_plot
# data = [{"accuracy": 0.8, "precision": 0.7, "f1": 0.6, "latency_in_seconds": 10}, ...]
# models = ["model1", ...]
# plot = radar_plot(data=data, model_names=models)

## Application to the translation model

A detailed description of the BLEU score: https://huggingface.co/spaces/evaluate-metric/bleu

In [8]:
"""
 simple usage
"""

# Load the BLEU metric. The handle is named `bleu` after the metric itself;
# `blue` (the earlier spelling) is kept as an alias so that any cell still
# referring to it keeps working.
bleu = evaluate.load("bleu")
blue = bleu

# Inputs: one reference sentence per prediction, aligned by position.
ref = ["hello there general kenobi", "foo bar foobar"]
pre = ["hello there general kenobi", "hello there !"]

# Score both sentence pairs in a single call.
res = bleu.compute(references=ref, predictions=pre)

# The result is a dict holding the BLEU score together with the n-gram
# precisions, the brevity penalty and the length statistics.
print(res)


{'bleu': 0.691441569283882, 'precisions': [0.5714285714285714, 0.6, 0.6666666666666666, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 7, 'reference_length': 7}
