### Compute accuracy on a given dataset

* This notebook shows how to compute the accuracy of a given model (for example, one from the Hugging Face Hub) on a given dataset, provided as a CSV file.

In [1]:
import numpy as np
from datasets import load_dataset, concatenate_datasets, ClassLabel, Features, Value
from transformers import pipeline

[2023-11-25 13:17:15,795] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# model to evaluate (identifier handed to the transformers pipeline)
MODEL_NAME = "luigisaetta/sentiment_ita"

# CSV file holding the labelled examples used for the evaluation
FILE_NAME = "second_dataset.csv"

In [3]:
# all tools here
# class name -> integer id; the position in the tuple is the id
label_to_int = {
    name: class_id
    for class_id, name in enumerate(("negative", "neutral", "positive"))
}

# for my custom dataset
def remove_non_breaking_space(example):
    """Replace every non-breaking space in ``example['text']`` with a regular space.

    The record is updated in place and the same object is returned, so the
    function can be passed directly to ``Dataset.map``.
    """
    nbsp, plain_space = '\xa0', ' '
    cleaned_text = example['text'].replace(nbsp, plain_space)
    example['text'] = cleaned_text
    return example

# text-classification pipeline wrapping the model named in MODEL_NAME
classifier = pipeline(task="text-classification", model=MODEL_NAME)

In [4]:
# The label column is declared as ClassLabel so its integer ids carry class names.
# The names are derived from label_to_int (ordered by id) instead of being typed
# out a second time: the dataset ids and the ids used to decode the model
# predictions therefore cannot drift apart.
features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=sorted(label_to_int, key=label_to_int.get))
})

DELIMITER = ","

# No split is requested, so the loader returns a dict of splits;
# take the only one available, "train".
my_csv_ds_raw = load_dataset('csv', data_files=FILE_NAME, delimiter=DELIMITER, features=features)["train"]

# remove non breaking space (kept under a separate name so that re-running
# this step always starts from the untouched loaded data)
my_csv_ds = my_csv_ds_raw.map(remove_non_breaking_space)

In [5]:
# have a look at one example (the record at index 1) to check that the
# 'text' and 'label' columns were loaded as expected
my_csv_ds[1]

{'text': "Buongiorno sono utente3 volevo chiedere un appuntamento con l'assessore assessore33 se possibile per la prossima settimana grazie Matteo",
 'label': 1}

In [6]:
batch_size = 32  # number of texts passed to the classifier per call

correct = 0
total = 0

for start in range(0, len(my_csv_ds), batch_size):
    # slicing the dataset yields a dict of columns: batch["text"] and
    # batch["label"] are parallel lists
    batch = my_csv_ds[start:start + batch_size]

    # one prediction dict per input text, in the same order as the inputs
    predictions = classifier(batch["text"])

    for prediction, true_label in zip(predictions, batch["label"]):
        total += 1
        # the model answers with a class name; convert it to the dataset's integer id
        if label_to_int[prediction["label"]] == true_label:
            correct += 1

# an empty dataset would otherwise end in an unexplained ZeroDivisionError
if total == 0:
    raise ValueError(f"No examples found in {FILE_NAME!r}: cannot compute accuracy")

accuracy = correct / total

print()
print(f"On the provided dataset, Accuracy is: {accuracy:.2f}")


On the provided dataset, Accuracy is: 1.00
