### Libraries and environment setup

In [2]:
import os
# Point the Hugging Face cache locations at a project-local directory and
# restrict CUDA to physical device 4. This runs before torch / transformers
# are imported in a later cell.
os.environ.update({
    'TRANSFORMERS_CACHE': '../hfcache',
    'HF_HOME': '../hfcache',
    'CUDA_VISIBLE_DEVICES': '4',
})

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
import pandas as pd
import numpy as np
import torch, gc, pprint

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from evaluate import load

In [5]:
# Filesystem locations shared by the cells below.
cache_dir = '../hfcache'
data_dir = "../data"

# Prefer the first visible CUDA device and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

### Evaluation (BERTScore and BLEURT)

In [6]:
# Hugging Face model id -> short tag. The tag selects the "<tag>_gen.csv" file
# to score and is the key under which that model's scores are stored.
# Insertion order is the order in which the generators are evaluated.
model_names = dict([
    ("nvidia/Llama3-ChatQA-1.5-8B", "nvidia_llama3_8b"),        # context 8k
    ("Qwen/Qwen2.5-7B-Instruct", "qwen2.5_7b"),                 # context 32k
    ("microsoft/Phi-3.5-mini-instruct", "phi3.5_mini"),         # context 128k
    ("mistralai/Mistral-Nemo-Instruct-2407", "mistral_nemo_12b"),  # context 128k
])

In [7]:
# BLEURT: load the checkpoint as a sequence-classification model placed
# directly on `device`, together with its tokenizer.
BLEURT_CHECKPOINT = "Elron/bleurt-large-512"
model = AutoModelForSequenceClassification.from_pretrained(BLEURT_CHECKPOINT, cache_dir=cache_dir, device_map=device)
# The tokenizer holds no weights, so it takes no device_map; the tensors it
# produces are moved to `device` when the inputs are built in the scoring loop.
tokenizer = AutoTokenizer.from_pretrained(BLEURT_CHECKPOINT, cache_dir=cache_dir)
model.eval()  # evaluation mode (e.g. dropout disabled) for deterministic scoring

# BERTScore: metric wrapper loaded through the `evaluate` library.
bertscore = load("bertscore", module_type="metric")

config.json:   0%|          | 0.00/780 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/322 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
# Number of (reference, generation) pairs sent through BLEURT per forward pass.
# Scoring a whole CSV in one call makes GPU memory grow with the dataset size
# and can run out of memory on larger files.
BLEURT_BATCH_SIZE = 32

# tag -> {'precision', 'recall', 'f1', 'bleurt'}, each averaged over the file's rows.
results_score = {}
for model_name, title in model_names.items():
    # Each generator's outputs are read from <data_dir>/<title>_gen.csv, with the
    # reference text in column "True" and the generated text in column "Gen".
    path = os.path.join(data_dir, f"{title}_gen.csv")
    # keep_default_na=False keeps empty or "NA"-like cells as strings instead of
    # float NaN, which the tokenizer cannot consume; astype(str) covers columns
    # that pandas would otherwise infer as numeric.
    data = pd.read_csv(path, keep_default_na=False)
    references = data["True"].astype(str).tolist()
    predictions = data["Gen"].astype(str).tolist()

    # BERTScore precision / recall / F1, one value per row.
    bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")

    # BLEURT: score (reference, generation) pairs in mini-batches and gather the
    # per-row scores on the CPU.
    batch_scores = []
    with torch.no_grad():
        for start in range(0, len(references), BLEURT_BATCH_SIZE):
            stop = start + BLEURT_BATCH_SIZE
            inputs = tokenizer(references[start:stop], predictions[start:stop],
                               return_tensors='pt', padding=True, truncation=True).to(device)
            # Flatten to 1-D (rather than a bare squeeze) so a final batch holding a
            # single row still yields a 1-D tensor that torch.cat accepts.
            batch_scores.append(model(**inputs)[0].reshape(-1).cpu())
            del inputs
    scores = torch.cat(batch_scores)

    results_score[title] = {
        'precision': np.mean(bertscore_results['precision']).item(),
        'recall': np.mean(bertscore_results['recall']).item(),
        'f1': np.mean(bertscore_results['f1']).item(),
        'bleurt': scores.mean().item()
    }

    # Release per-model tensors before moving on to the next CSV.
    del batch_scores, scores
    gc.collect()
    torch.cuda.empty_cache()

In [13]:
# Show the per-model score table (keys sorted by pprint's default formatting).
print(pprint.pformat(results_score))

{'mistral_nemo_12b': {'bleurt': -0.08434377610683441,
                      'f1': 0.8664473816752434,
                      'precision': 0.8384463673830033,
                      'recall': 0.8969514963030815},
 'nvidia_llama3_8b': {'bleurt': -0.18476881086826324,
                      'f1': 0.9070042352378368,
                      'precision': 0.9294992500543594,
                      'recall': 0.8864519880712032},
 'phi3.5_mini': {'bleurt': 0.00402071001008153,
                 'f1': 0.8730380964279175,
                 'precision': 0.8471604603528976,
                 'recall': 0.9009352257847786},
 'qwen2.5_7b': {'bleurt': -0.058741774410009384,
                'f1': 0.8638190342485905,
                'precision': 0.8339289693534374,
                'recall': 0.8964014400541782}}
