#### Run evaluation on existing small unsloth models

In [14]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Hugging Face access token, taken from the environment after the .env load.
hf_token = os.getenv("HF_TOKEN", "")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
else:
    # Do not export an empty HF_TOKEN: that would hide the fact that no
    # credential is configured until a later model download is rejected.
    print(
        "WARNING: HF_TOKEN is not set; downloads of gated or private "
        "Hugging Face models may fail."
    )


In [13]:
from lm_eval import evaluator
import torch

# Run the lm-eval harness on the chosen checkpoint and keep the full results
# dictionary in `res` for the cells below.
res = evaluator.simple_evaluate(
    batch_size=64,
    model="hf",
    model_args="pretrained=google/gemma-3-1b-it",
    # model_args="pretrained=mistralai/Mistral-7B-Instruct-v0.3,dtype=float16",
    apply_chat_template=True,
    # tasks=["_ro_winogrande", "_ro_belebele", "ro_wiki", "_ro_mmlu"],
    tasks=["arc_challenge", "winogrande"],
    # The run log of this cell shows the hf backend answering the former
    # device='auto' with "Device not specified" followed by a CUDA probe, i.e.
    # 'auto' was not taken as a device name. Pick the device explicitly so the
    # choice is visible here; this selects the same CUDA-else-CPU device.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # dtype=torch.float16,
    # NOTE(review): limit=1 restricts how many documents are evaluated per task,
    # so the reported metrics are only a smoke test — raise or remove it for
    # real numbers.
    limit=1,
    # verbosity="DEBUG",
    # log_samples=True,       # Ensure this is True
    # write_out=True,         # Save model inputs/outputs to disk for inspection
    num_fewshot=5,
)

2025-11-08:18:51:58,293 INFO     [evaluator.py:158] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2025-11-08:18:51:58,293 INFO     [evaluator.py:195] Initializing hf model, with arguments: {'pretrained': 'google/gemma-3-1b-it'}
2025-11-08:18:51:58,294 INFO     [huggingface.py:178] Device not specified
2025-11-08:18:51:58,294 INFO     [huggingface.py:179] Cuda Available? False
2025-11-08:18:52:00,061 INFO     [__init__.py:491] `group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. `tag` will be used to allow to call a collection of tasks just like `group`. `group` will be removed in order to not cause confusion with the new ConfigurableGroup which will be the offical way to create groups with addition of group-wide configuations.
2025-11-08:18:52:01,984 INFO     [__init__.py:512] The tag xnli is already registered as a group, this tag will not be registered. This may affect tasks you want to cal

In [12]:
# Display the raw results dictionary produced by `evaluator.simple_evaluate`.
res

{'results': {'arc_challenge': {'alias': 'arc_challenge',
   'acc,none': 0.5,
   'acc_stderr,none': 0.5,
   'acc_norm,none': 0.0,
   'acc_norm_stderr,none': 0.0}},
 'group_subtasks': {'arc_challenge': []},
 'configs': {'arc_challenge': {'task': 'arc_challenge',
   'tag': ['ai2_arc'],
   'dataset_path': 'allenai/ai2_arc',
   'dataset_name': 'ARC-Challenge',
   'training_split': 'train',
   'validation_split': 'validation',
   'test_split': 'test',
   'doc_to_text': 'Question: {{question}}\nAnswer:',
   'doc_to_target': '{{choices.label.index(answerKey)}}',
   'doc_to_choice': '{{choices.text}}',
   'description': '',
   'target_delimiter': ' ',
   'fewshot_delimiter': '\n\n',
   'num_fewshot': 5,
   'metric_list': [{'metric': 'acc',
     'aggregation': 'mean',
     'higher_is_better': True},
    {'metric': 'acc_norm', 'aggregation': 'mean', 'higher_is_better': True}],
   'output_type': 'multiple_choice',
   'repeats': 1,
   'should_decontaminate': True,
   'doc_to_decontamination_query':

In [18]:
import pandas as pd
from typing import Dict, Any
import re

def extract_final_answer(answer_text: str) -> str:
    """Return the number that follows the ``####`` marker of a GSM8K-style answer.

    The captured run may hold digits, dots and commas with an optional leading
    minus sign and is returned verbatim (whitespace-stripped). ``"N/A"`` is
    returned when no such marker/number pair is present.
    """
    found = re.search(r'####\s*(\-?[\d\.\,]+)', answer_text)
    return found.group(1).strip() if found else "N/A"

def print_evaluation_results(res: Dict[str, Any]) -> None:
    """Pretty-print the headline metrics and sample counts of an evaluation run.

    Args:
        res: Results dictionary. ``res['results']`` maps each task name to a
            dict of metric name -> value (the ``'alias'`` entry is skipped).
            ``res['n-samples']``, when present, maps each task name to a dict
            with ``'effective'`` and ``'original'`` counts; when it is absent
            the sample-statistics section is printed without entries.

    Returns:
        None. Everything is written to stdout.
    """
    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("EVALUATION RESULTS SUMMARY")
    print(banner)

    # Print overall metrics
    print("\n📊 Overall Metrics:")
    print(rule)
    for task_name, metrics in res['results'].items():
        print(f"\n🔹 Task: {task_name}")
        for metric_name, value in metrics.items():
            if metric_name == 'alias':
                continue
            # Numbers get four decimals; anything else (e.g. a textual
            # placeholder) is printed as-is.
            shown = f"{value:.4f}" if isinstance(value, (float, int)) else value
            print(f"  • {metric_name}: {shown}")

    # Print sample counts (tolerate results that carry no 'n-samples' entry)
    print("\n\n📈 Sample Statistics:")
    print(rule)
    for task_name, counts in res.get('n-samples', {}).items():
        print(f"🔹 {task_name}: {counts['effective']} / {counts['original']} samples evaluated")

    print("\n" + banner)


def create_evaluation_dataframes(res: Dict[str, Any]) -> Dict[str, pd.DataFrame]:
    """Convert per-sample evaluation logs into one DataFrame per task.

    Args:
        res: Results dictionary providing ``res['samples']`` (task name -> list
            of sample dicts), ``res['configs']`` (task name -> task config) and
            ``res['n-shot']`` (task name -> number of few-shot examples).

    Returns:
        Dict mapping each task name to a DataFrame with one row per sample.
        Every row has ``doc_id``, ``dataset`` and ``num_fewshot``. Docs with a
        ``'question'`` key additionally get the question/answer columns, docs
        with a ``'sentence'`` key get the two-option columns; other docs keep
        only the common columns.

    Raises:
        KeyError: if ``res`` lacks one of the sections listed above.
    """
    dataframes: Dict[str, pd.DataFrame] = {}

    for task_name, samples in res['samples'].items():
        task_config = res['configs'][task_name]
        rows = []

        for sample in samples:
            doc = sample['doc']

            # Common fields
            row = {
                'doc_id': sample['doc_id'],
                'dataset': task_config.get('dataset_path', 'N/A'),
                'num_fewshot': res['n-shot'][task_name],
            }

            # Task-specific fields
            if 'question' in doc:  # GSM8K-style docs (other docs may carry 'question' too)
                target_answer = sample['target']
                if isinstance(target_answer, str):
                    correct_answer = extract_final_answer(target_answer)
                else:
                    # Non-string targets would make the regex search raise
                    # TypeError; report them as-is instead of parsing a
                    # '####' marker.
                    correct_answer = str(target_answer)

                filtered = sample.get('filtered_resps')
                model_response = str(filtered[0]) if filtered else ''

                row.update({
                    'question': doc['question'],
                    'target_answer': target_answer,
                    'correct_answer': correct_answer,
                    'model_response': model_response,
                    'exact_match': sample.get('exact_match', None),
                })

            elif 'sentence' in doc:  # Winogrande
                # assumes each entry of sample['resps'] is indexable as r[0][0],
                # yielding that option's log-probability — TODO confirm
                log_probs = [r[0][0] for r in sample['resps']] if 'resps' in sample else []
                # 1-based index of the highest-scoring option, None without scores
                model_choice = log_probs.index(max(log_probs)) + 1 if log_probs else None

                # Determine which option is correct
                correct_answer_num = int(doc['answer'])
                correct_option = doc[f'option{correct_answer_num}']

                # Did the highest-scoring option match the labelled answer?
                model_generated_correct = (model_choice == correct_answer_num) if model_choice else False

                row.update({
                    'sentence': doc['sentence'],
                    'option1': doc['option1'],
                    'option2': doc['option2'],
                    'correct_answer': correct_answer_num,
                    'correct_option_text': correct_option,
                    'model_choice': model_choice,
                    'model_chose_correct': model_generated_correct,
                    'accuracy': sample.get('acc', None),
                    'log_prob_option1': log_probs[0] if len(log_probs) > 0 else None,
                    'log_prob_option2': log_probs[1] if len(log_probs) > 1 else None,
                })

            rows.append(row)

        dataframes[task_name] = pd.DataFrame(rows)

    return dataframes


# Usage: print the headline summary of the current `res`.
print_evaluation_results(res)

# Create task-specific DataFrames
dfs = create_evaluation_dataframes(res)

# Display and save each DataFrame
for task_name, df in dfs.items():
    print(f"\n\n{'='*80}")
    print(f"📊 DATAFRAME FOR TASK: {task_name}")
    print("=" * 80)
    display(df)

    # Save to CSV (relative path, one file per task)
    output_file = f'evaluation_results_{task_name}.csv'
    df.to_csv(output_file, index=False)
    print(f"\n✅ Results saved to: {output_file}")

    # Show summary statistics
    print(f"\n📈 SUMMARY STATISTICS FOR {task_name}:")
    print("-" * 80)

    # Which score column exists depends on the branch taken when the rows
    # were built: 'exact_match' for question docs, 'accuracy' for sentence docs.
    if 'exact_match' in df.columns:
        print(f"Mean Exact Match: {df['exact_match'].mean():.4f}")
        print(f"Total Correct: {df['exact_match'].sum()} / {len(df)}")
    elif 'accuracy' in df.columns:
        print(f"Mean Accuracy: {df['accuracy'].mean():.4f}")
        print(f"Total Correct: {df['accuracy'].sum()} / {len(df)}")

    if 'model_chose_correct' in df.columns:
        print(f"Model Chose Correct: {df['model_chose_correct'].sum()} / {len(df)}")

    print(f"Total Samples: {len(df)}")

# Access individual DataFrames
print("\n\n" + "=" * 80)
print("📦 Available DataFrames:")
print("=" * 80)
for task_name in dfs.keys():
    print(f"  • dfs['{task_name}'] - {len(dfs[task_name])} samples")

EVALUATION RESULTS SUMMARY

📊 Overall Metrics:
--------------------------------------------------------------------------------

🔹 Task: _ro_gsm8k
  • exact_match,strict-match: 0.0000
  • exact_match_stderr,strict-match: 0.0000

🔹 Task: _ro_winogrande
  • acc,none: 0.7000
  • acc_stderr,none: 0.1528


📈 Sample Statistics:
--------------------------------------------------------------------------------
🔹 _ro_gsm8k: 10 / 1319 samples evaluated
🔹 _ro_winogrande: 10 / 1267 samples evaluated



📊 DATAFRAME FOR TASK: _ro_gsm8k


Unnamed: 0,doc_id,dataset,num_fewshot,question,target_answer,correct_answer,model_response,exact_match
0,0,OpenLLM-Ro/ro_gsm8k,5,Rațele lui Janet depun 16 ouă pe zi. Ea mănânc...,Janet vinde 16 - 3 - 4 = <<16-3-4=9>>9 ouă de ...,18,[invalid],0.0
1,1,OpenLLM-Ro/ro_gsm8k,5,O robă ia 2 bolțuri de fibră albastră și jumăt...,Este nevoie de 2/2=<<2/2=1>> 1 bolț de fibre a...,3,2,0.0
2,2,OpenLLM-Ro/ro_gsm8k,5,Josh decide să încerce să răstoarne o casă. E...,Costul casei și reparații a ieșit la 80.000+50...,70000,[invalid],0.0
3,3,OpenLLM-Ro/ro_gsm8k,5,James decide să alerge 3 sprinturi de 3 ori pe...,"El sprintează de 3*3=<<3*3=9>>9 ori\nDeci, el ...",540,[invalid],0.0
4,4,OpenLLM-Ro/ro_gsm8k,5,"În fiecare zi, Wendi hrănește fiecare dintre p...",Dacă fiecare pui mănâncă 3 cești de hrană pe z...,20,[invalid],0.0
5,5,OpenLLM-Ro/ro_gsm8k,5,Kylar s-a dus la magazin să cumpere ochelari p...,Prețul de discount al unui pahar este 60/100 *...,64,[invalid],0.0
6,6,OpenLLM-Ro/ro_gsm8k,5,Toulouse are de două ori mai multe oi decât Ch...,"Dacă Seattle are 20 de oi, Charleston are 4 * ...",260,[invalid],0.0
7,7,OpenLLM-Ro/ro_gsm8k,5,Carla descarcă un fișier de 200 GB. În mod nor...,Mai întâi găsiți câte gigaocteți sunt în 40% d...,160,[invalid],0.0
8,8,OpenLLM-Ro/ro_gsm8k,5,John conduce timp de 3 ore la o viteză de 60 m...,Când s-a întors era 3*60=<<3*60=180>>180 mile ...,45,[invalid],0.0
9,9,OpenLLM-Ro/ro_gsm8k,5,Rata Elizei pe oră pentru primele 40 de ore în...,Eliza are dreptul la 45 -40 = <<45-40=5>> 5 or...,460,[invalid],0.0



✅ Results saved to: evaluation_results__ro_gsm8k.csv

📈 SUMMARY STATISTICS FOR _ro_gsm8k:
--------------------------------------------------------------------------------
Mean Exact Match: 0.0000
Total Correct: 0.0 / 10
Total Samples: 10


📊 DATAFRAME FOR TASK: _ro_winogrande


Unnamed: 0,doc_id,dataset,num_fewshot,sentence,option1,option2,correct_answer,correct_option_text,model_choice,model_chose_correct,accuracy,log_prob_option1,log_prob_option2
0,0,OpenLLM-Ro/ro_winogrande,5,Sarah a fost un chirurg mult mai bun decât Mar...,Sarah,Maria,2,Maria,1,False,0.0,-55.0,-58.5
1,1,OpenLLM-Ro/ro_winogrande,5,Sarah a fost un chirurg mult mai bun decât Mar...,Sarah,Maria,1,Sarah,1,True,1.0,-52.75,-56.25
2,2,OpenLLM-Ro/ro_winogrande,5,Erau îngrijorați că vinul va strica patul și p...,pătură,pat,2,pat,1,False,0.0,-40.75,-44.25
3,3,OpenLLM-Ro/ro_winogrande,5,Terry a încercat să coacă vinetele în cuptorul...,vânătă,prăjitor de pâine,1,vânătă,2,False,0.0,-29.625,-28.625
4,4,OpenLLM-Ro/ro_winogrande,5,"Noaptea, Jeffrey stă mereu treaz mai târziu de...",Jeffrey,Vânător,1,Jeffrey,1,True,1.0,-34.75,-40.0
5,5,OpenLLM-Ro/ro_winogrande,5,"Pisica lui Sarah are niște probleme cu gura, a...",Sarah,Maria,1,Sarah,1,True,1.0,-48.25,-49.5
6,6,OpenLLM-Ro/ro_winogrande,5,Casa pe care o aveau părinții mei când eram la...,domiciliu,casă,1,domiciliu,1,True,1.0,-35.75,-37.75
7,7,OpenLLM-Ro/ro_winogrande,5,Casa pe care o aveau părinții mei când eram la...,domiciliu,casă,2,casă,2,True,1.0,-43.0,-42.75
8,8,OpenLLM-Ro/ro_winogrande,5,"Natalie are un soț bogat și o mulțime de bani,...",Natalie,Jennifer,2,Jennifer,2,True,1.0,-48.0,-45.75
9,9,OpenLLM-Ro/ro_winogrande,5,Joe s-a dus imediat la brutărie în fața băncii...,brutărie,bancă,1,brutărie,1,True,1.0,-56.0,-58.5



✅ Results saved to: evaluation_results__ro_winogrande.csv

📈 SUMMARY STATISTICS FOR _ro_winogrande:
--------------------------------------------------------------------------------
Mean Accuracy: 0.7000
Total Correct: 7.0 / 10
Model Chose Correct: 7 / 10
Total Samples: 10


📦 Available DataFrames:
  • dfs['_ro_gsm8k'] - 10 samples
  • dfs['_ro_winogrande'] - 10 samples


## Evaluation Summary - Gemma 3 1B - PT

📊 Overall Metrics:
--------------------------------------------------------------------------------

🔹 Task: _ro_gsm8k
  • exact_match,strict-match: 0.0000
  • exact_match_stderr,strict-match: 0.0000

🔹 Task: _ro_winogrande
  • acc,none: 0.5000
  • acc_stderr,none: 0.1667


📈 Sample Statistics:
--------------------------------------------------------------------------------
🔹 _ro_gsm8k: 10 / 1319 samples evaluated
🔹 _ro_winogrande: 10 / 1267 samples evaluated

================================================================================

## EVALUATION RESULTS SUMMARY - Phi3 Mini

📊 Overall Metrics:
--------------------------------------------------------------------------------

🔹 Task: _ro_gsm8k
  • exact_match,strict-match: 0.0500
  • exact_match_stderr,strict-match: 0.0060

🔹 Task: _ro_winogrande
  • acc,none: 0.5091
  • acc_stderr,none: 0.0141


📈 Sample Statistics:
--------------------------------------------------------------------------------
🔹 _ro_gsm8k: 1319 / 1319 samples evaluated
🔹 _ro_winogrande: 1267 / 1267 samples evaluated

================================================================================


================================================================================
📊 DATAFRAME FOR TASK: _ro_gsm8k
================================================================================