# Evaluation Process

A evaluation system for comparing Large Language Models using bronze-silver-gold data architecture.

In [1]:
import pandas as pd
import ast
import json
from collections import Counter
import traceback

For this program we use pandas to manipulate de datasets, counters to help calculate the required metrics

In [2]:
names = ['flan_default', 'flan_treatment_mode', 'gemma3_default', 'gemma3_treatment_mode', 'llama3_default', 'llama3_treatment_mode', 
         'mistral_default', 'mistral_treatment_mode', 'qwen3_default', 'qwen3_treatment_mode']

In [3]:
def parse_cell(val):
    if pd.isna(val) or not str(val).strip():
        return None
    s = str(val).strip().replace('\\"', "'")
    
    for parser in [json.loads, ast.literal_eval]:
        try: 
            return parser(s)
        except: 
            pass
    
    s_clean = s
    if len(s) >= 6 and ((s[:3] == '"""' and s[-3:] == '"""') or (s[:3] == "'''" and s[-3:] == "'''")):
        s_clean = s[3:-3]
    elif len(s) >= 2 and ((s[0] == '"' and s[-1] == '"') or (s[0] == "'" and s[-1] == "'")):
        s_clean = s[1:-1]
    
    s_clean = s_clean.replace('""', '"')
    
    for parser in [json.loads, ast.literal_eval]:
        try: 
            return parser(s_clean)
        except: 
            pass
    
    return s_clean

The `parse_cell` function parse the contents of each cell from strings with have '"' misplaced, into python objects 

In [4]:
def normalize_value(x):
    if x is None: return None
    if isinstance(x, str): return x.strip().lower()
    if isinstance(x, float) and x.is_integer():return int(x)
    return x        

def normalize_structure(d):
    if d is None: return None
    if isinstance(d, dict): return {normalize_value(k): normalize_structure(v) for k, v in d.items()}
    if isinstance(d, list): return [normalize_structure(x) for x in d]
    return normalize_value(d)

The function `normalize_value` padronize values of text and number so they don't conflict or be calculated as different. And the `normalize_structure` function corrects and padronize dictionaries and lists, so they be equal and standardized.

In [5]:
def compare_values(proc, exp):
    proc_empty = not proc or (isinstance(proc, str) and not proc.strip())
    exp_empty = not exp or (isinstance(exp, str) and not exp.strip())

    if exp_empty and proc_empty: return "TN"
    if not exp_empty and not proc_empty: return "TP" if proc == exp else "FP"
    if not exp_empty and proc_empty: return "FN"
    return "FP"

The function `compare_values` compares the two values from the processed and expected side of the tables, and then return if the processed is a True Negative, True Positive, False positive and False Negative from the exected.

In [6]:
def calculate_metrics(counter):
    TP, FP, FN = counter.get("TP", 0), counter.get("FP", 0), counter.get("FN", 0)
    precision = TP / (TP + FP) if (FP + TP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return {"precision": precision, "recall": recall, "f1":f1}

In [7]:
def evaluate(df):
    pairs = [
        ("processed_intent", "expected_intent"),
        ("processed_class", "expected_class"), 
        ("processed_attributes", "expected_attributes"),
        ("processed_filter_attributes", "expected_filter_attributes"),
        ]
    
    counts = {p[0]: Counter() for p in pairs}
    global_counter = Counter()

    for _, row in df.iterrows():
        for proc_col, exp_col in pairs:
            try:
                proc_val = normalize_structure(parse_cell(row.get(proc_col)))
                exp_val = normalize_structure(parse_cell(row.get(exp_col)))
                res = compare_values(proc_val, exp_val)
                counts[proc_col][res] += 1
                global_counter[res] += 1
            except Exception as e:
                counts[proc_col]["FP"] += 1
                global_counter["FP"] += 1
                continue

    return counts, global_counter

In [8]:
def clean_dataset(df):
    df_clean = df.copy()
    
    attr_cols = [c for c in ["expected_attributes", "expected_filter_attributes", 
                            "processed_attributes", "processed_filter_attributes"] 
                if c in df_clean.columns]
    
    for col in attr_cols:
        df_clean[col] = df_clean[col].apply(lambda x: x.replace('\\"', "'") if isinstance(x, str) else x)
    
    critical_cols = [c for c in ['expected_intent', 'expected_class'] if c in df_clean.columns]
    for col in critical_cols:
        df_clean = df_clean[df_clean[col].notna()]
    
    string_cols = [ c for c in ['expected_intent', 'expected_class',
                                 'processed_intent', 'processed_class'] if c in df_clean.columns]
    for col in string_cols:
        df_clean[col] = df_clean[col].apply(lambda x: x.strip() if isinstance(x, str) else x)
    
    for col in attr_cols:
        df_clean[col] = df_clean[col].apply(lambda x: parse_cell(x) if isinstance(x, str) else x)
    
    return df_clean

In [9]:
def save_all_metrics(all_metrics):
    detailed_rows = []
    global_rows = []

    for name, (counts, global_counter) in all_metrics.items():
        for col, counter in counts.items():
            metrics = calculate_metrics(counter)
            detailed_rows.append({
                'model': name, 'column': col, **metrics, 'TP': counter['TP'],
                'FP': counter['FP'], 'FN': counter['FN'], 'TN': counter['TN']
            })

        global_metrics = calculate_metrics(global_counter)
        global_rows.append({
            'model': name, **global_metrics, 'TP': global_counter['TP'],
            'FP': global_counter['FP'], 'FN': global_counter['FN'], 'TN': global_counter['TN']
        })
        
    df1 = pd.DataFrame(detailed_rows)
    df2 = pd.DataFrame(global_rows)

    display(df1)
    display(df2)

    pd.DataFrame(detailed_rows).to_csv('results/all_detailed_metrics.csv', index=False)
    pd.DataFrame(global_rows).to_csv('results/all_global_metrics.csv', index=False)

In [10]:
def main():
    all_metrics = {}

    for name in names:
        try:
            df_bronze = pd.read_csv(f'datasets/bronze/{name}.csv', dtype=str)
            df_silver = clean_dataset(df_bronze)
            df_silver.to_csv(f'datasets/silver/{name}_cleaned.csv', index=False)
            counts, global_counter = evaluate(df_silver)
            all_metrics[name] = (counts, global_counter)
        except Exception as e:
            print(f"Error processing {name}: {e}")
            traceback.print_exc()
    
    if all_metrics:
        save_all_metrics(all_metrics)
    else:
        print("no models proecessed successfully")

if __name__ == "__main__":
    main()

Unnamed: 0,model,column,precision,recall,f1,TP,FP,FN,TN
0,flan_default,processed_intent,0.987931,0.947107,0.967089,573,7,32,0
1,flan_default,processed_class,0.358131,0.858921,0.505495,207,371,34,0
2,flan_default,processed_attributes,0.004158,0.051282,0.007692,2,479,37,94
3,flan_default,processed_filter_attributes,0.0,0.0,0.0,0,88,209,315
4,flan_treatment_mode,processed_intent,0.993913,0.967824,0.980695,1143,7,38,0
5,flan_treatment_mode,processed_class,0.729038,0.937571,0.820258,826,307,55,0
6,flan_treatment_mode,processed_attributes,0.491972,0.725888,0.586466,429,443,162,154
7,flan_treatment_mode,processed_filter_attributes,0.569767,0.092453,0.159091,49,37,481,621
8,gemma3_default,processed_intent,1.0,0.989899,0.994924,1176,0,12,0
9,gemma3_default,processed_class,0.932823,0.989179,0.960175,1097,79,12,0


Unnamed: 0,model,precision,recall,f1,TP,FP,FN,TN
0,flan_default,0.452808,0.714808,0.554413,782,945,312,409
1,flan_treatment_mode,0.755014,0.768772,0.761831,2447,794,736,775
2,gemma3_default,0.824897,0.767249,0.795029,2591,550,786,825
3,gemma3_treatment_mode,0.791421,0.771281,0.781221,2546,671,755,780
4,llama3_default,0.836701,0.61429,0.70845,2029,396,1274,1053
5,llama3_treatment_mode,0.795963,0.760929,0.778052,2524,647,793,788
6,mistral_default,0.604021,0.538405,0.569329,1472,965,1262,1053
7,mistral_treatment_mode,0.780403,0.760749,0.770451,2477,697,779,799
8,qwen3_default,0.465872,0.471887,0.46886,1133,1299,1268,1052
9,qwen3_treatment_mode,0.735621,0.742463,0.739026,2315,832,803,802
