In [1]:
import os
import pandas as pd
import re
from dotenv import load_dotenv
import openai
from tqdm import tqdm
import time
import json
from ipywidgets import IntProgress
from IPython.display import display
from scipy import stats
from scikit_posthocs import posthoc_dunn
import numpy as np


In [2]:
# Define the path to the results directory
results_dir = "results"

# Function to extract model name from filename
def extract_model_name(filename):
    match = re.search(r"experimental_design_results_(.*)\.csv", filename)
    if match:
        return match.group(1)
    else:
        return "Unknown Model"

# Function to read CSV files from a directory, add 'model' column, and select specific columns
def read_and_label_csvs(directory):
    dfs = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            try:
                df = pd.read_csv(filepath, encoding='utf-8')
            
                model_name = extract_model_name(filename)
                df['model'] = model_name  # Add model name as a column
                
                # Select specific columns
                df = df[['model', 'baseline', 'results']]
                
                dfs.append(df)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    return dfs

# Read and label CSV files from the results directory
df = read_and_label_csvs(results_dir)

In [3]:
# Specify the path to your .env file
dotenv_path = "/mnt/4d4f90e5-f220-481e-8701-f0a546491c35/arquivos/projetos/.env"

# Load the .env file
load_dotenv(dotenv_path=dotenv_path)

# Access and store the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")
model = 'gpt-4o-mini-2024-07-18'

In [4]:
# Initialize progress bar
progress_bar = IntProgress(min=0, max=len(df), description='Evaluating', bar_style='info')
display(progress_bar)

# Ensure df is a DataFrame
if isinstance(df, list):
    print("Converting list of DataFrames into a single DataFrame.")
    df = pd.concat(df, ignore_index=True)

# Iterate over each row of the DataFrame
for index, row in df.iterrows():
    if index % 150 == 0 and index != 0:
        print("min. pause...")
        time.sleep(60)
    
    baseline = row['baseline']
    result = row['results']
    temperature = 0.0
    
    comparison_promt = f"""Avalie as respostas abaixo de 0 a 10, em que 0 representa discordância total e 10 concordância total, considerando os seguintes critérios:

        *Qualidade da Resposta*: A resposta é clara, concisa e bem estruturada ao responder sobre endereços ou CEP?
        *Concordância*: A resposta faz sentido no contexto da pergunta ou instrução? Ela segue uma linha de raciocínio coerente?
        *Precisão/Acurácia*: A resposta apresenta informações corretas ou semelhantes comparadas com o baseline?
        **Alucinação**: A resposta contém informações inventadas ou inconsistentes com a pergunta ou baseline, ou afirmações sem embasamento verificável? Penalize com nota zero qualquer trecho que pareça com conteúdo falso ou fabricado.

        Resposta Base (Baseline): {baseline}
        Resposta do Modelo: {result}

        Atribua uma nota de 0 a 10 para a comparação entre as respostas (Baseline e Modelo), justificando brevemente sua avaliação. 
        Respostas do modelo que indicam falta de acesso a informações específicas e recomendam consultar fontes externas (ex: "Desculpe, mas não tenho acesso a informações específicas como CEPs...") devem ser penalizadas, pois não atingiram o objetivo 
        Retorne a avaliação no formato JSON, com as chaves "quality", "agreement", "accuracy", "hallucination" e "justification". 
        Não penalize a avaliação em caso de repetições no texto.
        """
    
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": comparison_promt}],
            temperature=temperature
        )
        generated_text = response.choices[0].message.content
        
        # Process the JSON response
        metadata_str = generated_text.strip()
        if metadata_str.startswith("```json") and metadata_str.endswith("```"):
            metadata_str = metadata_str.strip("```json").strip()
        
        try:
            data = json.loads(metadata_str)
            # Update DataFrame with the extracted values
            df.loc[index, 'quality'] = data.get('quality')
            df.loc[index, 'agreement'] = data.get('agreement')
            df.loc[index, 'accuracy'] = data.get('accuracy')
            df.loc[index, 'hallucination'] = data.get('hallucination')
            df.loc[index, 'justification'] = data.get('justification')
        
        except json.JSONDecodeError as e:
            # Print error message if JSON decoding fails
            print(f"Error decoding JSON: {e}")
            print(f"Problematic JSON string: {metadata_str}")
            df.loc[index, 'quality'] = None
            df.loc[index, 'agreement'] = None
            df.loc[index, 'accuracy'] = None
            df.loc[index, 'hallucination'] = None
            df.loc[index, 'justification'] = f"JSON Error: {e}"

    except Exception as e:
        # Print error message if there's an issue processing the line
        print(f"Error processing line {index}: {e}")
        df.loc[index, 'quality'] = None
        df.loc[index, 'agreement'] = None
        df.loc[index, 'accuracy'] = None
        df.loc[index, 'hallucination'] = None
        df.loc[index, 'justification'] = f"Error: {e}"
    
    progress_bar.value += 1
        
# Save the updated DataFrame
output_filename = "final_evaluation.csv"
df.to_csv(output_filename, index=False)
print(f"Completed. Results saved in {output_filename}")

IntProgress(value=0, bar_style='info', description='Evaluating', max=3)

Converting list of DataFrames into a single DataFrame.
min. pause...
Completed. Results saved in final_evaluation.csv


In [5]:
df = pd.read_csv("final_evaluation.csv")
df_final = df[['model', 'quality', 'agreement', 'accuracy', 'hallucination']].copy()

# Convert columns to numeric, handling errors with NaN
df_final.loc[:, 'quality'] = pd.to_numeric(df_final['quality'], errors='coerce')
df_final.loc[:, 'agreement'] = pd.to_numeric(df_final['agreement'], errors='coerce')
df_final.loc[:, 'accuracy'] = pd.to_numeric(df_final['accuracy'], errors='coerce')
df_final.loc[:, 'hallucination'] = pd.to_numeric(df_final['hallucination'], errors='coerce')

# Create the 'evaluation' column with the mean of the other three
df_final['evaluation'] = df_final[['quality', 'agreement', 'accuracy']].mean(axis=1)

print(df_final)


                      model  quality  agreement  accuracy  hallucination  \
0    gpt-4o-mini-2024-07-18      3.0        2.0       1.0            0.0   
1    gpt-4o-mini-2024-07-18      3.0        2.0       1.0            0.0   
2    gpt-4o-mini-2024-07-18      3.0        2.0       1.0            0.0   
3    gpt-4o-mini-2024-07-18      3.0        2.0       1.0            0.0   
4    gpt-4o-mini-2024-07-18      3.0        2.0       1.0            0.0   
..                      ...      ...        ...       ...            ...   
295          TeenyTinyLlama      0.0        0.0       0.0           10.0   
296          TeenyTinyLlama      1.0        1.0       0.0            0.0   
297          TeenyTinyLlama      0.0        0.0       0.0           10.0   
298          TeenyTinyLlama      0.0        0.0       0.0           10.0   
299          TeenyTinyLlama      2.0        2.0       1.0            0.0   

     evaluation  
0      2.000000  
1      2.000000  
2      2.000000  
3      2.000000

In [6]:
# H0: The medians of all groups are equal.
# H1: At least one group median is different from the others.


In [7]:
def bootstrap_mean_ci(data, n_iterations=1000, confidence_level=0.95):
    """Calculates the bootstrap confidence interval for the mean."""
    means = []
    for _ in range(n_iterations):
        sample = np.random.choice(data, size=len(data), replace=True)
        means.append(np.mean(sample))
    
    alpha = (1 - confidence_level) / 2
    lower_percentile = alpha * 100
    upper_percentile = (1 - alpha) * 100
    
    lower_bound = np.percentile(means, lower_percentile)
    upper_bound = np.percentile(means, upper_percentile)
    
    return lower_bound, upper_bound


In [8]:
quality = df_final.groupby('model')['quality'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Calculate bootstrap confidence intervals
quality['ci_lower'] = df_final.groupby('model')['quality'].apply(lambda x: bootstrap_mean_ci(x.dropna())[0])
quality['ci_upper'] = df_final.groupby('model')['quality'].apply(lambda x: bootstrap_mean_ci(x.dropna())[1])


# Rename columns for better clarity
summary_quality = quality.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Print the summary DataFrame
print("Summary Quality:")
display(summary_quality)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['quality'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='quality', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Quality:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.29,0.064031,0.0,2.0,0.0,0.17,0.43
TeenyTinyLlama-160m-CEP-ft,1.93,0.084393,0.0,4.0,2.0,1.76,2.09
gpt-4o-mini-2024-07-18,2.89,0.031447,2.0,3.0,3.0,2.82,2.95



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(225.0926659994187), pvalue=np.float64(1.3235751958804944e-49)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,7.576527e-15,2.358944e-50
TeenyTinyLlama-160m-CEP-ft,7.576527e-15,1.0,4.2294e-12
gpt-4o-mini-2024-07-18,2.358944e-50,4.2294e-12,1.0


In [9]:
agreement = df_final.groupby('model')['agreement'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Calculate bootstrap confidence intervals
agreement['ci_lower'] = df_final.groupby('model')['agreement'].apply(lambda x: bootstrap_mean_ci(x.dropna())[0])
agreement['ci_upper'] = df_final.groupby('model')['agreement'].apply(lambda x: bootstrap_mean_ci(x.dropna())[1])

# Rename columns for better clarity
summary_agreement = agreement.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Print the summary DataFrame
print("Summary Agreement:")
display(summary_agreement)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['agreement'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='agreement', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Agreement:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.28,0.063691,0.0,3.0,0.0,0.17,0.42
TeenyTinyLlama-160m-CEP-ft,1.88,0.09458,0.0,5.0,2.0,1.71975,2.07025
gpt-4o-mini-2024-07-18,2.4,0.07521,2.0,4.0,2.0,2.25,2.53025



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(188.59433920501024), pvalue=np.float64(1.1149607586431714e-41)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,1.2344700000000002e-22,2.9267480000000003e-39
TeenyTinyLlama-160m-CEP-ft,1.2344700000000002e-22,1.0,0.00299659
gpt-4o-mini-2024-07-18,2.9267480000000003e-39,0.00299659,1.0


In [10]:
accuracy = df_final.groupby('model')['accuracy'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Calculate bootstrap confidence intervals
accuracy['ci_lower'] = df_final.groupby('model')['accuracy'].apply(lambda x: bootstrap_mean_ci(x.dropna())[0])
accuracy['ci_upper'] = df_final.groupby('model')['accuracy'].apply(lambda x: bootstrap_mean_ci(x.dropna())[1])

# Rename columns for better clarity
summary_accuracy = accuracy.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Print the summary DataFrame
print("Summary Accuracy:")
display(summary_accuracy)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['accuracy'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='accuracy', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Accuracy:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.08,0.027266,0.0,1.0,0.0,0.03,0.14
TeenyTinyLlama-160m-CEP-ft,0.95,0.071598,0.0,5.0,1.0,0.81,1.09025
gpt-4o-mini-2024-07-18,1.14,0.040252,0.0,2.0,1.0,1.06,1.22



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(176.61026509508514), pvalue=np.float64(4.4623970607703897e-39)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,1.217388e-22,4.513025e-36
TeenyTinyLlama-160m-CEP-ft,1.217388e-22,1.0,0.01933945
gpt-4o-mini-2024-07-18,4.513025e-36,0.01933945,1.0


In [11]:
mean_evaluation = df_final.groupby('model')['evaluation'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Calculate bootstrap confidence intervals
mean_evaluation['ci_lower'] = df_final.groupby('model')['evaluation'].apply(lambda x: bootstrap_mean_ci(x.dropna())[0])
mean_evaluation['ci_upper'] = df_final.groupby('model')['evaluation'].apply(lambda x: bootstrap_mean_ci(x.dropna())[1])

# Rename columns for better clarity
summary_mean_evaluation = mean_evaluation.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Print the summary DataFrame
print("Summary Mean Evaluation:")
display(summary_mean_evaluation)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['evaluation'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='evaluation', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Mean Evaluation:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.216667,0.05,0.0,2.0,0.0,0.123333,0.326667
TeenyTinyLlama-160m-CEP-ft,1.586667,0.077723,0.0,4.0,1.666667,1.43325,1.74
gpt-4o-mini-2024-07-18,2.143333,0.038854,1.333333,3.0,2.0,2.07,2.226667



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(219.85266905800222), pvalue=np.float64(1.818023523177442e-48)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,1.789714e-14,3.242604e-49
TeenyTinyLlama-160m-CEP-ft,1.789714e-14,1.0,6.860995e-12
gpt-4o-mini-2024-07-18,3.242604e-49,6.860995e-12,1.0


In [12]:
hallucination = df_final.groupby('model')['hallucination'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Calculate bootstrap confidence intervals
hallucination['ci_lower'] = df_final.groupby('model')['hallucination'].apply(lambda x: bootstrap_mean_ci(x.dropna())[0])
hallucination['ci_upper'] = df_final.groupby('model')['hallucination'].apply(lambda x: bootstrap_mean_ci(x.dropna())[1])

# Rename columns for better clarity
summary_hallucination = hallucination.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Print the summary DataFrame
print("Summary hallucination:")
display(summary_hallucination)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['hallucination'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='hallucination', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary hallucination:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,8.5,0.35887,0.0,10.0,10.0,7.7,9.2
TeenyTinyLlama-160m-CEP-ft,1.2,0.326599,0.0,10.0,0.0,0.6,1.8
gpt-4o-mini-2024-07-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(192.81407749733395), pvalue=np.float64(1.3519326443107226e-42)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,9.474597e-28,3.418742e-37
TeenyTinyLlama-160m-CEP-ft,9.474597e-28,1.0,0.2104029
gpt-4o-mini-2024-07-18,3.418742e-37,0.2104029,1.0


In [13]:
print("Summary Quality:")
display(summary_quality)

print("Summary Agreement:")
display(summary_agreement)

print("Summary Accuracy:")
display(summary_accuracy)

print("Summary Mean Evaluation: (quality + agreement + acurracy) / 3")
display(summary_mean_evaluation)

print("Summary hallucination:")
display(summary_hallucination)


Summary Quality:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.29,0.064031,0.0,2.0,0.0,0.17,0.43
TeenyTinyLlama-160m-CEP-ft,1.93,0.084393,0.0,4.0,2.0,1.76,2.09
gpt-4o-mini-2024-07-18,2.89,0.031447,2.0,3.0,3.0,2.82,2.95


Summary Agreement:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.28,0.063691,0.0,3.0,0.0,0.17,0.42
TeenyTinyLlama-160m-CEP-ft,1.88,0.09458,0.0,5.0,2.0,1.71975,2.07025
gpt-4o-mini-2024-07-18,2.4,0.07521,2.0,4.0,2.0,2.25,2.53025


Summary Accuracy:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.08,0.027266,0.0,1.0,0.0,0.03,0.14
TeenyTinyLlama-160m-CEP-ft,0.95,0.071598,0.0,5.0,1.0,0.81,1.09025
gpt-4o-mini-2024-07-18,1.14,0.040252,0.0,2.0,1.0,1.06,1.22


Summary Mean Evaluation: (quality + agreement + acurracy) / 3


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.216667,0.05,0.0,2.0,0.0,0.123333,0.326667
TeenyTinyLlama-160m-CEP-ft,1.586667,0.077723,0.0,4.0,1.666667,1.43325,1.74
gpt-4o-mini-2024-07-18,2.143333,0.038854,1.333333,3.0,2.0,2.07,2.226667


Summary hallucination:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,8.5,0.35887,0.0,10.0,10.0,7.7,9.2
TeenyTinyLlama-160m-CEP-ft,1.2,0.326599,0.0,10.0,0.0,0.6,1.8
gpt-4o-mini-2024-07-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0
