In [2]:
import os
import pandas as pd
import re

# Define the path to the results directory
results_dir = "results"

# Function to extract model name from filename
def extract_model_name(filename):
    match = re.search(r"experimental_design_results_(.*)\.csv", filename)
    if match:
        return match.group(1)
    else:
        return "Unknown Model"

# Function to read CSV files from a directory, add 'model' column, and select specific columns
def read_and_label_csvs(directory):
    dfs = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            try:
                df = pd.read_csv(filepath, encoding='utf-8')
            
                model_name = extract_model_name(filename)
                df['model'] = model_name  # Add model name as a column
                
                # Select specific columns
                df = df[['model', 'baseline', 'results']]
                
                dfs.append(df)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    return dfs

# Read and label CSV files from the results directory
df = read_and_label_csvs(results_dir)

In [3]:
import os
from dotenv import load_dotenv
import openai
from tqdm import tqdm
import time

# Specify the path to your .env file
dotenv_path = "/mnt/4d4f90e5-f220-481e-8701-f0a546491c35/arquivos/projetos/.env"

# Load the .env file
load_dotenv(dotenv_path=dotenv_path)

# Access and store the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")
model = 'gpt-4o-mini-2024-07-18'

In [4]:
import json
from ipywidgets import IntProgress
from IPython.display import display
import pandas as pd

# Initialize progress bar
progress_bar = IntProgress(min=0, max=len(df), desc='Avaliando', bar_style='info')
display(progress_bar)

# Ensure df is a pandas DataFrame
if isinstance(df, list):
    print("Converting list of DataFrames to a single DataFrame.")
    df = pd.concat(df, ignore_index=True)

# Iterate over each row and make API call
for index, row in df.iterrows():
    if index % 100 == 0 and index != 0:
        print("min. pause...")
        time.sleep(60)
    baseline = row['baseline']
    result = row['results']
    temperature = 0.0
    comparison_promt = f"""Avalie as respostas abaixo de 0 a 10, considerando os seguintes critérios:

        *Qualidade da Resposta*: A resposta é clara, concisa e bem estruturada? Ela aborda diretamente a questão ou tarefa proposta?
        *Concordância*: A resposta faz sentido no contexto da pergunta ou instrução? Ela segue uma linha de raciocínio coerente?
        *Precisão/Acurácia*: A resposta apresenta informações corretas e verificáveis comparadas com o baseline? Ela evita informações falsas ou enganosas?

        Resposta Base (Baseline): {baseline}
        Resposta do Modelo: {result}

        Atribua uma nota de 0 a 10 para a comparação entre as respostas (Baseline e Modelo), justificando brevemente sua avaliação. Retorne a avaliação no formato JSON, com as chaves "quality", "agreement", "accuracy" e "justification". Não penalize a avaliação em caso de repetições no texto.
        """
        
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": comparison_promt}],
            temperature=temperature
        )
        generated_text = response.choices[0].message.content
        
        # Load the JSON string
        metadata_str = generated_text.strip()
        if metadata_str.startswith("```json") and metadata_str.endswith("```"):
            metadata_str = metadata_str.strip("```").strip()
        if metadata_str.startswith("json"):
            metadata_str = metadata_str[4:].lstrip()
        try:
            data = json.loads(metadata_str)
            # Update DataFrame with the extracted values
            df.loc[index, 'quality'] = data.get('quality')
            df.loc[index, 'agreement'] = data.get('agreement')
            df.loc[index, 'accuracy'] = data.get('accuracy')
            df.loc[index, 'justification'] = data.get('justification')
            
        except json.JSONDecodeError as e:
            print(f"Erro ao decodificar JSON: {e}")
            print(f"String JSON com problema: {metadata_str}")
            # Optionally, store the error message in the DataFrame
            df.loc[index, 'quality'] = None
            df.loc[index, 'agreement'] = None
            df.loc[index, 'accuracy'] = None
            df.loc[index, 'justification'] = f"Erro de JSON: {e}"

    except Exception as e:
        print(f"Erro ao processar a linha {index}: {e}")
        df.loc[index, 'quality'] = None
        df.loc[index, 'agreement'] = None
        df.loc[index, 'accuracy'] = None
        df.loc[index, 'justification'] = f"Erro: {e}"
    
    progress_bar.value += 1
        
# Save the updated DataFrame
output_filename = f"final_evaluation.csv"
df.to_csv(output_filename, index=False)
print(f"Completed. Results saved in {output_filename}")

IntProgress(value=0, bar_style='info', max=3)

Converting list of DataFrames to a single DataFrame.
min. pause...
min. pause...
Completed. Results saved in final_evaluation.csv


In [5]:
df = pd.read_csv("final_evaluation.csv")

df_final = df[['model', 'quality', 'agreement', 'accuracy']].copy()

# Converter colunas para numérico, tratando erros com NaN
df_final.loc[:, 'quality'] = pd.to_numeric(df_final['quality'], errors='coerce')
df_final.loc[:, 'agreement'] = pd.to_numeric(df_final['agreement'], errors='coerce')
df_final.loc[:, 'accuracy'] = pd.to_numeric(df_final['accuracy'], errors='coerce')

# Criar a coluna 'evaluation' com a média das outras três
df_final['evaluation'] = df_final[['quality', 'agreement', 'accuracy']].mean(axis=1)

print(df_final)


                      model  quality  agreement  accuracy  evaluation
0    gpt-4o-mini-2024-07-18      2.0        2.0       1.0    1.666667
1    gpt-4o-mini-2024-07-18      4.0        3.0       2.0    3.000000
2    gpt-4o-mini-2024-07-18      2.0        2.0       1.0    1.666667
3    gpt-4o-mini-2024-07-18      3.0        2.0       1.0    2.000000
4    gpt-4o-mini-2024-07-18      2.0        2.0       1.0    1.666667
..                      ...      ...        ...       ...         ...
295          TeenyTinyLlama      2.0        1.0       1.0    1.333333
296          TeenyTinyLlama      3.0        2.0       2.0    2.333333
297          TeenyTinyLlama      2.0        1.0       0.0    1.000000
298          TeenyTinyLlama      3.0        2.0       1.0    2.000000
299          TeenyTinyLlama      3.0        2.0       2.0    2.333333

[300 rows x 5 columns]


In [None]:
from scipy import stats
from scikit_posthocs import posthoc_dunn

# H0: The medians of all groups are equal.
# H1: At least one group median is different from the others.


In [33]:
quality = df_final.groupby('model')['quality'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Renomear as colunas para melhor clareza
summary_quality = quality.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Imprimir o DataFrame de resumo
print("Summary Quality:")
display(summary_quality)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['quality'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='quality', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Quality:


Unnamed: 0_level_0,mean,std_err,min,max,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TeenyTinyLlama,2.57,0.057305,1.0,4.0,3.0
TeenyTinyLlama-160m-CEP-ft,2.37,0.067652,1.0,4.0,2.0
gpt-4o-mini-2024-07-18,2.49,0.079766,2.0,7.0,2.0



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(9.224071771680308), pvalue=np.float64(0.009931578152185849)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,0.009009,0.122882
TeenyTinyLlama-160m-CEP-ft,0.009009,1.0,1.0
gpt-4o-mini-2024-07-18,0.122882,1.0,1.0


In [34]:
agreement = df_final.groupby('model')['agreement'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Renomear as colunas para melhor clareza
summary_agreement = agreement.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Imprimir o DataFrame de resumo
print("Summary Agreement:")
display(summary_agreement)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['agreement'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='agreement', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Agreement:


Unnamed: 0_level_0,mean,std_err,min,max,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TeenyTinyLlama,1.84,0.069224,1.0,5.0,2.0
TeenyTinyLlama-160m-CEP-ft,1.95,0.090314,1.0,5.0,2.0
gpt-4o-mini-2024-07-18,2.13,0.081222,1.0,8.0,2.0



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(10.126937276647105), pvalue=np.float64(0.006323587191046993)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,1.0,0.007601
TeenyTinyLlama-160m-CEP-ft,1.0,1.0,0.051887
gpt-4o-mini-2024-07-18,0.007601,0.051887,1.0


In [37]:
accuracy = df_final.groupby('model')['accuracy'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Renomear as colunas para melhor clareza
summary_accuracy = accuracy.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Imprimir o DataFrame de resumo
print("Summary Accuracy:")
display(summary_accuracy)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['accuracy'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='accuracy', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Accuracy:


Unnamed: 0_level_0,mean,std_err,min,max,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TeenyTinyLlama,1.5,0.096922,0.0,5.0,1.0
TeenyTinyLlama-160m-CEP-ft,1.46,0.089239,1.0,5.0,1.0
gpt-4o-mini-2024-07-18,1.2,0.063564,1.0,6.0,1.0



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(9.60391995532927), pvalue=np.float64(0.008213632725633928)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,1.0,0.014719
TeenyTinyLlama-160m-CEP-ft,1.0,1.0,0.033977
gpt-4o-mini-2024-07-18,0.014719,0.033977,1.0


In [38]:
mean_evaluation = df_final.groupby('model')['evaluation'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Renomear as colunas para melhor clareza
summary_mean_evaluation = mean_evaluation.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Imprimir o DataFrame de resumo
print("Summary Mean Evaluation:")
display(summary_mean_evaluation)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['evaluation'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='evaluation', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary Mean Evaluation:


Unnamed: 0_level_0,mean,std_err,min,max,median
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TeenyTinyLlama,1.97,0.061972,1.0,4.0,2.0
TeenyTinyLlama-160m-CEP-ft,1.926667,0.070525,1.0,4.0,1.666667
gpt-4o-mini-2024-07-18,1.94,0.069531,1.333333,7.0,1.666667



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(2.93169492935915), pvalue=np.float64(0.230882243997579))
Kruskal-Wallis test is not significant, skipping Dunn's test.
