In [11]:
import os
import pandas as pd
import re
from dotenv import load_dotenv
import openai
from tqdm import tqdm
import time
import json
from ipywidgets import IntProgress
from IPython.display import display
from scipy import stats
from scikit_posthocs import posthoc_dunn
import numpy as np


In [12]:
# Define the path to the results directory
results_dir = "results"

# Function to extract model name from filename
def extract_model_name(filename):
    match = re.search(r"experimental_design_results_(.*)\.csv", filename)
    if match:
        return match.group(1)
    else:
        return "Unknown Model"

# Function to read CSV files from a directory, add 'model' column, and select specific columns
def read_and_label_csvs(directory):
    dfs = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            try:
                df = pd.read_csv(filepath, encoding='utf-8')
            
                model_name = extract_model_name(filename)
                df['model'] = model_name  # Add model name as a column
                
                # Select specific columns
                df = df[['model', 'baseline', 'results']]
                
                dfs.append(df)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    return dfs

# Read and label CSV files from the results directory
df = read_and_label_csvs(results_dir)

In [13]:
# Specify the path to your .env file
dotenv_path = "/mnt/4d4f90e5-f220-481e-8701-f0a546491c35/arquivos/projetos/.env"

# Load the .env file
load_dotenv(dotenv_path=dotenv_path)

# Access and store the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")
model = 'gpt-4o-mini-2024-07-18'

In [14]:
# Initialize progress bar
progress_bar = IntProgress(min=0, max=len(df), description='Evaluating', bar_style='info')
display(progress_bar)

# Ensure df is a DataFrame
if isinstance(df, list):
    print("Converting list of DataFrames into a single DataFrame.")
    df = pd.concat(df, ignore_index=True)

# Iterate over each row of the DataFrame
for index, row in df.iterrows():
    if index % 150 == 0 and index != 0:
        print("min. pause...")
        time.sleep(60)
    
    baseline = row['baseline']
    result = row['results']
    temperature = 0.0
    
    comparison_promt = f"""
        Atribua uma nota de 0 a 10 para resposta 'Modelo' (a ser avaliada) em comparação com a resposta 'Baseline' (resposta correta). 
        Respostas do modelo que indicam falta de acesso a informações específicas e recomendam consultar fontes externas (ex: "Desculpe, mas não tenho acesso a informações específicas como CEPs...") devem ser penalizada. 
        Não penalize a avaliação em caso de repetições no texto. Retorne a avaliação no formato JSON, com a chave "evaluation" e "justification".

        Resposta Base (Baseline): {baseline}
        Resposta do Modelo: {result}
    """
    
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": comparison_promt}],
            temperature=temperature
        )
        generated_text = response.choices[0].message.content
        
        # Process the JSON response
        metadata_str = generated_text.strip()
        if metadata_str.startswith("```json") and metadata_str.endswith("```"):
            metadata_str = metadata_str.strip("```json").strip()
        
        try:
            data = json.loads(metadata_str)
            # Update DataFrame with the extracted values
            df.loc[index, 'evaluation'] = data.get('evaluation')
            df.loc[index, 'justification'] = data.get('justification')
        
        except json.JSONDecodeError as e:
            # Print error message if JSON decoding fails
            print(f"Error decoding JSON: {e}")
            print(f"Problematic JSON string: {metadata_str}")
            df.loc[index, 'evaluation'] = None
            df.loc[index, 'justification'] = f"JSON Error: {e}"

    except Exception as e:
        # Print error message if there's an issue processing the line
        print(f"Error processing line {index}: {e}")
        df.loc[index, 'evaluation'] = None
        df.loc[index, 'justification'] = f"Error: {e}"
    
    progress_bar.value += 1
        
# Save the updated DataFrame
output_filename = "final_evaluation_simple.csv"
df.to_csv(output_filename, index=False)
print(f"Completed. Results saved in {output_filename}")

IntProgress(value=0, bar_style='info', description='Evaluating', max=3)

Converting list of DataFrames into a single DataFrame.
min. pause...
Completed. Results saved in final_evaluation_simple.csv


In [15]:
df = pd.read_csv("final_evaluation_simple.csv")
df_final = df[['model', 'evaluation']].copy()

print(df_final)


                      model  evaluation
0    gpt-4o-mini-2024-07-18         2.0
1    gpt-4o-mini-2024-07-18         2.0
2    gpt-4o-mini-2024-07-18         2.0
3    gpt-4o-mini-2024-07-18         2.0
4    gpt-4o-mini-2024-07-18         2.0
..                      ...         ...
295          TeenyTinyLlama         0.0
296          TeenyTinyLlama         1.0
297          TeenyTinyLlama         0.0
298          TeenyTinyLlama         0.0
299          TeenyTinyLlama         1.0

[300 rows x 2 columns]


In [16]:
# H0: The medians of all groups are equal.
# H1: At least one group median is different from the others.


In [17]:
def bootstrap_mean_ci(data, n_iterations=1000, confidence_level=0.95):
    """Calculates the bootstrap confidence interval for the mean."""
    means = []
    for _ in range(n_iterations):
        sample = np.random.choice(data, size=len(data), replace=True)
        means.append(np.mean(sample))
    
    alpha = (1 - confidence_level) / 2
    lower_percentile = alpha * 100
    upper_percentile = (1 - alpha) * 100
    
    lower_bound = np.percentile(means, lower_percentile)
    upper_bound = np.percentile(means, upper_percentile)
    
    return lower_bound, upper_bound


In [18]:
evaluation = df_final.groupby('model')['evaluation'].agg(['mean', 'sem', 'min', 'max', 'median'])

# Calculate bootstrap confidence intervals
evaluation['ci_lower'] = df_final.groupby('model')['evaluation'].apply(lambda x: bootstrap_mean_ci(x.dropna())[0])
evaluation['ci_upper'] = df_final.groupby('model')['evaluation'].apply(lambda x: bootstrap_mean_ci(x.dropna())[1])


# Rename columns for better clarity
summary_evaluation = evaluation.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max',
    'median': 'median'
})

# Print the summary DataFrame
print("Summary evaluation:")
display(summary_evaluation)

# Kruskal-Wallis Test
kruskal_result = stats.kruskal(*[group['evaluation'].dropna().values for name, group in df_final.groupby('model')])
significance = " (*)" if kruskal_result.pvalue < 0.05 else ""
print(f"\nKruskal-Wallis Test: {kruskal_result}{significance}")

# Dunn's Test (post-hoc)
if kruskal_result.pvalue < 0.05:
    dunn_result = posthoc_dunn(df_final, val_col='evaluation', group_col='model', p_adjust='bonferroni')
    print("\nDunn's Test (Bonferroni correction):")
    display(dunn_result)
else:
    print("Kruskal-Wallis test is not significant, skipping Dunn's test.")

Summary evaluation:


Unnamed: 0_level_0,mean,std_err,min,max,median,ci_lower,ci_upper
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
TeenyTinyLlama,0.39,0.063397,0.0,2.0,0.0,0.27,0.51025
TeenyTinyLlama-160m-CEP-ft,1.9,0.06742,0.0,3.0,2.0,1.77,2.02
gpt-4o-mini-2024-07-18,1.92,0.039389,0.0,2.0,2.0,1.84,1.98



Kruskal-Wallis Test: KruskalResult(statistic=np.float64(187.76073097873862), pvalue=np.float64(1.6915128793802754e-41)) (*)

Dunn's Test (Bonferroni correction):


Unnamed: 0,TeenyTinyLlama,TeenyTinyLlama-160m-CEP-ft,gpt-4o-mini-2024-07-18
TeenyTinyLlama,1.0,8.861055e-32,3.16198e-32
TeenyTinyLlama-160m-CEP-ft,8.861055e-32,1.0,1.0
gpt-4o-mini-2024-07-18,3.16198e-32,1.0,1.0
