In [1]:
import os
import pandas as pd
import re

# Define the path to the results directory
results_dir = "results"

# Function to extract model name from filename
def extract_model_name(filename):
    match = re.search(r"experimental_design_results_(.*)\.csv", filename)
    if match:
        return match.group(1)
    else:
        return "Unknown Model"

# Function to read CSV files from a directory, add 'model' column, and select specific columns
def read_and_label_csvs(directory):
    dfs = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            filepath = os.path.join(directory, filename)
            try:
                df = pd.read_csv(filepath, encoding='utf-8')
            
                model_name = extract_model_name(filename)
                df['model'] = model_name  # Add model name as a column
                
                # Select specific columns
                df = df[['model', 'baseline', 'results']]
                
                dfs.append(df)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    return dfs

# Read and label CSV files from the results directory
df = read_and_label_csvs(results_dir)
print(df)

[                     model                                           baseline  \
0   gpt-4o-mini-2024-07-18  o CEP do endereço QNM 07 CONJUNTO H LOTE 17 S/...   
1   gpt-4o-mini-2024-07-18  o CEP do endereço QI 13 LOTE 01 A 14 ALA 01 BO...   
2   gpt-4o-mini-2024-07-18  o CEP do endereço SHC/ SW CLSW 103 BL A S/N LO...   
3   gpt-4o-mini-2024-07-18  o CEP do endereço 06 CONJUNTO 05/06 LOTE 3-A S...   
4   gpt-4o-mini-2024-07-18  o CEP do endereço SHCS CR QD 516, BLOCO B 69 1...   
..                     ...                                                ...   
95  gpt-4o-mini-2024-07-18  o CEP do endereço QD 104 CJ 09 LT 02 RECANTO D...   
96  gpt-4o-mini-2024-07-18  o CEP do endereço QNB 13 LOTE 37 CASA 01 TAGUA...   
97  gpt-4o-mini-2024-07-18  o CEP do endereço CRS 502 BL C LJ 37 PARTE 677...   
98  gpt-4o-mini-2024-07-18  o CEP do endereço QUADRA 1 COMERCIO LOCAL S/N ...   
99  gpt-4o-mini-2024-07-18  o CEP do endereço QS 9 RUA 122 LOTE 08 LOJA 03...   

                          

In [2]:
import os
from dotenv import load_dotenv
import openai
from tqdm import tqdm
import time

# Specify the path to your .env file
dotenv_path = "/mnt/4d4f90e5-f220-481e-8701-f0a546491c35/arquivos/projetos/.env"

# Load the .env file
load_dotenv(dotenv_path=dotenv_path)

# Access and store the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")
model = 'gpt-4o-mini-2024-07-18'

In [6]:
import json
from ipywidgets import IntProgress
from IPython.display import display
import pandas as pd

# Initialize progress bar
progress_bar = IntProgress(min=0, max=len(df), desc='Avaliando', bar_style='info')
display(progress_bar)

# Ensure df is a pandas DataFrame
if isinstance(df, list):
    print("Converting list of DataFrames to a single DataFrame.")
    df = pd.concat(df, ignore_index=True)

# Iterate over each row and make API call
for index, row in df.iterrows():
    if index % 100 == 0 and index != 0:
        print("min. pause...")
        time.sleep(60)
    baseline = row['baseline']
    result = row['results']
    temperature = 0.0
    comparison_promt = f"""Avalie as respostas abaixo de 0 a 10, considerando os seguintes critérios:

        *Qualidade da Resposta*: A resposta é clara, concisa e bem estruturada? Ela aborda diretamente a questão ou tarefa proposta?
        *Concordância*: A resposta faz sentido no contexto da pergunta ou instrução? Ela segue uma linha de raciocínio coerente?
        *Precisão/Acurácia*: A resposta apresenta informações corretas e verificáveis comparadas com o baseline? Ela evita informações falsas ou enganosas?

        Resposta Base (Baseline): {baseline}
        Resposta do Modelo: {result}

        Atribua uma nota de 0 a 10 para a comparação entre as respostas (Baseline e Modelo), justificando brevemente sua avaliação. Retorne a avaliação no formato JSON, com as chaves "quality", "agreement", "accuracy" e "justification".
        """
        
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": comparison_promt}],
            temperature=temperature
        )
        generated_text = response.choices[0].message.content
        
        # Load the JSON string
        metadata_str = generated_text.strip()
        if metadata_str.startswith("```json") and metadata_str.endswith("```"):
            metadata_str = metadata_str.strip("```").strip()
        if metadata_str.startswith("json"):
            metadata_str = metadata_str[4:].lstrip()
        try:
            data = json.loads(metadata_str)
            # Update DataFrame with the extracted values
            df.loc[index, 'quality'] = data.get('quality')
            df.loc[index, 'agreement'] = data.get('agreement')
            df.loc[index, 'accuracy'] = data.get('accuracy')
            df.loc[index, 'justification'] = data.get('justification')
            
        except json.JSONDecodeError as e:
            print(f"Erro ao decodificar JSON: {e}")
            print(f"String JSON com problema: {metadata_str}")
            # Optionally, store the error message in the DataFrame
            df.loc[index, 'quality'] = None
            df.loc[index, 'agreement'] = None
            df.loc[index, 'accuracy'] = None
            df.loc[index, 'justification'] = f"Erro de JSON: {e}"

    except Exception as e:
        print(f"Erro ao processar a linha {index}: {e}")
        df.loc[index, 'quality'] = None
        df.loc[index, 'agreement'] = None
        df.loc[index, 'accuracy'] = None
        df.loc[index, 'justification'] = f"Erro: {e}"
    
    progress_bar.value += 1
        
# Save the updated DataFrame
output_filename = f"final_evaluation.csv"
df.to_csv(output_filename, index=False)
print(f"Completed. Results saved in {output_filename}")

IntProgress(value=0, bar_style='info', max=300)

min. pause...
min. pause...
Completed. Results saved in final_evaluation.csv


In [23]:
import pandas as pd
df = pd.read_csv("final_evaluation.csv")

df_final = df[['model', 'quality', 'agreement', 'accuracy']].copy()

# Converter colunas para numérico, tratando erros com NaN
df_final.loc[:, 'quality'] = pd.to_numeric(df_final['quality'], errors='coerce')
df_final.loc[:, 'agreement'] = pd.to_numeric(df_final['agreement'], errors='coerce')
df_final.loc[:, 'accuracy'] = pd.to_numeric(df_final['accuracy'], errors='coerce')

# Criar a coluna 'evaluation' com a média das outras três
df_final['evaluation'] = df_final[['quality', 'agreement', 'accuracy']].mean(axis=1)

print(df_final)


                      model  quality  agreement  accuracy  evaluation
0    gpt-4o-mini-2024-07-18      2.0        3.0       1.0    2.000000
1    gpt-4o-mini-2024-07-18      3.0        2.0       2.0    2.333333
2    gpt-4o-mini-2024-07-18      2.0        2.0       1.0    1.666667
3    gpt-4o-mini-2024-07-18      2.0        2.0       1.0    1.666667
4    gpt-4o-mini-2024-07-18      2.0        2.0       1.0    1.666667
..                      ...      ...        ...       ...         ...
295          TeenyTinyLlama      1.0        1.0       1.0    1.000000
296          TeenyTinyLlama      3.0        2.0       2.0    2.333333
297          TeenyTinyLlama      2.0        1.0       1.0    1.333333
298          TeenyTinyLlama      3.0        2.0       1.0    2.000000
299          TeenyTinyLlama      3.0        2.0       2.0    2.333333

[300 rows x 5 columns]


In [30]:
quality = df_final.groupby('model')['quality'].agg(['mean', 'sem', 'min', 'max'])

# Renomear as colunas para melhor clareza
summary_quality = quality.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max'
})

# Imprimir o DataFrame de resumo
print(summary_quality)

                            mean   std_err  min  max
model                                               
TeenyTinyLlama              2.37  0.056237  1.0  3.0
TeenyTinyLlama-160m-CEP-ft  2.29  0.065590  1.0  4.0
gpt-4o-mini-2024-07-18      2.41  0.075338  2.0  7.0


In [32]:
agreement = df_final.groupby('model')['agreement'].agg(['mean', 'sem', 'min', 'max'])

# Renomear as colunas para melhor clareza
summary_agreement = agreement.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max'
})

# Imprimir o DataFrame de resumo
print(summary_agreement)

                            mean   std_err  min  max
model                                               
TeenyTinyLlama              1.75  0.065713  1.0  4.0
TeenyTinyLlama-160m-CEP-ft  1.83  0.092174  1.0  5.0
gpt-4o-mini-2024-07-18      2.01  0.067412  1.0  6.0


In [33]:
accuracy = df_final.groupby('model')['accuracy'].agg(['mean', 'sem', 'min', 'max'])

# Renomear as colunas para melhor clareza
summary_accuracy = accuracy.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max'
})

# Imprimir o DataFrame de resumo
print(summary_accuracy)

                            mean   std_err  min  max
model                                               
TeenyTinyLlama              1.33  0.066750  0.0  4.0
TeenyTinyLlama-160m-CEP-ft  1.38  0.077564  1.0  5.0
gpt-4o-mini-2024-07-18      1.17  0.068246  1.0  7.0


In [34]:
mean_evaluation = df_final.groupby('model')['evaluation'].agg(['mean', 'sem', 'min', 'max'])

# Renomear as colunas para melhor clareza
summary_mean_evaluation = mean_evaluation.rename(columns={
    'mean': 'mean',
    'sem': 'std_err',
    'min': 'min',
    'max': 'max'
})

# Imprimir o DataFrame de resumo
print(summary_mean_evaluation)

                                mean   std_err       min       max
model                                                             
TeenyTinyLlama              1.816667  0.049549  1.000000  3.000000
TeenyTinyLlama-160m-CEP-ft  1.833333  0.068247  1.000000  4.000000
gpt-4o-mini-2024-07-18      1.863333  0.063404  1.333333  6.666667
