In [1]:
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import pyoverleaf as po
import os
import numpy as np
import scipy.stats as ss
from statsmodels.sandbox.stats.multicomp import multipletests
import statsmodels.api as sm
import statsmodels.formula.api as smf
from joblib import Parallel, delayed
import warnings
from tqdm.notebook import tqdm


In [2]:
df = pd.read_csv('model scoring.csv')

print("\nIdentifying rows with NA in specified columns:")
cols_to_check = ['quality', 'agreement', 'accuracy', 'hallucination']
all_na_rows = pd.DataFrame()

for col in cols_to_check:
    na_rows = df[df[col].isna()]
    if not na_rows.empty:
        print(f"Column {col}: Found {len(na_rows)} NA values.")
        all_na_rows = pd.concat([all_na_rows, na_rows])
    else:
        print(f"Column {col}: No NA values found.")

if not all_na_rows.empty:
    print("Saving all rows with NA values to 'all_na_rows.csv'.")
    all_na_rows.to_csv('all_na_rows.csv', index=True)
else:
    print("No NA values found in any of the specified columns.")

df[cols_to_check] = df[cols_to_check].fillna(0)
df = df[df['model'] != 'o1-mini-2024-09-12']



Identifying rows with NA in specified columns:
Column quality: Found 21 NA values.
Column agreement: Found 21 NA values.
Column accuracy: Found 21 NA values.
Column hallucination: Found 21 NA values.
Saving all rows with NA values to 'all_na_rows.csv'.


In [3]:
display(df.head())

Unnamed: 0,model,attempt,temperature,top_p,rag_type,baseline,results,comparison_prompt,quality,agreement,accuracy,hallucination,justification
0,gpt-4o-mini-2024-07-18,1,0.1,0.1,2-steps RAG,A descrição correta de acordo com a Nomenclatu...,A descrição oficial do código NCM para a merca...,"Avalie as respostas abaixo de 0 a 10, em que 0...",6.0,5.0,4.0,7.0,A resposta do modelo apresenta uma descrição q...
1,gpt-4o-mini-2024-07-18,1,0.1,0.5,2-steps RAG,A descrição correta de acordo com a Nomenclatu...,A descrição oficial do código NCM para a merca...,"Avalie as respostas abaixo de 0 a 10, em que 0...",6.0,5.0,4.0,7.0,A resposta do modelo apresenta uma descrição q...
2,gpt-4o-mini-2024-07-18,1,0.1,0.9,2-steps RAG,A descrição correta de acordo com a Nomenclatu...,A descrição oficial do código NCM para a merca...,"Avalie as respostas abaixo de 0 a 10, em que 0...",6.0,5.0,4.0,3.0,A resposta do modelo apresenta uma descrição q...
3,gpt-4o-mini-2024-07-18,1,1.0,0.1,2-steps RAG,A descrição correta de acordo com a Nomenclatu...,A descrição oficial do código NCM para a merca...,"Avalie as respostas abaixo de 0 a 10, em que 0...",6.0,5.0,4.0,7.0,A resposta do modelo apresenta uma descrição q...
4,gpt-4o-mini-2024-07-18,1,1.0,0.5,2-steps RAG,A descrição correta de acordo com a Nomenclatu...,A descrição oficial do código NCM para a merca...,"Avalie as respostas abaixo de 0 a 10, em que 0...",6.0,5.0,4.0,7.0,A resposta do modelo apresenta uma descrição q...


In [4]:
print("Descriptive Statistics:")
display(df[['quality', 'agreement', 'accuracy', 'hallucination']].describe())

print("\nGrouped by model:")
model_grouped_quality = df.groupby('model')[['quality']].describe()
display(model_grouped_quality)
model_grouped_agreement = df.groupby('model')[['agreement']].describe()
display(model_grouped_agreement)
model_grouped_accuracy = df.groupby('model')[['accuracy']].describe()
display(model_grouped_accuracy)
model_grouped_hallucination = df.groupby('model')[['hallucination']].describe()
display(model_grouped_hallucination)

print("\nGrouped by temperature:")
temperature_grouped_quality = df.groupby('temperature')[['quality']].describe()
display(temperature_grouped_quality)
temperature_grouped_agreement = df.groupby('temperature')[['agreement']].describe()
display(temperature_grouped_agreement)
temperature_grouped_accuracy = df.groupby('temperature')[['accuracy']].describe()
display(temperature_grouped_accuracy)
temperature_grouped_hallucination = df.groupby('temperature')[['hallucination']].describe()
display(temperature_grouped_hallucination)

print("\nGrouped by top_p:")
top_p_grouped_quality = df.groupby('top_p')[['quality']].describe()
display(top_p_grouped_quality)
top_p_grouped_agreement = df.groupby('top_p')[['agreement']].describe()
display(top_p_grouped_agreement)
top_p_grouped_accuracy = df.groupby('top_p')[['accuracy']].describe()
display(top_p_grouped_accuracy)
top_p_grouped_hallucination = df.groupby('top_p')[['hallucination']].describe()
display(top_p_grouped_hallucination)

print("\nGrouped by rag_type:")
rag_type_grouped_quality = df.groupby('rag_type')[['quality']].describe()
display(rag_type_grouped_quality)
rag_type_grouped_agreement = df.groupby('rag_type')[['agreement']].describe()
display(rag_type_grouped_agreement)
rag_type_grouped_accuracy = df.groupby('rag_type')[['accuracy']].describe()
display(rag_type_grouped_accuracy)
rag_type_grouped_hallucination = df.groupby('rag_type')[['hallucination']].describe()
display(rag_type_grouped_hallucination)

print("\nGrouped by rag_type + model:")
df['rag_type_model'] = df['rag_type'] + '_' + df['model']
rag_type_model_grouped_quality = df.groupby('rag_type_model')[['quality']].describe()
display(rag_type_model_grouped_quality)
rag_type_model_grouped_agreement = df.groupby('rag_type_model')[['agreement']].describe()
display(rag_type_model_grouped_agreement)
rag_type_model_grouped_accuracy = df.groupby('rag_type_model')[['accuracy']].describe()
display(rag_type_model_grouped_accuracy)
rag_type_model_grouped_hallucination = df.groupby('rag_type_model')[['hallucination']].describe()
display(rag_type_model_grouped_hallucination)


Descriptive Statistics:


Unnamed: 0,quality,agreement,accuracy,hallucination
count,17640.0,17640.0,17640.0,17640.0
mean,4.011451,3.98458,3.286054,3.572336
std,2.54359,3.006482,2.805166,3.44947
min,0.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,0.0
50%,3.0,2.0,2.0,3.0
75%,6.0,6.0,5.0,6.0
max,10.0,10.0,10.0,10.0



Grouped by model:


Unnamed: 0_level_0,quality,quality,quality,quality,quality,quality,quality,quality
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Mistral-7B-Instruct-v0.3,3528.0,2.336168,2.595022,0.0,0.0,2.0,4.0,9.0
TeenyTinyLlama-160m-NCM-ft,3528.0,3.726474,2.187876,0.0,2.0,3.0,6.0,8.0
deepseek-chat,3528.0,5.095238,2.37318,0.0,3.0,4.0,7.0,10.0
gemini-2.0-flash,3528.0,4.316893,2.278069,2.0,2.0,3.0,6.0,10.0
gpt-4o-mini-2024-07-18,3528.0,4.582483,2.351817,2.0,3.0,4.0,7.0,10.0


Unnamed: 0_level_0,agreement,agreement,agreement,agreement,agreement,agreement,agreement,agreement
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Mistral-7B-Instruct-v0.3,3528.0,2.335884,2.870762,0.0,0.0,2.0,3.0,10.0
TeenyTinyLlama-160m-NCM-ft,3528.0,3.694444,2.398174,0.0,1.0,4.0,5.0,9.0
deepseek-chat,3528.0,4.939626,3.190224,0.0,2.0,3.0,8.0,10.0
gemini-2.0-flash,3528.0,4.467687,2.832149,1.0,2.0,3.0,7.0,10.0
gpt-4o-mini-2024-07-18,3528.0,4.485261,2.96885,1.0,2.0,3.0,7.0,10.0


Unnamed: 0_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Mistral-7B-Instruct-v0.3,3528.0,1.910147,2.565309,0.0,0.0,1.0,2.0,10.0
TeenyTinyLlama-160m-NCM-ft,3528.0,2.765873,1.887434,0.0,1.0,2.0,4.0,8.0
deepseek-chat,3528.0,4.414966,3.086803,0.0,2.0,2.0,8.0,10.0
gemini-2.0-flash,3528.0,3.584184,2.799864,1.0,1.0,2.0,5.0,10.0
gpt-4o-mini-2024-07-18,3528.0,3.755102,2.845139,1.0,2.0,2.0,6.0,10.0


Unnamed: 0_level_0,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Mistral-7B-Instruct-v0.3,3528.0,5.5,4.166713,0.0,1.0,5.0,10.0,10.0
TeenyTinyLlama-160m-NCM-ft,3528.0,5.356859,3.09429,0.0,3.0,5.0,9.0,10.0
deepseek-chat,3528.0,2.331066,2.703017,0.0,0.0,1.0,4.0,10.0
gemini-2.0-flash,3528.0,2.308673,2.527695,0.0,0.0,2.0,4.0,10.0
gpt-4o-mini-2024-07-18,3528.0,2.365079,2.716121,0.0,0.0,2.0,4.0,10.0



Grouped by temperature:


Unnamed: 0_level_0,quality,quality,quality,quality,quality,quality,quality,quality
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
temperature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,4.127041,2.619815,0.0,2.0,4.0,6.0,10.0
1.0,5880.0,4.009354,2.518666,0.0,2.0,3.0,6.0,10.0
1.9,5880.0,3.897959,2.485637,0.0,2.0,3.0,6.0,10.0


Unnamed: 0_level_0,agreement,agreement,agreement,agreement,agreement,agreement,agreement,agreement
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
temperature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,4.130612,3.050677,0.0,2.0,4.0,7.0,10.0
1.0,5880.0,3.971769,2.991037,0.0,2.0,2.0,6.0,10.0
1.9,5880.0,3.851361,2.971146,0.0,2.0,2.0,6.0,10.0


Unnamed: 0_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
temperature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,3.382313,2.825676,0.0,1.0,2.0,5.0,10.0
1.0,5880.0,3.278571,2.803228,0.0,1.0,2.0,5.0,10.0
1.9,5880.0,3.197279,2.783849,0.0,1.0,2.0,5.0,10.0


Unnamed: 0_level_0,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
temperature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,3.504762,3.376773,0.0,0.0,3.0,5.0,10.0
1.0,5880.0,3.484014,3.425153,0.0,0.0,3.0,6.0,10.0
1.9,5880.0,3.728231,3.539722,0.0,0.0,3.0,6.0,10.0



Grouped by top_p:


Unnamed: 0_level_0,quality,quality,quality,quality,quality,quality,quality,quality
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
top_p,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,4.126361,2.621341,0.0,2.0,3.0,6.0,10.0
0.5,5880.0,3.972959,2.523464,0.0,2.0,3.0,6.0,10.0
0.9,5880.0,3.935034,2.480308,0.0,2.0,3.0,6.0,10.0


Unnamed: 0_level_0,agreement,agreement,agreement,agreement,agreement,agreement,agreement,agreement
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
top_p,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,4.130272,3.052253,0.0,2.0,4.0,7.0,10.0
0.5,5880.0,3.936395,3.000261,0.0,2.0,2.0,6.0,10.0
0.9,5880.0,3.887075,2.961252,0.0,2.0,2.0,6.0,10.0


Unnamed: 0_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
top_p,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,3.386905,2.833498,0.0,1.0,2.0,5.0,10.0
0.5,5880.0,3.248639,2.801917,0.0,1.0,2.0,5.0,10.0
0.9,5880.0,3.222619,2.7775,0.0,1.0,2.0,5.0,10.0


Unnamed: 0_level_0,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
top_p,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,5880.0,3.510544,3.382598,0.0,0.0,3.0,5.0,10.0
0.5,5880.0,3.539796,3.474611,0.0,0.0,3.0,6.0,10.0
0.9,5880.0,3.666667,3.488827,0.0,0.0,3.0,6.0,10.0



Grouped by rag_type:


Unnamed: 0_level_0,quality,quality,quality,quality,quality,quality,quality,quality
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG,8820.0,5.295011,2.590605,0.0,3.0,6.0,7.0,10.0
Common RAG,8820.0,2.727891,1.712826,0.0,2.0,3.0,3.0,9.0


Unnamed: 0_level_0,agreement,agreement,agreement,agreement,agreement,agreement,agreement,agreement
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG,8820.0,5.562472,3.086253,0.0,3.0,6.0,8.0,10.0
Common RAG,8820.0,2.406689,1.890476,0.0,2.0,2.0,2.0,10.0


Unnamed: 0_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG,8820.0,4.70034,3.047959,0.0,2.0,4.0,7.0,10.0
Common RAG,8820.0,1.871769,1.564574,0.0,1.0,2.0,2.0,10.0


Unnamed: 0_level_0,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG,8820.0,3.932426,3.255178,0.0,1.0,4.0,6.0,10.0
Common RAG,8820.0,3.212245,3.597707,0.0,0.0,2.0,5.0,10.0



Grouped by rag_type + model:


Unnamed: 0_level_0,quality,quality,quality,quality,quality,quality,quality,quality
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG_Mistral-7B-Instruct-v0.3,1764.0,3.64229,2.811257,0.0,2.0,3.0,6.0,9.0
2-steps RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,3.77381,2.20847,0.0,2.0,4.0,6.0,8.0
2-steps RAG_deepseek-chat,1764.0,6.75907,2.0684,0.0,5.0,7.0,8.0,10.0
2-steps RAG_gemini-2.0-flash,1764.0,5.945011,2.014365,2.0,5.0,6.0,8.0,10.0
2-steps RAG_gpt-4o-mini-2024-07-18,1764.0,6.354875,1.927478,2.0,5.0,7.0,8.0,10.0
Common RAG_Mistral-7B-Instruct-v0.3,1764.0,1.030045,1.468019,0.0,0.0,0.0,2.0,8.0
Common RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,3.679138,2.166678,0.0,2.0,3.0,6.0,8.0
Common RAG_deepseek-chat,1764.0,3.431406,1.203771,2.0,3.0,3.0,3.0,9.0
Common RAG_gemini-2.0-flash,1764.0,2.688776,1.009916,2.0,2.0,2.0,3.0,7.0
Common RAG_gpt-4o-mini-2024-07-18,1764.0,2.810091,1.03138,2.0,2.0,3.0,3.0,8.0


Unnamed: 0_level_0,agreement,agreement,agreement,agreement,agreement,agreement,agreement,agreement
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG_Mistral-7B-Instruct-v0.3,1764.0,3.725057,3.233901,0.0,1.0,2.0,7.0,10.0
2-steps RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,3.741497,2.403925,0.0,1.0,4.0,5.0,9.0
2-steps RAG_deepseek-chat,1764.0,7.199546,2.689843,0.0,5.0,8.0,9.0,10.0
2-steps RAG_gemini-2.0-flash,1764.0,6.481293,2.507661,1.0,4.0,7.0,9.0,10.0
2-steps RAG_gpt-4o-mini-2024-07-18,1764.0,6.664966,2.539199,1.0,5.0,7.0,9.0,10.0
Common RAG_Mistral-7B-Instruct-v0.3,1764.0,0.946712,1.472183,0.0,0.0,0.0,2.0,9.0
Common RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,3.647392,2.392164,0.0,1.0,4.0,5.0,9.0
Common RAG_deepseek-chat,1764.0,2.679705,1.704492,1.0,2.0,2.0,2.0,10.0
Common RAG_gemini-2.0-flash,1764.0,2.454082,1.282387,1.0,2.0,2.0,2.0,8.0
Common RAG_gpt-4o-mini-2024-07-18,1764.0,2.305556,1.295375,1.0,2.0,2.0,2.0,9.0


Unnamed: 0_level_0,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy,accuracy
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG_Mistral-7B-Instruct-v0.3,1764.0,3.129252,2.985741,0.0,1.0,2.0,5.0,10.0
2-steps RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,2.797052,1.899838,0.0,1.0,3.0,4.0,8.0
2-steps RAG_deepseek-chat,1764.0,6.453515,2.90453,0.0,4.0,7.0,9.0,10.0
2-steps RAG_gemini-2.0-flash,1764.0,5.437075,2.706688,1.0,3.0,5.0,8.0,10.0
2-steps RAG_gpt-4o-mini-2024-07-18,1764.0,5.684807,2.739053,1.0,3.0,5.0,9.0,10.0
Common RAG_Mistral-7B-Instruct-v0.3,1764.0,0.691043,1.12986,0.0,0.0,0.0,1.0,9.0
Common RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,2.734694,1.874968,0.0,1.0,2.0,4.0,8.0
Common RAG_deepseek-chat,1764.0,2.376417,1.519781,1.0,2.0,2.0,2.0,10.0
Common RAG_gemini-2.0-flash,1764.0,1.731293,1.219206,1.0,1.0,1.0,2.0,9.0
Common RAG_gpt-4o-mini-2024-07-18,1764.0,1.825397,1.113579,1.0,1.0,2.0,2.0,9.0


Unnamed: 0_level_0,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination,hallucination
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rag_type_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2-steps RAG_Mistral-7B-Instruct-v0.3,1764.0,4.444444,3.749045,0.0,1.0,4.0,8.0,10.0
2-steps RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,5.381519,3.084574,0.0,3.0,5.0,9.0,10.0
2-steps RAG_deepseek-chat,1764.0,3.010771,3.201035,0.0,0.0,2.0,4.0,10.0
2-steps RAG_gemini-2.0-flash,1764.0,3.330499,2.670432,0.0,0.0,4.0,5.0,10.0
2-steps RAG_gpt-4o-mini-2024-07-18,1764.0,3.494898,2.879205,0.0,0.0,4.0,6.0,10.0
Common RAG_Mistral-7B-Instruct-v0.3,1764.0,6.555556,4.295096,0.0,1.0,10.0,10.0,10.0
Common RAG_TeenyTinyLlama-160m-NCM-ft,1764.0,5.3322,3.104653,0.0,3.0,5.0,9.0,10.0
Common RAG_deepseek-chat,1764.0,1.651361,1.856232,0.0,0.0,1.0,4.0,10.0
Common RAG_gemini-2.0-flash,1764.0,1.286848,1.887183,0.0,0.0,0.0,3.0,10.0
Common RAG_gpt-4o-mini-2024-07-18,1764.0,1.235261,1.978524,0.0,0.0,0.0,3.0,10.0


In [5]:
# Define the columns for analysis
columns = ['quality', 'agreement', 'accuracy', 'hallucination']
group_columns = ['model', 'temperature', 'top_p', 'rag_type', 'rag_type_model']

# Generate and save grouped box plots with grid layout (2x2)
for group_col in group_columns:
    if group_col == 'rag_type_model':
        fig, axes = plt.subplots(2, 2, figsize=(20, 18))  # Adjust figsize for better visualization
        title_fontsize = 22
        xlabel_fontsize = 18
        ylabel_fontsize = 18
        tick_fontsize = 16
    else:
        fig, axes = plt.subplots(2, 2, figsize=(18, 10))  # Adjust figsize for better visualization
        title_fontsize = 18
        xlabel_fontsize = 14
        ylabel_fontsize = 14
        tick_fontsize = 12

    fig.suptitle(f'Distribution of Quality Metrics grouped by RAG type', fontsize=title_fontsize + 2)

    for i, col in enumerate(columns):
        row = i // 2
        col_idx = i % 2
        sns.boxplot(data=df, x=group_col, y=col, ax=axes[row, col_idx], hue=group_col, palette="viridis", legend=False)  # Use a better palette, show legend only for the first subplot
        #axes[row, col_idx].set_xlabel(group_col, fontsize=xlabel_fontsize)  # Increase x label size
        axes[row, col_idx].set_xlabel('', fontsize=xlabel_fontsize)  # Remove x label
        axes[row, col_idx].set_ylabel(col, fontsize=ylabel_fontsize)  # Increase y label size
        axes[row, col_idx].tick_params(axis='x', rotation=45, labelsize=tick_fontsize)  # Rotate x ticks, adjust size
        axes[row, col_idx].tick_params(axis='y', labelsize=tick_fontsize)  # adjust y ticks size
        plt.setp(axes[row, col_idx].get_xticklabels(), ha="right")
        axes[row, col_idx].grid(axis='y', linestyle='--')  # Add grid lines for better readability
        axes[row, col_idx].set_title(f'Distribution of {col}', fontsize=title_fontsize - 4)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout to make room for the suptitle

    # Save the figure to the 'figures' directory with 400 dpi
    filename = f'figures/{group_col}_boxplots.png'
    plt.savefig(filename, dpi=400)
    plt.close(fig) # close figure to prevent display

# Create a graphical abstract (example using the first boxplot)
# This is a placeholder; replace with your actual graphical abstract creation code
graphical_abstract_group_col = group_columns[0]  # Example: use the first group
fig_graphical_abstract, axes_graphical_abstract = plt.subplots(1, 1, figsize=(13.28, 5.31)) # Adjusted figure size to meet the minimum pixel requirements (hxw) with a 400 dpi resolution.
sns.boxplot(data=df, x=graphical_abstract_group_col, y=columns[0], ax=axes_graphical_abstract, hue=graphical_abstract_group_col, palette="viridis", legend=False)
title = f'Graphical Abstract: Distribution of {columns[0]} grouped by {graphical_abstract_group_col}'
wrapped_title = '\n'.join(title[i:i+40] for i in range(0, len(title), 40)) # Break in 40 character chunks
axes_graphical_abstract.set_title(wrapped_title, fontsize=16)
axes_graphical_abstract.set_xlabel('', fontsize=12)
axes_graphical_abstract.set_ylabel(columns[0], fontsize=12)
axes_graphical_abstract.tick_params(axis='x', rotation=45, labelsize=10)
axes_graphical_abstract.tick_params(axis='y', labelsize=10)
plt.setp(axes_graphical_abstract.get_xticklabels(), ha="right")
axes_graphical_abstract.grid(axis='y', linestyle='--')
plt.tight_layout()

# Save the graphical abstract with specified dimensions and DPI
graphical_abstract_filename = 'figures/graphical_abstract.png'
fig_graphical_abstract.savefig(graphical_abstract_filename, dpi=400)
plt.close(fig_graphical_abstract)

print(f"Graphical abstract saved to {graphical_abstract_filename} with a resolution of 400 dpi and dimensions of 531x1328 pixels (height x width).")


Graphical abstract saved to figures/graphical_abstract.png with a resolution of 400 dpi and dimensions of 531x1328 pixels (height x width).


In [6]:
# Define the columns for analysis
columns = ['quality', 'agreement', 'accuracy', 'hallucination']

# Calculate the Spearman correlation matrix
spearman_corr = df[columns].corr(method='spearman')
display(spearman_corr)

# Calculate p-values for Spearman correlation coefficients
p_values = np.zeros(spearman_corr.shape)
for i in range(len(columns)):
    for j in range(len(columns)):
        if i != j:
            _, p_values[i, j] = ss.spearmanr(df[columns[i]], df[columns[j]], nan_policy='omit')
        else:
            p_values[i, j] = np.nan  # Set diagonal p-values to NaN

# Adjust p-values using Benjamini-Hochberg correction
p_values_flat = p_values.flatten()
reject, p_values_corrected, _, _ = multipletests(p_values_flat[~np.isnan(p_values_flat)], method='fdr_bh')
p_values_corrected_matrix = np.zeros_like(p_values)
p_values_corrected_matrix[~np.isnan(p_values)] = p_values_corrected

# Create a mask for insignificant correlations (p > 0.05)
mask = p_values_corrected_matrix > 0.05

# Create a mask for the upper triangle
mask_upper = np.triu(np.ones_like(spearman_corr, dtype=bool))

# Prepare annotations: correlation value or asterisk for significance
annotations = np.empty_like(spearman_corr, dtype=object)
for i in range(spearman_corr.shape[0]):
    for j in range(spearman_corr.shape[1]):
        if mask[i, j] or mask_upper[i,j]:
            annotations[i, j] = ""  # Empty string if masked
        else:
            annotations[i, j] = f"{spearman_corr.iloc[i, j]:.2f}"
            if p_values_corrected_matrix[i, j] <= 0.05:
                annotations[i, j] += "*"  # Add asterisk if significant

# Create the scatter plot matrix with Seaborn's PairGrid
g = sns.PairGrid(df[columns])
g.map_lower(sns.scatterplot, color="black")  # Plot scatter plots in the lower triangle
g.map_diag(sns.histplot, color="lightgray")  # Plot histograms on the diagonal
g.map_upper(sns.kdeplot, cmap="Blues")       # Plot densities in the upper triangle

# Add correlation annotations to the plots
for i in range(len(columns)):
    for j in range(i+1, len(columns)):  # Only fill the upper triangle
        g.axes[i, j].annotate(f"Corr: {spearman_corr.iloc[i, j]:.2f}\n", 
                              xy=(0.5, 0.5), xycoords="axes fraction", ha="center", va="center", 
                              fontsize=16 + 4, color="gray") # Increased fontsize by 4

# Increase the font size of x and y labels
for ax in g.axes.flat:
    if ax is not None:
        ax.set_xlabel(ax.get_xlabel(), fontsize=10 + 4)  # Increase x label size by 4
        ax.set_ylabel(ax.get_ylabel(), fontsize=10 + 4)  # Increase y label size by 4

# Adding the title
plt.subplots_adjust(top=0.9)  # Adjust the top value as needed
title = "Pairwise scatter plot matrix with Spearman’s correlations\nand Benjamini-Hochberg correction."
plt.suptitle(title, fontsize=18 + 4) # Increased fontsize by 4


# Save the figure
filename = 'figures/pairplot_spearman_correlation_matrix.png'
plt.savefig(filename, dpi=400, bbox_inches='tight') # Increased DPI to 400
plt.close()

print(f"Spearman correlation pairplot saved to {filename}")


Unnamed: 0,quality,agreement,accuracy,hallucination
quality,1.0,0.958889,0.976611,-0.276643
agreement,0.958889,1.0,0.954939,-0.284378
accuracy,0.976611,0.954939,1.0,-0.283889
hallucination,-0.276643,-0.284378,-0.283889,1.0


Spearman correlation pairplot saved to figures/pairplot_spearman_correlation_matrix.png


In [7]:
# Ensure categorical variables are properly set
categorical_vars = ['model', 'temperature', 'top_p', 'attempt', 'rag_type']
for var in categorical_vars:
    df[var] = df[var].astype('category')

# Reshape the data into long format for multivariate modeling
df_long = df.melt(id_vars=['attempt', 'model', 'temperature', 'top_p', 'rag_type'], 
                  value_vars=['quality', 'agreement', 'accuracy', 'hallucination'], 
                  var_name='response_type', value_name='value')



# Define the mixed model formula with interaction effects
formula = "value ~ model + temperature + top_p + rag_type + model:temperature + model:top_p + temperature:top_p"

# Fit a Multivariate Linear Mixed Model (MLMM) with random intercepts for prompts
md_mlmm = smf.mixedlm(formula, data=df_long, groups=df_long['attempt'], 
                       re_formula="~1", vc_formula={"response_type": "0 + C(response_type)"})

mdf_mlmm = md_mlmm.fit(method="cg")
print(mdf_mlmm.summary())
# assumptions are not satisfied. solve by bootstrap

                               Mixed Linear Model Regression Results
Model:                           MixedLM              Dependent Variable:              value       
No. Observations:                70560                Method:                          REML        
No. Groups:                      196                  Scale:                           6.4044      
Min. group size:                 360                  Log-Likelihood:                  -166720.4121
Max. group size:                 360                  Converged:                       Yes         
Mean group size:                 360.0                                                             
---------------------------------------------------------------------------------------------------
                                                       Coef.  Std.Err.    z     P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------
Intercept                      

In [8]:
api = po.Api()
api.login_from_browser()
# project_id = "67ec712c95ada7264a1dfa17" # 9 pg
project_id = "67bcb7190363417c095b80b3" # original
root_folder = api.project_get_files(project_id).children
figures_folder_id = next(file.id for file in root_folder if file.name == "figures")

figures_dir = "figures"
for filename in os.listdir(figures_dir):
    if filename.endswith(".png"):
        filepath = os.path.join(figures_dir, filename)
        try:
            file_bytes = open(filepath, "rb").read()
            new_file = api.project_upload_file(project_id, figures_folder_id, filename, file_bytes)
            print(f"Uploaded {filename} successfully.")
        except Exception as e:
            print(f"Error uploading {filename}: {e}")


Uploaded temperature_boxplots.png successfully.
Uploaded top_p_boxplots.png successfully.
Uploaded rag_type_model_boxplots.png successfully.
Uploaded rag_type_boxplots.png successfully.
Uploaded model_boxplots.png successfully.
Uploaded pairplot_spearman_correlation_matrix.png successfully.
Uploaded graphical_abstract.png successfully.


In [9]:
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import warnings
from tqdm.notebook import tqdm

def bootstrap_iteration(data, formula):
    # Sample with replacement the group identifiers
    unique_attempts = data['attempt'].unique()
    sampled_attempts = np.random.choice(unique_attempts, size=len(unique_attempts), replace=True)
    
    sample_list = []
    # Reassign a new group ID to each copy to avoid conflicts in the model
    for new_id, attempt in enumerate(sampled_attempts):
        group_data = data[data['attempt'] == attempt].copy()
        group_data['bootstrap_group'] = new_id
        sample_list.append(group_data)
    
    # Ensure all dataframes have the same columns before concatenating
    first_df = sample_list[0]
    for i in range(1, len(sample_list)):
        current_df = sample_list[i]
        # Add missing columns to the current dataframe
        for col in first_df.columns:
            if col not in current_df.columns:
                current_df[col] = np.nan 
        # Add missing columns to the first dataframe (to ensure consistency)
        for col in current_df.columns:
            if col not in first_df.columns:
                first_df[col] = np.nan
        sample_list[i] = current_df  # Update the list with the modified dataframe
    
    sample = pd.concat(sample_list, ignore_index=True)
    
    try:
        # Adjust the model using the new group identifiers
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            md = smf.mixedlm(formula, sample, groups=sample['bootstrap_group'],
                             re_formula="~1", vc_formula={"response_type": "0 + C(response_type)"})
            mdf = md.fit(method="cg", reml=False)
        return mdf.params
    except Exception as e:
        # Optional: log the error for debugging
        print(f"Bootstrap iteration failed: {e}")
        return None

def bootstrap_model_parallel(data, n_samples, formula, n_jobs=-1):
    # Executes the iterations in parallel; n_jobs=-1 uses all available cores
    
    results = Parallel(n_jobs=n_jobs)(
        delayed(bootstrap_iteration)(data.copy(), formula) for _ in tqdm(range(n_samples), desc="Bootstrap Iterations")
    )
    
    # Filters out the iterations that failed
    results = [res for res in results if res is not None]
    return pd.DataFrame(results)

n_bootstrap = 100
bootstrap_results = bootstrap_model_parallel(df_long, n_bootstrap, formula)

if bootstrap_results.empty:
    print("No bootstrap iteration was successful. Consider increasing n_bootstrap or checking the model.")
else:
    bootstrap_means = bootstrap_results.mean()
    bootstrap_se = bootstrap_results.std()
    bootstrap_ci = bootstrap_results.quantile([0.025, 0.975])
    
    results_df = pd.DataFrame({
        'Original Coef': mdf_mlmm.params,
        'Bootstrap Mean': bootstrap_means,
        'Bootstrap SE': bootstrap_se,
        '95% CI Lower': bootstrap_ci.loc[0.025],
        '95% CI Upper': bootstrap_ci.loc[0.975]
    })
    
    print("\nCoefficients with Bootstrap Correction:")
    print(results_df.round(4))


Bootstrap Iterations:   0%|          | 0/100 [00:00<?, ?it/s]


Coefficients with Bootstrap Correction:
                                                    Original Coef  \
Intercept                                                  3.9133   
model[T.TeenyTinyLlama-160m-NCM-ft]                        2.0147   
model[T.deepseek-chat]                                     1.3878   
model[T.gemini-2.0-flash]                                  0.8782   
model[T.gpt-4o-mini-2024-07-18]                            0.9884   
temperature[T.1.0]                                         0.1245   
temperature[T.1.9]                                         0.3548   
top_p[T.0.5]                                               0.1662   
top_p[T.0.9]                                               0.3119   
rag_type[T.Common RAG]                                    -2.3179   
model[T.TeenyTinyLlama-160m-NCM-ft]:temperature...        -0.5910   
model[T.deepseek-chat]:temperature[T.1.0]                 -0.0457   
model[T.gemini-2.0-flash]:temperature[T.1.0]              -0.0

In [10]:
# Suppress all warnings, including FutureWarnings
warnings.simplefilter("ignore")

def bootstrap_iteration(data, formula):
    # Sample with replacement from group identifiers
    unique_attempts = data['attempt'].unique()
    sampled_attempts = np.random.choice(unique_attempts, size=len(unique_attempts), replace=True)
    
    sample_list = []
    # Reassign a new group ID for each copy, avoiding conflicts in the model
    for new_id, attempt in enumerate(sampled_attempts):
        group_data = data[data['attempt'] == attempt].copy()
        group_data['bootstrap_group'] = new_id
        sample_list.append(group_data)
    
    # Ensure all dataframes have the same columns before concatenating
    first_df = sample_list[0]
    for i in range(1, len(sample_list)):
        current_df = sample_list[i]
        for col in first_df.columns:
            if col not in current_df.columns:
                current_df[col] = np.nan 
        for col in current_df.columns:
            if col not in first_df.columns:
                first_df[col] = np.nan
        sample_list[i] = current_df
    sample = pd.concat(sample_list, ignore_index=True)
    
    try:
        # Adjust the model using the new group identifiers
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            md = smf.mixedlm(formula, sample, groups=sample['bootstrap_group'],
                             re_formula="~1", vc_formula={"response_type": "0 + C(response_type)"})
            mdf = md.fit(method="cg", reml=False)
        
        # Extract variance components
        sigma_e = mdf.scale  # Residual variance

        # For the prompt variance component (σᵤ²), prioritize vc_formula if available
        if hasattr(mdf, 'vcomp') and mdf.vcomp is not None and len(mdf.vcomp) > 0:
            sigma_u = mdf.vcomp[0]
        elif hasattr(mdf, 'cov_re') and mdf.cov_re is not None:
            if isinstance(mdf.cov_re, np.ndarray) and mdf.cov_re.size > 0:
                sigma_u = mdf.cov_re[0, 0]
            else:
                sigma_u = 0
        else:
            sigma_u = 0

        # Calculate variance explained by fixed effects (σ_f^2)
        y_pred = mdf.fittedvalues
        
        # Helper function to extract the first value of the random effect
        def get_first_effect(x):
            val = mdf.random_effects.get(x, [0])
            if isinstance(val, pd.Series):
                return val.iloc[0]
            elif isinstance(val, (list, np.ndarray)):
                return val[0]
            else:
                return 0

        random_effects_series = sample['bootstrap_group'].map(get_first_effect)
        sigma_f = np.var(y_pred - random_effects_series)

        return mdf.params, mdf.random_effects, mdf.resid, sigma_f, sigma_u, sigma_e
    except Exception as e:
        print(f"Bootstrap iteration failed: {e}")
        return None, None, None, None, None, None

def bootstrap_model_parallel(data, n_samples, formula, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(
        delayed(bootstrap_iteration)(data.copy(), formula) for _ in tqdm(range(n_samples), desc="Bootstrap Iterations")
    )
    
    valid_results = [(res[0], res[3], res[4], res[5]) for res in results if res[0] is not None]
    if not valid_results:
        return [], [], [], []
    params, sigma_f_values, sigma_u_values, sigma_e_values = zip(*valid_results)
    return params, sigma_f_values, sigma_u_values, sigma_e_values

# Bootstrap configuration
n_bootstrap = 1000
bootstrap_results, bootstrap_sigma_f, bootstrap_sigma_u, bootstrap_sigma_e = bootstrap_model_parallel(df_long, n_bootstrap, formula)

if not bootstrap_results:
    print("No bootstrap iteration was successful. Consider increasing n_bootstrap or checking the model.")
else:
    bootstrap_results_df = pd.DataFrame(bootstrap_results)
    bootstrap_means = bootstrap_results_df.mean()
    bootstrap_se = bootstrap_results_df.std()
    bootstrap_ci = bootstrap_results_df.quantile([0.025, 0.975])
    
    results_df = pd.DataFrame({
        'Original Coef': mdf_mlmm.params,
        'Bootstrap Mean': bootstrap_means,
        'Bootstrap SE': bootstrap_se,
        '95% CI Lower': bootstrap_ci.loc[0.025],
        '95% CI Upper': bootstrap_ci.loc[0.975]
    })
    
    print("\nCoefficients with Bootstrap Correction:")
    print(results_df.round(4))

    # Estimate variance components
    avg_sigma_f = np.mean(bootstrap_sigma_f)
    avg_sigma_u = np.mean(bootstrap_sigma_u)
    avg_sigma_e = np.mean(bootstrap_sigma_e)
    
    total_variance = avg_sigma_f + avg_sigma_u + avg_sigma_e

    print(f"\nEstimated Variance Components:")
    print(f"Variance from Fixed Effects (σ_f^2): {avg_sigma_f:.4f}")
    print(f"Variance from Prompts (σ_u^2): {avg_sigma_u:.4f}")
    print(f"Residual Variance (σ_e^2): {avg_sigma_e:.4f}")
    print(f"Total Variance: {total_variance:.4f}")

    # Calculate pseudo-R^2
    marginal_r2 = avg_sigma_f / total_variance if total_variance > 0 else 0
    conditional_r2 = (avg_sigma_f + avg_sigma_u) / total_variance if total_variance > 0 else 0

    print(f"Marginal Pseudo-R^2 (R^2_m): {marginal_r2:.4f}")
    print(f"Conditional Pseudo-R^2 (R^2_c): {conditional_r2:.4f}")


Bootstrap Iterations:   0%|          | 0/1000 [00:00<?, ?it/s]


Coefficients with Bootstrap Correction:
                                                    Original Coef  \
Intercept                                                  3.9133   
model[T.TeenyTinyLlama-160m-NCM-ft]                        2.0147   
model[T.deepseek-chat]                                     1.3878   
model[T.gemini-2.0-flash]                                  0.8782   
model[T.gpt-4o-mini-2024-07-18]                            0.9884   
temperature[T.1.0]                                         0.1245   
temperature[T.1.9]                                         0.3548   
top_p[T.0.5]                                               0.1662   
top_p[T.0.9]                                               0.3119   
rag_type[T.Common RAG]                                    -2.3179   
model[T.TeenyTinyLlama-160m-NCM-ft]:temperature...        -0.5910   
model[T.deepseek-chat]:temperature[T.1.0]                 -0.0457   
model[T.gemini-2.0-flash]:temperature[T.1.0]              -0.0