# **Second Step: Zero-Shot Semantic Interval Rubric with model-internal evaluation**

In [2]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import anthropic

# Load .env
dotenv_path = "/mnt/4d4f90e5-f220-481e-8701-f0a546491c35/arquivos/projetos/.env"
load_dotenv(dotenv_path=dotenv_path)

# Retrieve API keys
openai_api_key     = os.getenv("OPENAI_API_KEY")
claude_api_key     = os.getenv("ANTHROPIC_API_KEY")
xai_api_key        = os.getenv("XAI_API_KEY")
deepseek_api_key   = os.getenv("DEEPSEEK_API_KEY")

# Configure clients
client_gpt     = OpenAI(api_key=openai_api_key)
client_claude  = anthropic.Anthropic(api_key=claude_api_key)
client_grok    = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
client_ds      = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")

# Registry of supported models
models = {
    'model_gpt_4o_mini': {'alias': '4o-mini', 'model': 'gpt-4o-mini', 'client': client_gpt},
    'model_gpt_41_nano': {'alias': '4.1-nano', 'model': 'gpt-4.1-nano', 'client': client_gpt},
    'model_claude_35_haiku': {'alias': '3-haiku', 'model': 'claude-3-5-haiku-latest', 'client': client_claude    },
    'model_grok_3_mini_beta': {'alias': 'grok-3-mini', 'model': 'grok-3-mini-beta', 'client': client_grok},
    'model_ds_v3': {'alias': 'ds-v3', 'model': 'deepseek-chat', 'client': client_ds}
}


In [3]:
def step_two(model_name, prompt):
    model_config = models.get(model_name)
    if not model_config:
        raise ValueError(f"Model '{model_name}' not found in the 'models' dictionary.")

    client = model_config['client']
    model = model_config['model']

    if isinstance(client, OpenAI):
        response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}])
        generated_text = response.choices[0].message.content
    
    elif isinstance(client, anthropic.Anthropic):
        response = client.messages.create(model=model, max_tokens=1000, messages=[{"role": "user", "content": prompt}])
        generated_text = response.content[0].text
    else:
        raise ValueError("Unsupported client type.")

    return generated_text

def step_two_recovered(model_name, prompt):
    # Extract model identifier from the model name
    model = model_name.split('_', 1)[-1]
    model_config = model_name
    if not model_config:
        raise ValueError(f"Model '{model_name}' not found in the 'models' dictionary.")

    # Assign the appropriate client and model based on the model name
    if model_name == 'model_gpt_4o_mini':
        openai_api_key = os.getenv("OPENAI_API_KEY")
        client = OpenAI(api_key=openai_api_key)
        model = 'gpt-4o-mini'
        
    elif model_name == 'model_gpt_41_nano':
        openai_api_key = os.getenv("OPENAI_API_KEY")
        client = OpenAI(api_key=openai_api_key)
        model = 'gpt-4.1-nano'
        
    elif model_name == 'model_claude_35_haiku':
        claude_api_key = os.getenv("ANTHROPIC_API_KEY")
        client = anthropic.Anthropic(api_key=claude_api_key)
        model = 'claude-3-5-haiku-latest'
        
    elif model_name == 'model_grok_3_mini_beta':
        xai_api_key = os.getenv("XAI_API_KEY")
        client = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
        model = 'grok-3-mini-beta'

    elif model_name == 'model_ds_v3':
        deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
        client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")
        model = 'deepseek-chat'

    else:
        raise ValueError(f"Unsupported model name: {model_name}")

    # Generate response based on client type
    if isinstance(client, OpenAI):
        response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}])
        generated_text = response.choices[0].message.content
    elif isinstance(client, anthropic.Anthropic):
        response = client.messages.create(model=model, max_tokens=1000, messages=[{"role": "user", "content": prompt}])
        generated_text = response.content[0].text
    else:
        raise ValueError("Unsupported client type.")

    return generated_text

In [5]:
import pandas as pd

# Load the CSV data into a DataFrame
df = pd.read_csv('data/tidydata2cmr.csv').fillna('')
print(df.shape)
display(df.head())

(10125, 15)


Unnamed: 0,source,item,answer,r,model,prompt_type,model_answer,hit,model_alternative_answer,hit_alternative,alternative_response,justification,extra_text,CoT,cot_steps
0,ARC,Question: Which of the following traits is mos...,A,0,model_gpt_4o_mini,cot,A,1,A,1,,Body weight is significantly affected by envir...,,1. Body weight is influenced by both genetics ...,5.0
1,ARC,Question: Which of the following traits is mos...,A,1,model_gpt_4o_mini,cot,A,1,A,1,,Body weight is strongly affected by environmen...,,1. To determine which trait is most influenced...,7.0
2,ARC,Question: Which of the following traits is mos...,A,2,model_gpt_4o_mini,cot,A,1,A,1,,Body weight is the trait most significantly in...,,1. Body weight is influenced by both genetics ...,5.0
3,ARC,Question: Which of the following traits is mos...,A,3,model_gpt_4o_mini,cot,A,1,A,1,,Body weight is heavily influenced by environme...,,"1. Body weight can be influenced by diet, phys...",4.0
4,ARC,Question: Which of the following traits is mos...,A,4,model_gpt_4o_mini,cot,A,1,A,1,,Body weight is the trait most affected by envi...,,1. Body weight can be heavily influenced by en...,5.0


# **Depth of Reasoning**

In [7]:
def build_depth_of_reasoning_prompt(row):
    prompt = (
        "You are a professional evaluator. Your task is to assess the Depth of Reasoning of the extracted content."
        "Assess the complexity and structure of the reasoning.\n\n"
        "Input:\n\n"
        "- Item: " f"{row["item"]}\n"
        "- Correct Answer: " f"{row["answer"]}\n\n"
        "- Extracted Answer: " f"{row["model_alternative_answer"]}\n"
        "- Extracted Justification: " f"{row["justification"]}\n"
        "- Extracted Alternative Response: " f"{row["alternative_response"]}\n"
        "- Extracted Chain of Thought: " f"{row["CoT"]}\n\n"

        "Score from 0 (no reasoning) to 10 (multi-step, layered inference).\n\n"

        "Refer to the following rubric:\n\n"
        "Score Interval: (9,10] | Multi-step, layered reasoning with strong inferential structure.\n"
        "Score Interval: (7,9]  | Sound and structured reasoning with some complexity.\n"
        "Score Interval: (5,7]  | Basic logical sequence with minimal elaboration.\n"
        "Score Interval: (3,5]  | Shallow reasoning with gaps or simplifications.\n"
        "Score Interval: (1,3]  | Fragmented logic or one-step heuristic answer.\n"
        "Score Interval: [0,1]  | No reasoning trace or incoherent rationale.\n\n"

        "Based on this rubric, assign a score from 0 to 10 for Depth of Reasoning\n"
        "Return ONLY the following JSON:\n\n"
        
        "```json\n"
        "{\n"
            "depth_of_reasoning: <float>,\n"
            "depth_of_reasoning_justification: <explanation of reasoning structure>\n"
        "}\n"
        "```\n\n"
        "Ensure the JSON is parseable by a standard JSON parser (double quotes for keys, no trailing commas)."
    )
    return prompt

i=0
row = df.iloc[i]
print(build_depth_of_reasoning_prompt(row))

You are a professional evaluator. Your task is to assess the Depth of Reasoning of the extracted content.Assess the complexity and structure of the reasoning.

Input:

- Item: Question: Which of the following traits is most influenced by the environment?
Choices:
A. Body weight
B. Eye color
C. Blood type
D. Color blindness
- Correct Answer: A

- Extracted Answer: A
- Extracted Justification: Body weight is significantly affected by environmental factors like diet and activity level, making it the trait most influenced by the environment among the options listed.
- Extracted Alternative Response: 
- Extracted Chain of Thought: 1. Body weight is influenced by both genetics and environmental factors such as diet, physical activity, and lifestyle; thus, it is highly environment-dependent. 2. Eye color is primarily determined by genetics, with minimal environmental influence. 3. Blood type is genetically determined and does not change due to environmental factors. 4. Color blindness is a ge

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import os
import concurrent.futures

In [None]:
# Get unique model names from the dataframe
unique_models = df['model'].unique()

# Prepare subsets of the dataframe for each target model
subsets_with_results = {
    model: df[df['model'] == model].copy()
    for model in unique_models
}

# Function to evaluate a pair of models (evaluator and target)
def evaluate_pair(evaluator_model, target_model):
    subset = subsets_with_results[target_model]
    generated = []

    # Iterate over each row in the subset with progress bar
    for _, row in tqdm(subset.iterrows(),
                       total=len(subset),
                       desc=f"{evaluator_model} â†’ {target_model}",
                       leave=False):
        # Build prompt for depth of reasoning assessment
        prompt = build_depth_of_reasoning_prompt(row)
        # Generate response using step_two function
        generated_text = step_two(evaluator_model, prompt)
        generated.append(generated_text)

    # Store generated responses in the corresponding column
    subsets_with_results[target_model][f'step2_{evaluator_model}'] = generated

# Create list of tasks for all model pairs (evaluator vs target)
tasks = [
    (evaluator, target)
    for evaluator in unique_models
    for target in unique_models
]

# Execute evaluation tasks in parallel using ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(evaluate_pair, evaluator, target)
        for evaluator, target in tasks
    ]
    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Save the results for each target model to CSV files
for target_model, df_result in subsets_with_results.items():
    df_result.to_csv(f"data/step2/depth_of_reasoning_{target_model}.csv", index=False)
    print(f"Saved: depth_of_reasoning_{target_model}.csv")


In [None]:
# Get unique models from the dataframe
unique_models = df['model'].unique()

# Create subsets of the dataframe for each target model
subsets_with_results = {
    model: df[df['model'] == model].copy()
    for model in unique_models
}

# List of missing model pairs (evaluator â†’ target)
missing_pairs = [
    ("model_gpt_4o_mini", "model_gpt_41_nano"),
    ("model_gpt_4o_mini", "model_ds_v3"),
    ("model_gpt_4o_mini", "model_gpt_4o_mini"),
    ("model_gpt_41_nano", "model_claude_35_haiku"),
    ("model_claude_35_haiku", "model_gpt_4o_mini"),
    ("model_gpt_41_nano", "model_gpt_4o_mini"),
    ("model_gpt_4o_mini", "model_claude_35_haiku"),
]

# Output directory path
output_dir = "data/step2"

# Main evaluation function for each model pair
def evaluate_pair(evaluator_model, target_model):
    subset = subsets_with_results[target_model]
    generated = []

    print(f"\nEvaluating: {evaluator_model} â†’ {target_model} ({len(subset)} items)")    
    for _, row in tqdm(subset.iterrows(), total=len(subset), desc=f"{evaluator_model} â†’ {target_model}", leave=False):
        try:
            # Build prompt for depth of reasoning assessment
            prompt = build_depth_of_reasoning_prompt(row)
            # Generate response using the recovery function
            generated_text = step_two_recovered(evaluator_model, prompt)
        except Exception as e:
            # Handle errors gracefully
            generated_text = f"ERROR: {str(e)}"
        generated.append(generated_text)

    # Store generated responses in the corresponding column
    subsets_with_results[target_model][f'step2_{evaluator_model}'] = generated

# Execute evaluation for each missing pair in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(evaluate_pair, evaluator, target)
        for evaluator, target in missing_pairs
    ]
    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Save each target model's results as a separate CSV to prevent overwriting
for target_model, df_result in subsets_with_results.items():
    output_path = os.path.join(output_dir, f"depth_of_reasoning_{target_model}_recovered.csv")
    df_result.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")


In [14]:
# Sort the list of CSV files alphabetically and print
step2_dir = "data/step2"
csv_files = [f for f in os.listdir(step2_dir) if f.endswith('.csv')]
csv_files.sort()
print(csv_files)


['depth_of_reasoning_model_claude_35_haiku.csv', 'depth_of_reasoning_model_claude_35_haiku_recovered.csv', 'depth_of_reasoning_model_ds_v3.csv', 'depth_of_reasoning_model_ds_v3_recovered.csv', 'depth_of_reasoning_model_gpt_41_nano.csv', 'depth_of_reasoning_model_gpt_41_nano_recovered.csv', 'depth_of_reasoning_model_gpt_4o_mini.csv', 'depth_of_reasoning_model_gpt_4o_mini_recovered.csv', 'depth_of_reasoning_model_grok_3_mini_beta.csv', 'depth_of_reasoning_model_grok_3_mini_beta_recovered.csv']


In [23]:
import os
import pandas as pd

# Caminho para os arquivos CSV
step2_dir = 'data/step2/partial'
output_dir = 'data/step2/merged'
os.makedirs(output_dir, exist_ok=True)

# Lista de arquivos CSV na pasta (excluindo os que jÃ¡ sÃ£o "_recovered.csv")
csv_files = [f for f in os.listdir(step2_dir) if f.endswith('.csv') and not f.endswith('_recovered.csv')]

# Colunas para realizar o join
join_columns = [
    'source', 'item', 'answer', 'r', 'model', 'prompt_type',
    'model_answer', 'hit', 'model_alternative_answer',
    'hit_alternative', 'alternative_response', 'justification',
    'extra_text', 'CoT', 'cot_steps'
]

# Lista para acumular todos os merged_df
merged_list = []

for filename in csv_files:
    original_path = os.path.join(step2_dir, filename)
    recovered_path = os.path.join(step2_dir, filename.replace('.csv', '_recovered.csv'))
    
    if not os.path.exists(recovered_path):
        print(f'Arquivo nÃ£o encontrado: {recovered_path}')
        continue

    original_df = pd.read_csv(original_path)
    recovered_df = pd.read_csv(recovered_path)

    merged_df = original_df.merge(recovered_df, how='left', on=join_columns)

    # Reorganize columns apÃ³s 'cot_steps'
    cols = list(merged_df.columns)
    try:
        cot_steps_index = cols.index('cot_steps')
        after_cot_steps = cols[cot_steps_index + 1:]
        sorted_after_cot_steps = sorted(after_cot_steps)
        new_order = cols[:cot_steps_index + 1] + sorted_after_cot_steps
        merged_df = merged_df[new_order]
    except ValueError:
        pass

    # Salva o arquivo individual
    output_path = os.path.join(output_dir, filename)
    merged_df.to_csv(output_path, index=False)
    print(merged_df.shape)

    # Acumula no merged_all
    merged_list.append(merged_df)

# Concatena tudo no final
merged_all = pd.concat(merged_list, ignore_index=True)
merged_all.to_csv(os.path.join(output_dir, 'depth_of_reasoning_model_merged_all.csv'), index=False)
print('merged_all.csv salvo com shape:', merged_all.shape)


(2025, 20)
(2025, 20)
(2025, 20)
(2025, 20)
(2025, 20)
merged_all.csv salvo com shape: (10125, 20)


# **Originality**

In [8]:
def build_originality_prompt(row):
    prompt = (
        "You are a professional evaluator. Your task is to assess the Originality of the extracted content. "
        "Evaluate the degree of abstraction, rephrasing, and lexical and syntactic diversity. Identify signs of prompt "
        "echoing, i.e., shallow repetition of lexical elements or structural patterns from the input.\n\n"

        "Input:\n"
        f"- Item: {row['item']}\n"
        f"- Extracted Answer: {row['model_alternative_answer']}\n"
        f"- Extracted Justification: {row['justification']}\n"
        f"- Extracted Alternative Response: {row['alternative_response']}\n"
        f"- Extracted Chain of Thought: {row['CoT']}\n\n"

        "Also consider the following patterns indicative of originality:\n"
        "- Use of intermediate planning (e.g., step-by-step comments or structured reasoning).\n"
        "- Iterative reasoning or trial-and-error, especially evident in CoT or alternative responses.\n"
        "- Reformulation that departs from both lexical and syntactic structure of the input.\n"
        "- Introduction of new abstractions, analogies, or contextual elements not present in the original item.\n"
        "Flag low originality if responses mirror the input structure or vocabulary, even with surface paraphrasing.\n\n"

        "Score from 0 (verbatim or generic) to 10 (highly novel or creative).\n\n"
        "Refer to the following refined rubric:\n"

        "Score Interval: (9,10]  | Highly original and creative phrasing. Demonstrates abstraction, analogies, or planning. "
        "No lexical or structural echo from the prompt.\n"
        "Score Interval: (7,9]   | Strong semantic transformation with novel structure or reframing. Limited reuse of form or "
        "vocabulary is acceptable if embedded in creative formulation.\n"
        "Score Interval: (5,7]   | Contains some original phrasing or ideas, but retains significant elements from the prompt's "
        "syntax or lexicon. Limited abstraction.\n"
        "Score Interval: (3,5]   | Mostly generic or templated response. Moderate echoing in either vocabulary or sentence structure.\n"
        "Score Interval: (1,3]   | Prompt echoing dominates, including structural mimicry. Very low novelty.\n"
        "Score Interval: [0,1]   | Verbatim or near-verbatim reproduction. No abstraction or transformation detected.\n\n"

        "Based on this rubric, assign a score from 0 to 10 for Originality.\n"
        "Return ONLY the following JSON:\n\n"
        "```json\n"
        "{\n"
        '  "originality": <float>,\n'
        '  "originality_justification": "<brief statement of novelty or echoing>"\n'
        "}\n"
        "```\n\n"
        "Ensure the JSON is parseable by a standard JSON parser (double quotes for keys, no trailing commas)."
    )
    return prompt

i=0
row = df.iloc[i]
print(build_originality_prompt(row))

You are a professional evaluator. Your task is to assess the Originality of the extracted content. Evaluate the degree of abstraction, rephrasing, and lexical and syntactic diversity. Identify signs of prompt echoing, i.e., shallow repetition of lexical elements or structural patterns from the input.

Input:
- Item: Question: Which of the following traits is most influenced by the environment?
Choices:
A. Body weight
B. Eye color
C. Blood type
D. Color blindness
- Extracted Answer: A
- Extracted Justification: Body weight is significantly affected by environmental factors like diet and activity level, making it the trait most influenced by the environment among the options listed.
- Extracted Alternative Response: 
- Extracted Chain of Thought: 1. Body weight is influenced by both genetics and environmental factors such as diet, physical activity, and lifestyle; thus, it is highly environment-dependent. 2. Eye color is primarily determined by genetics, with minimal environmental influe

In [None]:
# Get unique model names from the dataframe 
unique_models = df['model'].unique()

# Prepare subsets of the dataframe for each target model
subsets_with_results = {
    model: df[df['model'] == model].copy()
    for model in unique_models
}

# Function to evaluate a pair of models (evaluator and target)
def evaluate_pair(evaluator_model, target_model):
    subset = subsets_with_results[target_model]
    generated = []

    # Iterate over each row in the subset with progress bar
    for _, row in tqdm(subset.iterrows(),
                       total=len(subset),
                       desc=f"{evaluator_model} â†’ {target_model}",
                       leave=False):
        # Build prompt for depth of reasoning assessment
        prompt = build_originality_prompt(row)
        # Generate response using step_two function
        generated_text = step_two(evaluator_model, prompt)
        generated.append(generated_text)

    # Store generated responses in the corresponding column
    subsets_with_results[target_model][f'step2_{evaluator_model}'] = generated

# Create list of tasks for all model pairs (evaluator vs target)
tasks = [
    (evaluator, target)
    for evaluator in unique_models
    for target in unique_models
]

# Execute evaluation tasks in parallel using ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(evaluate_pair, evaluator, target)
        for evaluator, target in tasks
    ]
    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Save the results for each target model to CSV files
for target_model, df_result in subsets_with_results.items():
    df_result.to_csv(f"data/step2/partial/originality_{target_model}.csv", index=False)
    print(f"Saved: originality_{target_model}.csv")


In [None]:
# Get unique models from the dataframe
unique_models = df['model'].unique()

# Create subsets of the dataframe for each target model
subsets_with_results = {
    model: df[df['model'] == model].copy()
    for model in unique_models
}

# List of missing model pairs (evaluator â†’ target)
missing_pairs = [
    ("model_gpt_4o_mini", "model_gpt_4o_mini"),
    ("model_gpt_4o_mini", "model_gpt_41_nano"),
    ("model_claude_35_haiku", "model_gpt_4o_mini"),
    ("model_gpt_4o_mini", "model_ds_v3"),
    ("model_gpt_41_nano", "model_gpt_41_nano"),
    ("model_gpt_41_nano", "model_gpt_4o_mini"),
    ("model_gpt_41_nano", "model_grok_3_mini_beta"),
    ("model_gpt_4o_mini", "model_claude_35_haiku"),
    ("model_claude_35_haiku", "model_gpt_41_nano"),
    ("model_claude_35_haiku", "model_grok_3_mini_beta")
]

# Output directory path
output_dir = "data/step2/partial"

# Main evaluation function for each model pair
def evaluate_pair(evaluator_model, target_model):
    subset = subsets_with_results[target_model]
    generated = []

    print(f"\nEvaluating: {evaluator_model} â†’ {target_model} ({len(subset)} items)")    
    for _, row in tqdm(subset.iterrows(), total=len(subset), desc=f"{evaluator_model} â†’ {target_model}", leave=False):
        try:
            # Build prompt for depth of reasoning assessment
            prompt = build_originality_prompt(row)
            # Generate response using the recovery function
            generated_text = step_two_recovered(evaluator_model, prompt)
        except Exception as e:
            # Handle errors gracefully
            generated_text = f"ERROR: {str(e)}"
        generated.append(generated_text)

    # Store generated responses in the corresponding column
    subsets_with_results[target_model][f'step2_{evaluator_model}'] = generated

# Execute evaluation for each missing pair in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(evaluate_pair, evaluator, target)
        for evaluator, target in missing_pairs
    ]
    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Save each target model's results as a separate CSV to prevent overwriting
for target_model, df_result in subsets_with_results.items():
    output_path = os.path.join(output_dir, f"originality_{target_model}_recovered.csv")
    df_result.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")


In [35]:
# Sort the list of CSV files alphabetically and print
step2_dir = "data/step2/partial"
csv_files = [f for f in os.listdir(step2_dir) if f.endswith('.csv')]
csv_files.sort()
print(csv_files)

['originality_model_claude_35_haiku.csv', 'originality_model_claude_35_haiku_recovered.csv', 'originality_model_ds_v3.csv', 'originality_model_ds_v3_recovered.csv', 'originality_model_gpt_41_nano.csv', 'originality_model_gpt_41_nano_recovered.csv', 'originality_model_gpt_4o_mini.csv', 'originality_model_gpt_4o_mini_recovered.csv', 'originality_model_grok_3_mini_beta.csv', 'originality_model_grok_3_mini_beta_recovered.csv']


In [38]:
import os
import pandas as pd

# Caminho para os arquivos CSV
step2_dir = 'data/step2/partial'
output_dir = 'data/step2/merged'
os.makedirs(output_dir, exist_ok=True)

# Lista de arquivos CSV na pasta (excluindo os que jÃ¡ sÃ£o "_recovered.csv")
csv_files = [f for f in os.listdir(step2_dir) if f.endswith('.csv') and not f.endswith('_recovered.csv')]

# Colunas para realizar o join
join_columns = [
    'source', 'item', 'answer', 'r', 'model', 'prompt_type',
    'model_answer', 'hit', 'model_alternative_answer',
    'hit_alternative', 'alternative_response', 'justification',
    'extra_text', 'CoT', 'cot_steps'
]

# Lista para acumular todos os merged_df
merged_list = []

for filename in csv_files:
    original_path = os.path.join(step2_dir, filename)
    recovered_path = os.path.join(step2_dir, filename.replace('.csv', '_recovered.csv'))
    
    if not os.path.exists(recovered_path):
        print(f'Arquivo nÃ£o encontrado: {recovered_path}')
        continue

    original_df = pd.read_csv(original_path)
    recovered_df = pd.read_csv(recovered_path)

    merged_df = original_df.merge(recovered_df, how='left', on=join_columns)

    # Reorganize columns apÃ³s 'cot_steps'
    cols = list(merged_df.columns)
    try:
        cot_steps_index = cols.index('cot_steps')
        after_cot_steps = cols[cot_steps_index + 1:]
        sorted_after_cot_steps = sorted(after_cot_steps)
        new_order = cols[:cot_steps_index + 1] + sorted_after_cot_steps
        merged_df = merged_df[new_order]
    except ValueError:
        pass

    # Salva o arquivo individual
    output_path = os.path.join(output_dir, filename)
    merged_df.to_csv(output_path, index=False)
    print(merged_df.shape)

    # Acumula no merged_all
    merged_list.append(merged_df)

# Concatena tudo no final
merged_all = pd.concat(merged_list, ignore_index=True)
merged_all.to_csv(os.path.join(output_dir, 'originality_model_merged_all.csv'), index=False)
print('merged_all.csv salvo com shape:', merged_all.shape)


(2025, 20)
(2025, 20)
(2025, 20)
(2025, 20)
(2025, 20)
merged_all.csv salvo com shape: (10125, 20)


# **Recovery ERROR code**

In [10]:
import pandas as pd

# Load the depth of reasoning and originality datasets
depth_df = pd.read_csv('data/depth_of_reasoning_model_merged_all.csv').fillna("")
originality_df = pd.read_csv('data/originality_model_merged_all.csv').fillna("")

# Optional: Display the first few rows to verify successful loading
#display(depth_df.head(2))
#display(originality_df.head(2))

# List all column names in the current DataFrame
print(depth_df.columns.tolist())

# Create two datasets: one with errors in specified columns, another without errors

# Define the columns to check for errors
columns_to_check = [
    'step2_model_claude_35_haiku',
    'step2_model_ds_v3',
    'step2_model_gpt_41_nano',
    'step2_model_gpt_4o_mini',
    'step2_model_grok_3_mini_beta'
]

import re

# Function to identify errors in the specified columns using regex
def has_error(value):
    if pd.isnull(value):
        return False
    value_str = str(value)
    # Check if the string starts with 'Error' (case-insensitive)
    if re.match(r'^ERROR', value_str, re.IGNORECASE):
        return True
    return False

# Filter dataset with errors
deep_df_with_errors = depth_df[depth_df[columns_to_check].apply(lambda row: any(has_error(val) for val in row), axis=1)]
originality_df_with_errors = originality_df[originality_df[columns_to_check].apply(lambda row: any(has_error(val) for val in row), axis=1)]

# Filter dataset without errors
deep_df_without_errors = depth_df[~depth_df[columns_to_check].apply(lambda row: any(has_error(val) for val in row), axis=1)]
deep_dforiginality_df_without_errors = originality_df[~originality_df[columns_to_check].apply(lambda row: any(has_error(val) for val in row), axis=1)]

#display(deep_df_with_errors.head(2))
#display(originality_df_with_errors.head(2))
deep_df_with_errors_original = deep_df_with_errors.copy(deep=True)
originality_df_with_errors_original = originality_df_with_errors.copy(deep=True)

['source', 'item', 'answer', 'r', 'model', 'prompt_type', 'model_answer', 'hit', 'model_alternative_answer', 'hit_alternative', 'alternative_response', 'justification', 'extra_text', 'CoT', 'cot_steps', 'step2_model_claude_35_haiku', 'step2_model_ds_v3', 'step2_model_gpt_41_nano', 'step2_model_gpt_4o_mini', 'step2_model_grok_3_mini_beta']


In [11]:
from tqdm import tqdm

def recall_llm_and_update_depth(df, model_column):
    for index, row in tqdm(df.iterrows(), total=len(df), desc=f"Recalling {model_column}"):
        if str(row[model_column]).startswith("ERROR"):
            try:
                prompt = build_depth_of_reasoning_prompt(row)
                model_to_use = model_column.replace('step2_', '')  # ajusta o nome do modelo
                response = step_two_recovered(model_to_use, prompt)
                df.at[index, model_column] = response
            except Exception as e:
                print(f"Error processing row {index}: {e}")
                df.at[index, model_column] = f"ERROR: {e}"
    return df

def recall_llm_and_update_originality(df, model_column):
    for index, row in tqdm(df.iterrows(), total=len(df), desc=f"Recalling {model_column}"):
        if str(row[model_column]).startswith("ERROR"):
            try:
                prompt = build_originality_prompt(row)
                model_to_use = model_column.replace('step2_', '')  # ajusta o nome do modelo
                response = step_two_recovered(model_to_use, prompt)
                df.at[index, model_column] = response
            except Exception as e:
                print(f"Error processing row {index}: {e}")
                df.at[index, model_column] = f"ERROR: {e}"
    return df

# Iterate through the columns and apply the recall function
for column in columns_to_check:
    deep_df_with_errors = recall_llm_and_update_depth(deep_df_with_errors, column)
    originality_df_with_errors = recall_llm_and_update_originality(originality_df_with_errors, column)


Recalling step2_model_claude_35_haiku: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1478/1478 [00:33<00:00, 44.78it/s]
Recalling step2_model_claude_35_haiku: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4206/4206 [00:00<00:00, 13028.46it/s]
Recalling step2_model_ds_v3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1478/1478 [00:00<00:00, 12801.32it/s]
Recalling step2_model_ds_v3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4206/4206 [00:00<00:00, 12718.45it/s]
Recalling step2_model_gpt_41_nano: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1478/1478 [00:00<00:00, 12816.96it/s]
Recalling step2_model_gpt_41_nano: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4206/4206 [00:00<00:00, 12465.75it/s]
Recalling step2_model_gpt_4o_mini: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1478/1478 [56:43<00:00,  2.30s/it] 
Recalling step2_model_gpt_4o_mini: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4206/4206 [1:53:10<00:00,  1.61s/it]  
Recalling step2_model_grok_3_mini_beta: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1478/1478 [00:00<00:00, 9750.80it/s]
Recalling step

In [15]:
# Count the number of occurrences where any error exists in the specified columns of depth_df
error_deep = deep_df_with_errors[deep_df_with_errors[columns_to_check].apply(lambda row: any(has_error(val) for val in row), axis=1)].shape[0]
error_orig = originality_df_with_errors[originality_df_with_errors[columns_to_check].apply(lambda row: any(has_error(val) for val in row), axis=1)].shape[0]
print(f"Number of error occurrences: {error_deep}, {error_orig}")


Number of error occurrences: 0, 0


In [19]:
df_final_deep = pd.concat([deep_df_without_errors, deep_df_with_errors], ignore_index=True)
df_final_orig = pd.concat([deep_dforiginality_df_without_errors, originality_df_with_errors], ignore_index=True)

# Save the concatenated dataframes as CSV files
df_final_deep.to_csv('data/deep_recovered.csv', index=False)
df_final_orig.to_csv('data/originality_recovered.csv', index=False)

# **Extract from cell json - complete and tidy data**

In [15]:
import re
import pandas as pd
import numpy as np

# Load the recovered deep data
df_dor = pd.read_csv('data/deep_recovered.csv')
df_dor = df_dor.fillna("")  # Fill NaN values with empty string

# Load the recovered originality data
df_ori = pd.read_csv('data/originality_recovered.csv')
df_ori = df_ori.fillna("")  # Fill NaN values with empty string

# Lista de colunas a processar
columns_to_check = [
    'step2_model_claude_35_haiku',
    'step2_model_ds_v3',
    'step2_model_gpt_41_nano',
    'step2_model_gpt_4o_mini',
    'step2_model_grok_3_mini_beta'
]

# FunÃ§Ãµes auxiliares com regex
def extract_field_regex(text, field):
    if not isinstance(text, str):
        return None
    pattern_number = rf'"{field}"\s*:\s*([0-9]+(?:\.[0-9]+)?)'
    pattern_string = rf'"{field}"\s*:\s*"([^"]+)"'
    
    match_number = re.search(pattern_number, text)
    if match_number:
        return float(match_number.group(1))
    
    match_string = re.search(pattern_string, text)
    if match_string:
        return match_string.group(1)
    
    return None

# Aplica a extraÃ§Ã£o para cada coluna
for col in columns_to_check:
    model = col.replace('step2_model_', '')

    # df_dor: extrai depth_of_reasoning e justification
    df_dor[f'gr_dor_{model}'] = df_dor[col].apply(lambda x: extract_field_regex(x, 'depth_of_reasoning'))
    df_dor[f'just_dor_{model}'] = df_dor[col].apply(lambda x: extract_field_regex(x, 'depth_of_reasoning_justification'))

    # df_ori: extrai originality e justification
    df_ori[f'gr_ori_{model}'] = df_ori[col].apply(lambda x: extract_field_regex(x, 'originality'))
    df_ori[f'just_ori_{model}'] = df_ori[col].apply(lambda x: extract_field_regex(x, 'originality_justification'))

# Adiciona a coluna que identifica a origem da base
df_dor['criterion'] = 'depth of reasoning'
df_ori['criterion'] = 'originality'

df_combined = df_dor.merge(df_ori, on=['source', 'item', 'answer', 'r', 'model', 'prompt_type', 'model_answer',
                                        'hit', 'model_alternative_answer', 'hit_alternative',
                                        'alternative_response', 'justification', 'extra_text', 'CoT',
                                        'cot_steps'], how='left')


# Substituir strings vazias ou espaÃ§os por NaN
df_combined['cot_steps'] = df_combined['cot_steps'].replace(r'^\s*$', np.nan, regex=True)

# Converter para numÃ©rico (valores invÃ¡lidos serÃ£o convertidos em NaN)
df_combined['cot_steps'] = pd.to_numeric(df_combined['cot_steps'], errors='coerce')

df_combined.to_parquet('data/tidy_data_2_aed_model.parquet', index=False)

display(df_combined.head(3))
print(df_combined.info())

  df_combined['cot_steps'] = df_combined['cot_steps'].replace(r'^\s*$', np.nan, regex=True)


Unnamed: 0,source,item,answer,r,model,prompt_type,model_answer,hit,model_alternative_answer,hit_alternative,...,just_ori_claude_35_haiku,gr_ori_ds_v3,just_ori_ds_v3,gr_ori_gpt_41_nano,just_ori_gpt_41_nano,gr_ori_gpt_4o_mini,just_ori_gpt_4o_mini,gr_ori_grok_3_mini_beta,just_ori_grok_3_mini_beta,criterion_y
0,ARC,Question: Which of the following traits is mos...,A,0,model_gpt_41_nano,cot,A,1,A,1,...,The response demonstrates structured reasoning...,6.5,The response demonstrates structured reasoning...,7.5,The response employs structured step-by-step r...,6.5,The extracted content shows some original reas...,6.5,Response shows some original phrasing and stru...,originality
1,ARC,Question: Which of the following traits is mos...,A,1,model_gpt_41_nano,cot,A,1,A,1,...,Demonstrates moderate originality through stru...,6.0,Contains some original phrasing and structured...,7.5,"The reasoning adopts a structured, step-by-ste...",7.5,The extracted answer shows strong semantic tra...,6.0,Moderate originality from structured Chain of ...,originality
2,ARC,Question: Which of the following traits is mos...,A,2,model_gpt_41_nano,cot,A,1,A,1,...,The response demonstrates structured reasoning...,6.5,The response demonstrates structured reasoning...,8.2,"The reasoning employs a structured, step-by-st...",7.5,The response demonstrates strong semantic tran...,6.0,The response includes some original structurin...,originality


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10125 entries, 0 to 10124
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   source                          10125 non-null  object 
 1   item                            10125 non-null  object 
 2   answer                          10125 non-null  object 
 3   r                               10125 non-null  int64  
 4   model                           10125 non-null  object 
 5   prompt_type                     10125 non-null  object 
 6   model_answer                    10125 non-null  object 
 7   hit                             10125 non-null  int64  
 8   model_alternative_answer        10125 non-null  object 
 9   hit_alternative                 10125 non-null  int64  
 10  alternative_response            10125 non-null  object 
 11  justification                   10125 non-null  object 
 12  extra_text                      

In [3]:
print(df_combined.columns)


Index(['source', 'item', 'answer', 'r', 'model', 'prompt_type', 'model_answer',
       'hit', 'model_alternative_answer', 'hit_alternative',
       'alternative_response', 'justification', 'extra_text', 'CoT',
       'cot_steps', 'step2_model_claude_35_haiku_x', 'step2_model_ds_v3_x',
       'step2_model_gpt_41_nano_x', 'step2_model_gpt_4o_mini_x',
       'step2_model_grok_3_mini_beta_x', 'gr_dor_claude_35_haiku',
       'just_dor_claude_35_haiku', 'gr_dor_ds_v3', 'just_dor_ds_v3',
       'gr_dor_gpt_41_nano', 'just_dor_gpt_41_nano', 'gr_dor_gpt_4o_mini',
       'just_dor_gpt_4o_mini', 'gr_dor_grok_3_mini_beta',
       'just_dor_grok_3_mini_beta', 'criterion_x',
       'step2_model_claude_35_haiku_y', 'step2_model_ds_v3_y',
       'step2_model_gpt_41_nano_y', 'step2_model_gpt_4o_mini_y',
       'step2_model_grok_3_mini_beta_y', 'gr_ori_claude_35_haiku',
       'just_ori_claude_35_haiku', 'gr_ori_ds_v3', 'just_ori_ds_v3',
       'gr_ori_gpt_41_nano', 'just_ori_gpt_41_nano', 'gr_ori_gpt_

In [5]:
import tarfile
import os

df_combined = pd.concat([df_dor, df_ori], ignore_index=True)
csv_path = 'data/tidy_data_2_aed_model.csv'
tar_path = 'data/tidy_data_2_aed_model.tar.xz'

# Save CSV
df_combined.to_csv(csv_path, index=False)

# Compress to tar.xz
with tarfile.open(tar_path, "w:xz") as tar:
    tar.add(csv_path, arcname=os.path.basename(csv_path))

# Remove the original CSV file
os.remove(csv_path)
display(df_combined.head(2))
display(df_combined.tail(2))

Unnamed: 0,source,item,answer,r,model,prompt_type,model_answer,hit,model_alternative_answer,hit_alternative,...,justif_claude_35_haiku,grade_ds_v3,justif_ds_v3,grade_gpt_41_nano,justif_gpt_41_nano,grade_gpt_4o_mini,justif_gpt_4o_mini,grade_grok_3_mini_beta,justif_grok_3_mini_beta,criterion
0,ARC,Question: Which of the following traits is mos...,A,0,model_gpt_41_nano,cot,A,1,A,1,...,"The reasoning demonstrates a systematic, multi...",8.0,"The reasoning is sound and structured, with a ...",9.0,The reasoning demonstrates a multi-step and la...,9.0,"The reasoning demonstrates multi-step, layered...",9.5,The extracted chain of thought exhibits multi-...,depth of reasoning
1,ARC,Question: Which of the following traits is mos...,A,1,model_gpt_41_nano,cot,A,1,A,1,...,"The reasoning demonstrates a structured, multi...",9.0,The extracted chain of thought demonstrates a ...,9.0,"The reasoning demonstrates a multi-step, layer...",9.0,"The reasoning exhibits multi-step, layered inf...",9.0,The extracted chain of thought demonstrates mu...,depth of reasoning


Unnamed: 0,source,item,answer,r,model,prompt_type,model_answer,hit,model_alternative_answer,hit_alternative,...,justif_claude_35_haiku,grade_ds_v3,justif_ds_v3,grade_gpt_41_nano,justif_gpt_41_nano,grade_gpt_4o_mini,justif_gpt_4o_mini,grade_grok_3_mini_beta,justif_grok_3_mini_beta,criterion
20248,ARC,Question: Which two systems primarily function...,D,2,model_ds_v3,adversarial,E,0,E,0,...,The response demonstrates strong analytical re...,7.5,Strong semantic transformation with novel stru...,2.5,The response closely mirrors the input's struc...,5.0,The response retains significant elements from...,8.0,Strong semantic transformation with structured...,originality
20249,ARC,Question: Which two systems primarily function...,D,4,model_ds_v3,adversarial,E,0,E,0,...,The response demonstrates strong originality t...,8.5,The response demonstrates strong semantic tran...,2.0,The response mainly mirrors the input's struct...,6.5,The extracted content demonstrates some degree...,7.0,The response features structured step-by-step ...,originality


# **Classical Metrics Report - Traditional Approach**

In [10]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import scikit_posthocs as sp
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from IPython.display import display

# Load the dataset
df = pd.read_parquet('data/tidy_data_2_aed_model.parquet')

# --- Overall metrics ---
y_true = df['answer'].to_numpy()
y_pred = df['model_answer'].to_numpy()
valid = ~pd.isna(y_true) & ~pd.isna(y_pred)
y_true = y_true[valid]
y_pred = y_pred[valid]

overall_metrics = {
    "Accuracy": [round(accuracy_score(y_true, y_pred), 4)],
    "Precision": [round(precision_score(y_true, y_pred, average='weighted', zero_division=0), 4)],
    "Recall": [round(recall_score(y_true, y_pred, average='weighted', zero_division=0), 4)],
    "F1-Score": [round(f1_score(y_true, y_pred, average='weighted', zero_division=0), 4)]
}

print("ðŸ”¹ Overall Metrics:")
display(pd.DataFrame(overall_metrics))

# --- Confusion matrix ---
labels_all = sorted(list(set(y_true) | set(y_pred)))
cm = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels_all), index=labels_all, columns=labels_all)
print("ðŸ”¹ Overall Confusion Matrix:")
display(cm)

# --- Aggregated metrics by model ---
print("ðŸ”¹ Aggregated Metrics by Model:")

model_metrics = []
model_names = df['model'].unique()

for model in model_names:
    model_data = df[df['model'] == model]
    yt = model_data['answer'].to_numpy()
    yp = model_data['model_answer'].to_numpy()
    valid = ~pd.isna(yt) & ~pd.isna(yp)
    yt = yt[valid]
    yp = yp[valid]

    if len(yt) == 0:
        continue

    report = classification_report(yt, yp, output_dict=True, zero_division=0)

    row = {
        "Model": model,
        "Accuracy": round(report.get("accuracy", np.nan), 4),
        "Macro_Precision": round(report["macro avg"]["precision"], 4),
        "Macro_Recall": round(report["macro avg"]["recall"], 4),
        "Macro_F1": round(report["macro avg"]["f1-score"], 4),
        "Weighted_Precision": round(report["weighted avg"]["precision"], 4),
        "Weighted_Recall": round(report["weighted avg"]["recall"], 4),
        "Weighted_F1": round(report["weighted avg"]["f1-score"], 4)
    }

    model_metrics.append(row)

df_summary = pd.DataFrame(model_metrics)
display(df_summary)

# --- F1 matrix per class ---
print("ðŸ”¹ Class-level F1-Score Matrix:")

f1_matrix = []

for label in labels_all:
    f1_row = []
    for model in model_names:
        data = df[df['model'] == model]
        yt = data['answer'].to_numpy()
        yp = data['model_answer'].to_numpy()
        valid = ~pd.isna(yt) & ~pd.isna(yp)
        yt = yt[valid]
        yp = yp[valid]

        if len(yt) == 0:
            f1_row.append(np.nan)
            continue

        report = classification_report(yt, yp, output_dict=True, zero_division=0)
        f1 = report.get(label, {}).get('f1-score', 0.0)
        f1_row.append(round(f1, 4))

    f1_matrix.append(f1_row)

f1_scores = np.array(f1_matrix)
df_f1_matrix = pd.DataFrame(f1_scores, index=labels_all, columns=model_names)
display(df_f1_matrix)

# --- Filter valid models ---
f1_scores_clean = np.array(f1_scores)
valid_columns = ~np.isnan(f1_scores_clean).any(axis=0)
f1_scores_clean = f1_scores_clean[:, valid_columns]
valid_model_names = np.array(model_names)[valid_columns]

# --- Friedman test ---
print("ðŸ”¹ Friedman Test on Class-level F1-Scores:")
stat, p = stats.friedmanchisquare(*f1_scores_clean)
print(f"  Ï‡Â² = {stat:.3f}, p = {p:.4f}")

# --- Nemenyi post-hoc test ---
if p < 0.05:
    print("ðŸ”¹ Nemenyi Post-hoc Test (p < 0.05):")
    nemenyi_result = sp.posthoc_nemenyi_friedman(f1_scores_clean)
    display(nemenyi_result)

# --- Dunn's Test with Benjamini-Hochberg correction ---
if p < 0.05:
    print("ðŸ”¹ Dunn's Test with Benjamini-Hochberg correction:")
    try:
        dunn_result = sp.posthoc_dunn(f1_scores_clean, p_adjust='fdr_bh')  # Updated method
        display(dunn_result)
    except ValueError as e:
        print(f"Error in Dunn's Test: {e}")

# --- Holm-Bonferroni method ---
if p < 0.05:
    print("ðŸ”¹ Holm-Bonferroni method:")
    try:
        holm_result = sp.posthoc_dunn(f1_scores_clean, p_adjust='holm')  # Use Dunn's test with Holm adjustment
        display(holm_result)
    except ValueError as e:
        print(f"Error in Holm-Bonferroni method: {e}")

print(model_names)


ðŸ”¹ Overall Metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
0,0.8251,0.86,0.8251,0.8417


ðŸ”¹ Overall Confusion Matrix:


Unnamed: 0,Unnamed: 1,125,A,B,C,D,E
,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0
A,1,0,1872,130,31,179,112
B,0,5,183,2201,103,85,48
C,1,0,207,74,2333,58,102
D,1,0,183,69,89,1948,110
E,0,0,0,0,0,0,0


ðŸ”¹ Aggregated Metrics by Model:


Unnamed: 0,Model,Accuracy,Macro_Precision,Macro_Recall,Macro_F1,Weighted_Precision,Weighted_Recall,Weighted_F1
0,model_gpt_41_nano,0.8163,0.5787,0.5426,0.5598,0.8709,0.8163,0.8424
1,model_ds_v3,0.8454,0.7131,0.6749,0.6934,0.8938,0.8454,0.8689
2,model_gpt_4o_mini,0.7323,0.4391,0.4186,0.4254,0.7736,0.7323,0.7472
3,model_grok_3_mini_beta,0.8933,0.7322,0.7144,0.7226,0.9185,0.8933,0.905
4,model_claude_35_haiku,0.838,0.6779,0.6696,0.6736,0.8501,0.838,0.8438


ðŸ”¹ Class-level F1-Score Matrix:


Unnamed: 0,model_gpt_41_nano,model_ds_v3,model_gpt_4o_mini,model_grok_3_mini_beta,model_claude_35_haiku
,0.0,0.0,0.0,0.0,0.0
125,0.0,0.0,0.0,0.0,0.0
A,0.7922,0.8141,0.6961,0.8443,0.7911
B,0.8661,0.8948,0.7783,0.9176,0.8602
C,0.8862,0.8969,0.7868,0.932,0.8706
D,0.8146,0.8614,0.7169,0.919,0.8462
E,0.0,0.0,0.0,0.0,0.0


ðŸ”¹ Friedman Test on Class-level F1-Scores:
  Ï‡Â² = 29.631, p = 0.0000
ðŸ”¹ Nemenyi Post-hoc Test (p < 0.05):


Unnamed: 0,0,1,2,3,4
0,1.0,0.916448,0.761079,0.548529,0.997194
1,0.916448,1.0,0.252294,0.961612,0.761079
2,0.761079,0.252294,1.0,0.053284,0.916448
3,0.548529,0.961612,0.053284,1.0,0.339541
4,0.997194,0.761079,0.916448,0.339541,1.0


ðŸ”¹ Dunn's Test with Benjamini-Hochberg correction:


Unnamed: 0,1,2,3,4,5,6,7
1,1.0,1.0,0.07497,0.007098,0.005159,0.013296,1.0
2,1.0,1.0,0.07497,0.007098,0.005159,0.013296,1.0
3,0.07497,0.07497,1.0,0.433135,0.285857,0.643544,0.07497
4,0.007098,0.007098,0.433135,1.0,0.901056,0.901056,0.007098
5,0.005159,0.005159,0.285857,0.901056,1.0,0.710503,0.005159
6,0.013296,0.013296,0.643544,0.901056,0.710503,1.0,0.013296
7,1.0,1.0,0.07497,0.007098,0.005159,0.013296,1.0


ðŸ”¹ Holm-Bonferroni method:


Unnamed: 0,1,2,3,4,5,6,7
1,1.0,1.0,0.514081,0.036502,0.015476,0.085476,1.0
2,1.0,1.0,0.514081,0.036502,0.015476,0.085476,1.0
3,0.514081,0.514081,1.0,1.0,1.0,1.0,0.514081
4,0.036502,0.036502,1.0,1.0,1.0,1.0,0.036502
5,0.015476,0.015476,1.0,1.0,1.0,1.0,0.015476
6,0.085476,0.085476,1.0,1.0,1.0,1.0,0.085476
7,1.0,1.0,0.514081,0.036502,0.015476,0.085476,1.0


['model_gpt_41_nano' 'model_ds_v3' 'model_gpt_4o_mini'
 'model_grok_3_mini_beta' 'model_claude_35_haiku']
