In [None]:
import pandas as pd

# Load the three benchmark files. Every path is relative to the notebook's working
# directory (like the ARC and MMLU paths), so the notebook is not tied to one machine.
db_arc = pd.read_parquet("data/ARC_test/test-00000-of-00001.parquet")
db_mmlu = pd.read_parquet("data/MMLU_test/test-00000-of-00001.parquet")
# The HellaSwag file is JSON Lines (one record per line), hence lines=True.
db_hellaswag = pd.read_json("data/hellaswag_test/hellaswag_val.jsonl", lines=True)


In [None]:
# Show the first ARC rows, then print the number of non-missing values in each column.
display(db_arc.head())
print(db_arc.count())

In [None]:
# Show the first MMLU rows, then print the number of non-missing values in each column.
display(db_mmlu.head())
print(db_mmlu.count())

In [None]:
# Show the first HellaSwag rows, then print the number of non-missing values in each column.
display(db_hellaswag.head())
print(db_hellaswag.count())

In [None]:
# Take a copy of each loaded frame; the formatting steps below work on these copies.
db_arc_tmp, db_mmlu_tmp, db_hellaswag_tmp = (
    loaded.copy() for loaded in (db_arc, db_mmlu, db_hellaswag)
)

In [None]:
def create_arc_final(db_arc_tmp):
    """
    Build the prompt-ready ARC table: one formatted item and one answer letter per row.

    Options are always displayed with the letters A, B, C, ... in their original
    order. The answer key is therefore translated to the letter of the matching
    option whenever the row carries its own option labels; otherwise a key that
    is not a letter (e.g. "3") would match none of the options shown in the item.
    NOTE(review): ARC is reported to contain items labelled with digits instead
    of letters - confirm against the loaded data.

    Args:
        db_arc_tmp (pd.DataFrame): Frame with the columns 'question', 'choices'
            (a mapping with a 'text' sequence and, optionally, a parallel
            'label' sequence) and 'answerKey'.

    Returns:
        pd.DataFrame: Columns 'item' and 'answer', indexed like the input. An
        empty input yields an empty frame that still has both columns.
    """
    def format_arc_question(row):
        # Letter the options from "A" (chr(65)) in the order they are stored.
        question = row['question']
        choices = row['choices']['text']
        choices_str = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices))
        return f"Question: {question}\nChoices:\n{choices_str}"

    def answer_letter(row):
        # Map the answer key to the displayed letter through its position in the labels.
        answer_key = row['answerKey']
        choices = row['choices']
        labels = choices['label'] if 'label' in choices else None
        if labels is None:
            return answer_key
        labels = [str(label) for label in labels]
        key = str(answer_key)
        if key in labels:
            return chr(65 + labels.index(key))
        # Unknown key: keep it untouched rather than guessing.
        return answer_key

    rows = [row for _, row in db_arc_tmp.iterrows()]
    # Building the frame in one go keeps the 'item'/'answer' columns even for empty input.
    arc_final = pd.DataFrame(
        {
            'item': [format_arc_question(row) for row in rows],
            'answer': [answer_letter(row) for row in rows],
        },
        index=db_arc_tmp.index,
        columns=['item', 'answer'],
    )
    return arc_final

# Format the ARC working copy into 'item'/'answer' rows and show the result.
arc_final = create_arc_final(db_arc_tmp)
display(arc_final)

In [None]:
def create_mmlu_final(db_mmlu_tmp):
    """
    Turn raw MMLU rows into prompt-ready rows.

    The options of each row are lettered from "A" in their stored order, and the
    integer found in 'answer' is converted to the letter at that position.

    Args:
        db_mmlu_tmp (pd.DataFrame): Frame with the columns 'question',
            'choices' (a sequence of option texts) and 'answer'.

    Returns:
        pd.DataFrame: Columns 'item' (question followed by the lettered
        options) and 'answer' (a single letter).
    """
    def to_item_and_answer(record):
        question_text = record['question']
        option_lines = []
        for position, option in enumerate(record['choices']):
            # chr(65) is "A", so position 0 -> A, 1 -> B, ...
            option_lines.append(f"{chr(65 + position)}. {option}")
        options_block = "\n".join(option_lines)
        item_text = f"Question: {question_text}\nChoices:\n{options_block}"
        answer_letter = chr(65 + int(record['answer']))
        return pd.Series([item_text, answer_letter], index=['item', 'answer'])

    return db_mmlu_tmp.apply(to_item_and_answer, axis=1)

# Format the MMLU working copy into 'item'/'answer' rows and show the result.
mmlu_final = create_mmlu_final(db_mmlu_tmp)
display(mmlu_final)


### HellaSwag

*   **ind**: dataset ID
*   **activity\_label**: The ActivityNet or WikiHow label for this example
*   **context**: There are two formats. The full context is in `ctx`. When the context ends in an (incomplete) noun phrase, like for ActivityNet, this incomplete noun phrase is in `ctx_b`, and the context up until then is in `ctx_a`. This can be useful for models such as BERT that need the last sentence to be complete. However, it's never required. If `ctx_b` is nonempty, then `ctx` is the same thing as `ctx_a`, followed by a space, then `ctx_b`.
*   **endings**: a list of 4 endings. The correct index is given by `label` (0,1,2, or 3)
*   **split**: train, val, or test.
*   **split\_type**: `indomain` if the activity label is seen during training, else `zeroshot`
*   **source\_id**: Which video or WikiHow article this example came from

In [None]:
def create_hellaswag_prompt_simple(db_hellaswag_tmp):
    """
    Turn raw HellaSwag rows into prompt-ready rows.

    The item is the full context ('ctx') followed by the candidate endings,
    lettered from "A" in their stored order. The integer in 'label' is
    converted to the letter at that position.

    Args:
        db_hellaswag_tmp (pd.DataFrame): Frame with the columns 'ctx',
            'endings' (a sequence of ending texts) and 'label'.

    Returns:
        pd.DataFrame: Columns 'item' and 'answer' (a single letter).
    """
    def build_record(sample):
        context = sample['ctx']
        lettered_endings = (
            f"{chr(65 + idx)}. {text}" for idx, text in enumerate(sample['endings'])
        )
        options_block = "\n".join(lettered_endings)
        item_text = f"Context: {context}\nOptions:\n{options_block}"
        # chr(65) is "A": label 0 -> A, 1 -> B, ...
        answer_letter = chr(65 + int(sample['label']))
        return pd.Series([item_text, answer_letter], index=['item', 'answer'])

    return db_hellaswag_tmp.apply(build_record, axis=1)


# Format the HellaSwag working copy into 'item'/'answer' rows and show the result.
hellaswag_final = create_hellaswag_prompt_simple(db_hellaswag_tmp)
display(hellaswag_final)


In [None]:
# Tag each table with the benchmark it came from and put the columns in a common order.
mmlu_final = mmlu_final.assign(source='MMLU')[['source', 'item', 'answer']]
hellaswag_final = hellaswag_final.assign(source='HellaSwag')[['source', 'item', 'answer']]
arc_final = arc_final.assign(source='ARC')[['source', 'item', 'answer']]

# Stack the three tables under a fresh 0..n-1 index and persist the result.
all_final = pd.concat(
    [mmlu_final, hellaswag_final, arc_final],
    ignore_index=True,
)
all_final.to_csv('data/all_sources.csv', index=False)

display(all_final.head())
print(all_final.describe())


In [None]:
# First estimate of the sample size: n = items x datasets, drawn in (as near as
# possible) equal numbers from every 'source' category.
import numpy as np

np.random.seed(42)
sample_n = 45 * 3  # n = items x datasets

# Report how much each source contributes, in absolute and in relative terms.
source_counts = all_final['source'].value_counts()
source_proportions = all_final['source'].value_counts(normalize=True)
print("Source counts:\n", source_counts)
print("\nSource proportions:\n", source_proportions)

# Sources in order of first appearance; this order decides who receives the remainder.
sources = all_final['source'].unique()
n_sources = len(sources)

# Even share per source, plus the leftover that cannot be split evenly.
samples_per_source, remainder = divmod(sample_n, n_sources)

# Draw each source's share; the first `remainder` sources get one extra row.
sampled_dfs = []
for i, source in enumerate(sources):
    df_source = all_final.loc[all_final['source'] == source]
    # Never ask for more rows than the source actually has.
    n_samples = min(samples_per_source + int(i < remainder), len(df_source))
    sampled_dfs.append(df_source.sample(n=n_samples, random_state=42))

# Shuffle the combined draw so the sources are interleaved, then renumber the rows.
sample_all_final = (
    pd.concat(sampled_dfs)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
sample_all_final.to_csv('data/sample.csv', index=False)
display(sample_all_final)
print(sample_all_final['source'].value_counts())
print(sample_all_final['source'].value_counts(normalize=True))


# **First Step**
## **Zero-Shot prompt engineering for Chain‐of‐Thought and JSON output for items**

In [None]:
# Load the sampled items from 'data/sample.csv'; `df` is the input of the prompting steps below.
df = pd.read_csv('data/sample.csv')

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
import google.generativeai as genai
import anthropic

# Location of the .env file holding the API keys. The DOTENV_PATH environment
# variable overrides the default, so the notebook can run on another machine
# without editing this cell.
dotenv_path = os.getenv(
    "DOTENV_PATH",
    "/mnt/4d4f90e5-f220-481e-8701-f0a546491c35/arquivos/projetos/.env",
)
# load_dotenv returns False when the file yields no variables (for example when
# it does not exist); report it here instead of failing later inside an API call.
if not load_dotenv(dotenv_path=dotenv_path):
    print(f"Warning: no environment variables were loaded from '{dotenv_path}'.")

# Access and store the environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")
claude_api_key = os.getenv("ANTHROPIC_API_KEY")
xai_api_key = os.getenv("XAI_API_KEY")
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")

# Name the keys that are missing before any client is built.
# NOTE(review): the OpenAI client appears to fall back to OPENAI_API_KEY when
# api_key is None, which would send that key to the x.ai / DeepSeek endpoints
# if their own keys are absent - confirm and consider failing hard here.
missing_api_keys = [
    env_name
    for env_name, env_value in (
        ("OPENAI_API_KEY", openai_api_key),
        ("ANTHROPIC_API_KEY", claude_api_key),
        ("XAI_API_KEY", xai_api_key),
        ("DEEPSEEK_API_KEY", deepseek_api_key),
    )
    if not env_value
]
if missing_api_keys:
    print(f"Warning: missing API keys: {', '.join(missing_api_keys)}")

# Config clients: Grok and DeepSeek are reached through the OpenAI client with a custom base URL.
client_gpt = OpenAI(api_key=openai_api_key)
client_claude = anthropic.Anthropic(api_key=claude_api_key)
client_grok = OpenAI(api_key=xai_api_key, base_url="https://api.x.ai/v1")
client_ds = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")


In [None]:
# Registry of the evaluated LLMs: notebook-level key -> API model id and the client serving it.
# Every model is expected to reply with JSON containing its step-by-step solution and its answer.
models = {
    registry_key: {'model': api_model, 'client': api_client}
    for registry_key, api_model, api_client in (
        ('model_gpt_4o_mini', 'gpt-4o-mini', client_gpt),
        ('model_gpt_41_nano', 'gpt-4.1-nano', client_gpt),
        ('model_claude_35_haiku', 'claude-3-5-haiku-latest', client_claude),
        ('model_grok_3_mini_beta', 'grok-3-mini-beta', client_grok),
        ('model_ds_v3', 'deepseek-chat', client_ds),
    )
}

def create_prompt_from_row(row, type):
    """
    Build the prompt sent to an LLM for one multiple-choice item.

    Every prompt style asks for a single JSON object as output; the styles
    differ in how much reasoning they request.

    Args:
        row: Mapping-like object (for example a DataFrame row) whose 'item'
            entry holds the formatted question and its options.
        type (str): Prompt style, one of:
            'cot'         - step-by-step reasoning, then the answer;
            'naive'       - the answer with a brief justification only;
            'adversarial' - step-by-step reasoning that questions its own
                            assumptions and may pick an extra option E.

    Returns:
        str: The complete prompt text.

    Raises:
        ValueError: If `type` is not one of the supported styles.
    """

    if type == 'cot':
        prompt = (
            "You are an expert at solving multiple-choice questions. "
            "Read the following question and its options carefully:\n"
            f"{row['item']}\n\n"
            "To arrive at the solution, break down the problem into smaller, manageable stages wherever possible. "
            "Enumerate and show your reasoning step by step (1., 2., ...), then select the answer you believe is correct (A, B, C or D).\n\n"
            "Your output must be a single valid JSON object with these fields (no extra text or markdown):\n"
            "```json\n"
            "{\n"
            '  "CoT": "<detailed reasoning here>",\n'
            '  "answer": "<A, B, C or D>",\n'
            '  "justification": "<brief explanation of your choice>"\n'
            "}\n"
            "```\n"
            "Ensure the JSON is parseable by a standard JSON parser (double quotes for keys, no trailing commas)."
        )
    elif type == 'naive':
        prompt = (
            "Read the following question and its options:\n"
            f"{row['item']}\n\n"
            "Select the answer you believe is correct (A, B, C or D).\n\n"
            "Your output must be a single valid JSON object with these fields (no extra text or markdown):\n"
            "```json\n"
            "{\n"
            '  "answer": "<A, B, C or D>",\n'
            '  "justification": "<brief explanation of your choice>"\n'
            "}\n"
            "```\n"
            "Ensure the JSON is parseable by a standard JSON parser (double quotes for keys, no trailing commas)."
        )
    elif type == 'adversarial':
        prompt = (
            "You are an expert at solving multiple-choice questions, but your reasoning might be flawed. "
            "Read the following question and its options carefully. Consider that some subtle aspects of the question might be designed to mislead you:\n\n"
            f"{row['item']}\n\n"
            "Also, be aware that there might be an alternative interpretation or a hidden assumption within the question that could change the correct answer.\n\n"
            "If you determine that none of A, B, C or D is fully correct, choose E and provide your own alternative answer.\n\n"
            "To arrive at the solution, meticulously examine each step of your reasoning and question every assumption you make. "
            "Enumerate and show your reasoning step by step (1., 2., ...), then select the answer you believe is correct (A, B, C, D or E).\n\n"
            "Your output must be a single valid JSON object with these fields (no extra text or markdown):\n"
            "```json\n"
            "{\n"
            '  "CoT": "<detailed reasoning here, questioning your own assumptions>",\n'
            '  "answer": "<A, B, C, D or E>",\n'
            '  "alternative_answer": "<text of your E option, or null if not used>",\n'
            '  "justification": "<brief explanation of your choice, highlighting potential doubts or uncertainties>"\n'
            "}\n"
            "```\n"
            "Ensure the JSON is parseable by a standard JSON parser (double quotes for keys, no trailing commas)."
        )
    else:
        # Without this branch an unknown style reached `return prompt` with the
        # variable unbound and failed with an unhelpful UnboundLocalError.
        raise ValueError(
            f"Unknown prompt type '{type}'. Expected one of: 'cot', 'naive', 'adversarial'."
        )

    return prompt

# Preview all three prompt styles for the i-th sampled row.
i = 0
for prompt_style in ('cot', 'naive', 'adversarial'):
    print(create_prompt_from_row(row=df.iloc[i], type=prompt_style)+'\n\n')


In [None]:
def step_one(model_name, prompt):
    """
    Send `prompt` to one configured LLM and return the text of its reply.

    The model id and its client are looked up in the module-level 'models'
    dictionary. The prompt is sent as a single user message, through the chat
    completions API for OpenAI clients and through the messages API (capped at
    1000 output tokens) for Anthropic clients.

    Args:
        model_name (str): Key of the model in the 'models' dictionary.
        prompt (str): The prompt to send to the model.

    Returns:
        str: The generated text from the model.

    Raises:
        ValueError: If model_name is not found in the 'models' dictionary,
                    or if the configured client is of an unsupported type.
    """
    model_config = models.get(model_name)
    if not model_config:
        raise ValueError(f"Model '{model_name}' not found in the 'models' dictionary.")

    client, model = model_config['client'], model_config['model']
    messages = [{"role": "user", "content": prompt}]

    if isinstance(client, OpenAI):
        completion = client.chat.completions.create(model=model, messages=messages)
        return completion.choices[0].message.content

    # Gemini support (genai.Client / generate_content) is currently disabled.
    if isinstance(client, anthropic.Anthropic):
        reply = client.messages.create(model=model, max_tokens=1000, messages=messages)
        return reply.content[0].text

    raise ValueError("Unsupported client type.")

In [None]:
# Rich display keeps the table readable for the long 'item' texts, matching the
# display(...) calls used for the other previews in this notebook.
display(df.head())

In [None]:
import pandas as pd
from tqdm import tqdm
import concurrent.futures

# Working copy of the sample: it is copied once per model and later rebound to the merged results.
df_cp = df.copy()
# Prompt styles run for every item; process_model passes each one to create_prompt_from_row.
prompt_types = ['cot', 'naive', 'adversarial']

def process_model(model_name, df_model):
    """
    Query one model with every prompt style for every item, five times each.

    A failed API call does not abort the run: the error is printed and that
    repetition is stored with output None, so the answers already collected
    for this model are kept. Configuration errors raised by step_one as
    ValueError (unknown model, unsupported client) still propagate, because
    every later call would fail in the same way.

    Args:
        model_name (str): Key of the model in the 'models' dictionary.
        df_model (pd.DataFrame): Items to process; needs the columns 'source'
            and 'item'.

    Returns:
        pd.DataFrame: One row per (prompt style, item, repetition) with the
        columns 'source', 'item', 'r', 'model', 'prompt_type' and 'output'.
    """
    n_repetitions = 5  # Repeat each call 5 times
    results = []
    print(model_name)
    for prompt_type in prompt_types:
        for i in tqdm(range(len(df_model)), desc=f"Processing {model_name} - {prompt_type}"):
            row = df_model.iloc[i]
            # The prompt depends only on the item and the style, so build it once per item.
            prompt = create_prompt_from_row(row=row, type=prompt_type)
            for r in range(n_repetitions):
                try:
                    output = step_one(model_name, prompt)
                except ValueError:
                    raise
                except Exception as e:
                    print(f"Error processing {model_name} ({prompt_type}, item {i}, repetition {r}): {e}")
                    output = None
                results.append({
                    'source': row['source'],
                    'item': row['item'],
                    'r': r,
                    'model': model_name,
                    'prompt_type': prompt_type,
                    'output': output
                })
    return pd.DataFrame(results)

# Give every model its own copy of the sample, so no two workers are handed the same frame.
model_dfs = {model_name: df_cp.copy() for model_name in models}

# Run one worker thread per model and collect whatever each model returns.
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
    futures = {
        executor.submit(process_model, model_name, model_dfs[model_name]): model_name
        for model_name in models
    }
    for future in concurrent.futures.as_completed(futures):
        model_name = futures[future]
        try:
            model_results = future.result()
        except Exception as e:
            # A failing model is reported and skipped; the other models are unaffected.
            print(f"Error processing {model_name}: {e}")
        else:
            results.append(model_results)

# Stack the per-model result tables into one.
results_df = pd.concat(results, ignore_index=True)

# Attach the model outputs to the sampled items; items without any output are kept (left join).
df_cp = pd.merge(
    df_cp,
    results_df,
    on=['source', 'item'],
    how='left',
)

# Persist the merged table.
df_cp.to_csv('data/res_step_one.csv', index=False)
print(df_cp.head())


In [None]:
# Concatenate results from all models
results_df = pd.concat(results, ignore_index=True)

# Merge the results onto the pristine sample `df`, not onto `df_cp`: once `df_cp`
# already holds the result columns, merging it with `results_df` again multiplies
# the rows and renames the columns to model_x / model_y, which breaks the later
# filter on df_cp['model']. Starting from `df` makes this cell safe to re-run.
df_cp = pd.merge(df, results_df, on=['source', 'item'], how='left')

# Save the updated dataframe
df_cp.to_csv('data/res_step_one.csv', index=False)
print(df_cp.head())


In [None]:
# Keep every row except those produced by 'model_deepseek_reasoner', then save the cleaned table.
df_cp = df_cp.loc[df_cp['model'].ne('model_deepseek_reasoner')]
df_cp.to_csv('data/res_step_one_clean.csv', index=False)
