In [1]:
%load_ext autoreload
%autoreload 2

In [130]:
from nanoeval.metrics.standard import _bootstrap_distribution, pass_at_k
import pandas as pd
import numpy as np

# Assuming the _bootstrap_distribution function is already defined as in previous responses.

# Step 1: Define the list of instances
base_instances = ['a', 'b', 'c', 'd', 'e']
# Create 25 variants of each base instance
instances = [f'{inst}_{i}' for inst in base_instances for i in range(25)]

# Initialize lists to hold data for samples_df and answer_group_correctness_df
samples_data = []
answer_group_correctness_data = []

# Set the number of attempts per instance and number of answer groups per instance
num_attempts_per_instance = 100
num_answer_groups_per_instance = 10

# Step 2 to 4: Iterate over each instance to generate attempts and answer groups
for instance_name in instances:
    # Step 2: Sample a pass rate for the instance
    # Using a bimodal beta distribution as in the original code
    if np.random.rand() > 0.5:
        pass_rate = np.random.beta(1000, 10)
    else:
        pass_rate = np.random.beta(10, 1000)
    
    # Step 3: Generate correctness for each attempt based on the pass rate
    successes = np.random.binomial(1, pass_rate, size=num_attempts_per_instance)
    
    # Step 4: Assign each attempt to an answer group
    # We'll assign attempts to answer groups in a round-robin fashion
    for attempt_index in range(num_attempts_per_instance):
        answer_group_id = f'answer_group_{attempt_index % num_answer_groups_per_instance}'
        attempt_name = f'attempt_{attempt_index + 1}'
        is_correct = bool(successes[attempt_index])
        
        # Append attempt data to samples_data
        samples_data.append({
            'instance': instance_name,
            'attempt': attempt_name,
            'answer_group_id': answer_group_id
        })
        
        # Store correctness information temporarily in a dict to process later
        # We'll create a key based on instance and answer_group_id
        key = (instance_name, answer_group_id)
        if key not in answer_group_correctness_data:
            # Initialize with False; will update to True if any attempt is correct
            answer_group_correctness_data.append({
                'instance': instance_name,
                'answer_group_id': answer_group_id,
                'is_correct': is_correct
            })
        else:
            # If any attempt in the group is correct, set is_correct to True
            if is_correct:
                # Find the existing entry and set is_correct to True
                for entry in answer_group_correctness_data:
                    if entry['instance'] == instance_name and entry['answer_group_id'] == answer_group_id:
                        entry['is_correct'] = True
                        break

# Convert lists to DataFrames
samples_df = pd.DataFrame(samples_data)
answer_group_correctness_df = pd.DataFrame(answer_group_correctness_data)

# Step 7: Call the _bootstrap_distribution function
# We'll set k=1 and n_trials=1000 as in the original example
bootstrap_results = _bootstrap_distribution(
    samples_df=samples_df,
    answer_group_correctness_df=answer_group_correctness_df,
    metric_fn=pass_at_k,
    k=1,
    n_trials=1000,
    bootstrap_over_instances=False  # or True, depending on the desired behavior
)

# Print the bootstrap results
print(bootstrap_results)

Bootstrap (<function pass_at_k at 0x17b885c60>@1): 100%|██████████| 1000/1000 [00:36<00:00, 27.16it/s]

{2.5: 0.488, 16: 0.504, 50: 0.52, 84: 0.536, 97.5: 0.552}





In [102]:
import pandas as pd

# Assuming samples_df and answer_group_correctness_df are already defined

# Step 1: Inspect unique values
print("Unique values in 'samples_df['answer_group_id']':", samples_df['answer_group_id'].unique())
print("Unique values in 'answer_group_correctness_df['answer_group_id']':", answer_group_correctness_df['answer_group_id'].unique())

# Step 2: Check underlying types
print("Types in 'samples_df['answer_group_id']':", samples_df['answer_group_id'].apply(type).unique())
print("Types in 'answer_group_correctness_df['answer_group_id']':", answer_group_correctness_df['answer_group_id'].apply(type).unique())

# Step 3: Standardize data types to strings
samples_df['answer_group_id'] = samples_df['answer_group_id'].astype(str)
answer_group_correctness_df['answer_group_id'] = answer_group_correctness_df['answer_group_id'].astype(str)

# Step 4: Verify consistency
print("Post-conversion types in 'samples_df['answer_group_id']':", samples_df['answer_group_id'].apply(type).unique())
print("Post-conversion types in 'answer_group_correctness_df['answer_group_id']':", answer_group_correctness_df['answer_group_id'].apply(type).unique())

# Step 5: Perform merge
merged_df = pd.merge(
    samples_df,
    answer_group_correctness_df,
    on=['instance', 'answer_group_id'],
    how='left'
)

print("Merge successful. Merged DataFrame info:")
print(merged_df.info())

Unique values in 'samples_df['answer_group_id']': ['answer_group_0' 'answer_group_1' 'answer_group_2' 'answer_group_3'
 'answer_group_4' 'answer_group_5' 'answer_group_6' 'answer_group_7'
 'answer_group_8' 'answer_group_9']
Unique values in 'answer_group_correctness_df['answer_group_id']': ['answer_group_0' 'answer_group_1' 'answer_group_2' 'answer_group_3'
 'answer_group_4' 'answer_group_5' 'answer_group_6' 'answer_group_7'
 'answer_group_8' 'answer_group_9']
Types in 'samples_df['answer_group_id']': [<class 'str'>]
Types in 'answer_group_correctness_df['answer_group_id']': [<class 'str'>]
Post-conversion types in 'samples_df['answer_group_id']': [<class 'str'>]
Post-conversion types in 'answer_group_correctness_df['answer_group_id']': [<class 'str'>]
Merge successful. Merged DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   -