In [1]:
import json
import numpy as np

from utils.analysis import save_pr_curves_plotly, save_roc_curves_plotly, remove_nan, get_roc_metrics, get_precision_recall_metrics, analyze

## Helper functions - Gather results for a few different seeds (average the results)

In [2]:
def build_result_files_with_seed(baseline_model, seed_path, weights, dataset_name, method, scores_folder_name, augmentations, human_augmentations):
    """Build the score-file paths for a single seed directory.

    All paths share the layout
    ``{baseline_model}{seed_path}/{scores_folder_name}{aug}/{method}{weights}/<file>``,
    where ``aug`` is ``augmentations`` for the AI-generated files and
    ``human_augmentations`` for the human file. No file is opened here; the
    function only formats strings.

    Args:
        baseline_model: Prefix of the run directory; also appended to every
            key of the returned AIGC mapping.
        seed_path: Suffix of the run directory (may be an empty string).
        weights: Suffix appended directly after ``method`` in the path.
        dataset_name: Prefix of every score file name.
        method: Name of the directory that follows the scores folder.
        scores_folder_name: Name of the scores folder inside the run directory.
        augmentations: Suffix of the scores folder for the AIGC files.
        human_augmentations: Suffix of the scores folder for the human file.

    Returns:
        ``(aigc, human)`` where ``aigc`` maps ``'<label>_<baseline_model>'`` to
        the path of that generator's score file, and ``human`` is the path of
        the human score file.
    """
    run_dir = f'{baseline_model}{seed_path}'
    method_dir = f'{method}{weights}'
    aigc_dir = f'{run_dir}/{scores_folder_name}{augmentations}/{method_dir}'
    human_dir = f'{run_dir}/{scores_folder_name}{human_augmentations}/{method_dir}'

    # (key label, tag used inside the score file name), in the order the keys
    # must appear in the returned mapping.
    generators = (
        ('claude3', 'claude3-haiku'),
        ('codellama7b', 'codellama_CodeLlama-7b-Instruct-hf'),
        ('codegemma7b', 'google_codegemma-7b-it'),
        ('gpt3.5', 'gpt-3.5-turbo'),
        ('gpt4o_mini', 'gpt-4o-mini'),
        ('codellama13b', 'codellama_CodeLlama-13b-Instruct-hf'),
        ('starchat', 'HuggingFaceH4_starchat-alpha'),
    )
    aigc = {
        f'{label}_{baseline_model}': f'{aigc_dir}/{dataset_name}_{file_tag}_scores.jsonl'
        for label, file_tag in generators
    }

    # Any dataset whose name contains 'deepmind_code_contests' (e.g. the
    # deepmind_code_contests_cpp / deepmind_code_contests_java variants) reads
    # the same human file, named with the bare 'deepmind_code_contests' prefix.
    if 'deepmind_code_contests' in dataset_name:
        human_prefix = 'deepmind_code_contests'
    else:
        human_prefix = dataset_name
    human = f'{human_dir}/{human_prefix}_human_scores.jsonl'

    return aigc, human

def _load_mean_scores(paths, score_field):
    """Read JSONL score files and average `score_field` per record key.

    Every non-blank line must be a JSON object; its first key is used as the
    record key (presumably a sample identifier - confirm against the score
    writer) and ``record[key][score_field]`` as the score. Scores with the same
    key, from any of the files, are averaged with ``np.mean``. A key that is
    missing from some files is averaged over the files that do contain it.

    Args:
        paths: Iterable of JSONL file paths.
        score_field: Name of the score entry to read from each record.

    Returns:
        Dict mapping record key to mean score, in order of first appearance.
    """
    scores_by_key = {}
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside json.loads.
                if not line.strip():
                    continue
                record = json.loads(line)
                key = list(record)[0]
                scores_by_key.setdefault(key, []).append(record[key][score_field])
    return {key: np.mean(values) for key, values in scores_by_key.items()}


def calculate_ensemble_auroc(baseline_model, dataset_name, weights, seeds, method='entropy', scores_folder_name='scores_infer_task', augmentations='', human_augmentations='', plot=False):
    """Average per-record scores over several seeds and compute ROC AUC per generator.

    For every seed, the score-file paths are obtained from
    ``build_result_files_with_seed``. Human and AI-generated scores are then
    averaged per record key across the seeds, and ``analyze`` is called once per
    generator with the averaged human scores and that generator's averaged
    scores.

    Note that the two score lists handed to ``analyze`` are plain value lists in
    insertion order; they are not aligned by record key.

    Args:
        baseline_model: Run-directory prefix, forwarded to the path builder.
        dataset_name: Dataset prefix of the score files.
        weights: Path suffix appended after ``method``.
        seeds: Iterable of seeds. ``None`` selects the run directory without a
            ``_seed_<seed>`` suffix.
        method: Directory name following the scores folder.
        scores_folder_name: Name of the scores folder.
        augmentations: Scores-folder suffix for the AI-generated files.
        human_augmentations: Scores-folder suffix for the human file.
        plot: When true, pass all ``analyze`` outputs to
            ``save_roc_curves_plotly`` and ``save_pr_curves_plotly``.

    Returns:
        Dict mapping each generator key to
        ``{'roc_auc': ..., 'num_nans': ...}``, both taken from the output of
        ``analyze``.

    Raises:
        OSError: If a score file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Collect, per generator, one score file per seed (plus one human file per seed).
    aigcs_paths = {}
    humans_paths = []
    for seed in seeds:
        seed_path = '' if seed is None else f'_seed_{seed}'
        aigc, human = build_result_files_with_seed(baseline_model, seed_path, weights, dataset_name, method, scores_folder_name, augmentations, human_augmentations)
        for exp_name, path in aigc.items():
            aigcs_paths.setdefault(exp_name, []).append(path)
        humans_paths.append(human)

    # Seed-averaged scores: human files first, then each generator's files.
    human_scores = _load_mean_scores(humans_paths, 'human_score')
    aigcs_scores = {
        exp_name: _load_mean_scores(paths, 'aigc_score')
        for exp_name, paths in aigcs_paths.items()
    }

    # Compute final AUROC
    raw_outputs = []
    final_scores = {}
    for exp_name, exp_scores in aigcs_scores.items():
        # A fresh human list is built for every call so that `analyze` never
        # receives a list object that a previous call has already seen.
        output = analyze(list(human_scores.values()), list(exp_scores.values()), exp_name)
        raw_outputs.append(output)
        final_scores[exp_name] = {'roc_auc': output['metrics']['roc_auc'], 'num_nans': output['num_nans']}

    if plot:
        save_roc_curves_plotly(raw_outputs)
        save_pr_curves_plotly(raw_outputs)

    return final_scores

## Calculate AUROC on the example score files

In [3]:
# Example score files are under
# "results/codellama13b_seed_*/scores_atc/entropy/comments_0_docstrings_0/*_scores.jsonl"

# One score directory ("<baseline_model>_seed_<seed>") is read per entry.
seeds = [142, 242, 342, 442]

# Appended directly after the method name when paths are built, so the leading
# slash turns it into a sub-directory of the method folder.
weights = "/comments_0_docstrings_0"

# File-name prefix of the score files to evaluate.
dataset_name = "google-research-datasets_mbpp"

In [None]:
import pandas as pd

# Evaluate the ensemble on growing prefixes of `seeds` (first seed only, first
# two, ..., all of them). The number of rows follows len(seeds) instead of
# being spelled out call by call.
df_data = []  # one record per (baseline, generator) with its ROC AUC
nans = []     # matching NaN counts reported by calculate_ensemble_auroc
for num_seeds in range(1, len(seeds) + 1):
    plural = '' if num_seeds == 1 else 's'
    baseline_name = f'ATC - {num_seeds} Seed{plural}'
    outputs = calculate_ensemble_auroc(
        baseline_model='codellama13b',
        dataset_name=dataset_name,
        weights=weights,
        seeds=seeds[:num_seeds],
        method='entropy',
        scores_folder_name='scores_atc',
        augmentations='',
        human_augmentations='',
    )
    for exp_name, scores in outputs.items():
        df_data.append({'baseline': baseline_name, 'model': exp_name, 'ROCAUC': scores['roc_auc']})
        nans.append({'baseline': baseline_name, 'num_nans': scores['num_nans'], 'model': exp_name})

# Rows: number of seeds in the ensemble; columns: generator; values: ROC AUC.
result_df = pd.DataFrame(df_data).pivot(index='baseline', columns='model', values='ROCAUC')
result_df['avg'] = result_df.mean(axis=1)

# Shown as percentages rounded to two decimals.
result_df.mul(100).round(2)

## Plot ROC/PR Curves

In [None]:
# Same evaluation over every seed in `seeds`, with plot=True so that the
# analysis outputs are also passed to save_roc_curves_plotly and
# save_pr_curves_plotly. The returned scores are the cell's output.
calculate_ensemble_auroc(
    baseline_model='codellama13b',
    dataset_name=dataset_name,
    weights=weights,
    seeds=seeds,
    method='entropy',
    scores_folder_name='scores_atc',
    augmentations='',
    human_augmentations='',
    plot=True,
)