In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm
import time
from datasets import load_dataset

from matplotlib import pyplot as plt
# Enable auto-reload for modules
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from wildbench_eval import compose_eval_item, ARGS, parse_result, placeholder_generation, api

In [3]:
# Load the test split of the "v2" configuration of the WildBench benchmark.
bench_data_full = load_dataset(
    path="allenai/WildBench",
    name="v2",
    split="test",
)

In [None]:
# Directory that holds the raw generation files of every run.
data_dir = r"output/reasoning_traces"

# All known runs: short run name -> generation file inside `data_dir`.
# Keeping them in one table replaces the former comment/uncomment selection,
# which made it easy to leave two runs (or none) active by accident.
MODEL_RESULT_FILES = {
    'llama-8B_withoutR': r"WildBench_DeepSeek-R1-Distill-Llama-8B_withoutR.jsonl",
    'llama-8B_withR': r"WildBench_DeepSeek-R1-Distill-Llama-8B_withR.jsonl",
    'qwen-1p5B_withoutR': r"WildBench_DeepSeek-R1-Distill-Qwen-1.5B_withoutR.jsonl",
    'qwen-1p5B_withR': r"WildBench_DeepSeek-R1-Distill-Qwen-1.5B_withR.jsonl",
    'qwen-7B_withoutR': r"WildBench_DeepSeek-R1-Distill-Qwen-7B_withoutR.jsonl",
    'qwen-7B_withR': r"WildBench_DeepSeek-R1-Distill-Qwen-7B_withR.jsonl",
    'llama-8B_withoutR_finished': r"WildBench_DeepSeek-R1-Distill-Llama-8B_withoutR_finished.jsonl",
    'qwen-1p5B_withoutR_finished': r"WildBench_DeepSeek-R1-Distill-Qwen-1.5B_withoutR_finished.jsonl",
    'qwen-7B_withoutR_finished': r"WildBench_DeepSeek-R1-Distill-Qwen-7B_withoutR_finished.jsonl",
}

# Select the run to evaluate by changing this single key.
model_name = 'llama-8B_withoutR_finished'
model_result_path = os.path.join(data_dir, MODEL_RESULT_FILES[model_name])

# Evaluation results are written next to the generation file.
eval_output_file = model_result_path.replace('.jsonl', '_eval.jsonl')

# Settings handed to the `wildbench_eval` helpers.
# NOTE(review): `end_idx=-1` / `max_words_to_eval=-1` presumably mean
# "no limit" -- confirm against `wildbench_eval`.
args = ARGS(
    mode="score",
    eval_template="wildbench_eval_template.score.v2.md",
    start_idx=0,
    end_idx=-1,
    model="gpt-4o-20240513",
    max_words_to_eval=-1,
    eval_output_file=eval_output_file,
    save_interval = 1
)

# Sampling parameters for the judge model; "prompt" is only a placeholder here.
openai_args = {
    "prompt": "TODO",
    "temperature": 0,
    "max_tokens": 1024,
    "stop": [],
}

# build batch requests

In [20]:
# Optional text file listing one session id per line; those sessions are
# excluded from evaluation. A missing file simply means "exclude nothing".
sensitive_id_fp = r"output/wildbench_sensitive_ids.txt"
if os.path.exists(sensitive_id_fp):
    with open(sensitive_id_fp, "r") as id_file:
        sensitive_ids = [raw_line.strip() for raw_line in id_file]
else:
    sensitive_ids = []
sensitive_ids

['4e291c89184a4817', '50e0d808f0a641c8']

In [21]:
# Markers emitted by the generator: end-of-sequence token and the tag that
# closes the reasoning section.
EOS_TOKEN = '<｜end▁of▁sentence｜>'
END_THINK_TAG = '</think>'

# Set for O(1) membership tests while streaming the result file.
sensitive_id_set = set(sensitive_ids)

# Collect one evaluation candidate per usable generation. A generation is
# kept only if it (a) finished (contains the EOS token), (b) contains exactly
# one closing think tag, and (c) is not flagged as sensitive.
target_model_data_full = []
# Explicit UTF-8: the EOS token is non-ASCII, so the platform default
# encoding must not decide how the file is decoded.
with open(model_result_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        record = json.loads(line)
        response = record['response']
        if EOS_TOKEN not in response:
            continue
        if response.count(END_THINK_TAG) != 1:
            continue
        if record['unique_id'] in sensitive_id_set:
            continue
        # Keep only the text after the reasoning section. The EOS token is
        # removed *before* stripping so whitespace preceding it is dropped too.
        final_answer = response.split(END_THINK_TAG)[-1].replace(EOS_TOKEN, '').strip()
        target_model_data_full.append({
            'session_id': record['unique_id'],
            'output': [final_answer],
            'generator': model_name,
        })
print(f"Total {len(target_model_data_full)} samples in {model_name} dataset")

Total 727 samples in qwen-7B_withoutR_finished dataset


In [22]:
# Deterministic ordering of the candidates by session id.
target_model_data = sorted(target_model_data_full, key=lambda x: x['session_id'])
print([e['session_id'] for e in target_model_data][:5])

# The two counts differ only if the result file contains duplicate session ids.
l_common_id = [e['session_id'] for e in target_model_data]
print('len of l_common_id:', len(l_common_id))
l_common_id = list(set(l_common_id))
print('len of l_common_id:', len(l_common_id))

# Restrict the benchmark to the sessions we have candidates for. A set makes
# each membership test O(1) instead of scanning the id list per row.
common_id_set = set(l_common_id)
bench_data = [e for e in bench_data_full if e['session_id'] in common_id_set]

# Index benchmark rows by session id; `setdefault` keeps the first row for an
# id, matching the former "first match" lookup.
bench_by_id = {}
for e in bench_data:
    bench_by_id.setdefault(e['session_id'], e)

# Fail early with the offending ids rather than with an opaque IndexError.
missing_ids = sorted(common_id_set - bench_by_id.keys())
if missing_ids:
    raise KeyError(f"session ids missing from the benchmark data: {missing_ids[:5]} (total {len(missing_ids)})")

# No reference model is used, so every candidate is paired with None.
ref_model_data = [None] * len(target_model_data)

# `compose_eval_item` is called with these lists as output accumulators.
histories = []
last_queries = []
checklists = []
for t, r in zip(target_model_data, ref_model_data):
    b = bench_by_id[t['session_id']]
    compose_eval_item(b, t, r, histories, last_queries, checklists)
print(f"len(target_model_data)={len(target_model_data)}")
print(f"len(ref_model_data)={len(ref_model_data)}")

candidates = list(target_model_data)
references = list(ref_model_data)
results = placeholder_generation(args, candidates, references, histories, last_queries, checklists)

['0023794913314551', '002bc5c909264c8c', '007d897c50e447de', '00c7916a072b4947', '00f46b5fca4c4801']
len of l_common_id: 727
len of l_common_id: 727
len(target_model_data)=727
len(ref_model_data)=727
Loaded the eval_template from wildbench_eval_template.score.v2.md
# examples in candidates: 727; We take 727 for evaluation.


In [23]:
# Build one batch-API request per eval item that does not yet have a usable
# result. Items that already carry a result are only re-parsed in place.
l_request = []
for item in tqdm(results, total=len(results), desc=f"Evaluating: {args.eval_output_file} "):
    already_scored = (
        item["result"] != "N/A"
        and item.get("error", "N/A") == "N/A"
        and "parsed_result" in item
    )
    if already_scored:
        # Redo the parsing of the stored judge output; no new request needed.
        item["parsed_result"] = parse_result(item["result"], eval_mode=args.mode)
        item["parsed"] = item["parsed_result"] is not None
        continue

    # Sampling parameters come from `openai_args` so they are defined in one
    # place; the remaining keys are specific to the batch request body.
    request = {
        "custom_id": f"run-{model_name}-{item['session_id']}",
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": "gpt-4o-20240513-batch",
            "messages": [
                {"role": "user", "content": item["prompt"]}
            ],
            "temperature": openai_args["temperature"],
            "max_tokens": openai_args["max_tokens"],
            "stop": openai_args["stop"],
            'top_p': 0.95,
            'frequency_penalty': 0,
            'presence_penalty': 0,
            'response_format': {"type": "json_object"},
        },
    }

    l_request.append(request)

Evaluating: /mnt/SAE-Reasoning/output/prompt_0_temp_0p6/WildBench_DeepSeek-R1-Distill-Qwen-7B_withoutR_finished_eval.jsonl : 100%|██████████| 727/727 [00:00<00:00, 582920.86it/s]


In [24]:
# Persist the batch requests as JSON Lines (one request object per line),
# next to the generation file.
input_file_path = model_result_path.replace(".jsonl", "_eval_requests.jsonl")

with open(input_file_path, "w") as request_file:
    request_file.writelines(json.dumps(request) + "\n" for request in l_request)

# process batch request results

In [13]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Stream the file instead of materializing every line first.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank lines (e.g. a trailing newline) are not records
            records.append(json.loads(line))
    return records

# Load the batch output that corresponds to the request file written earlier.
# NOTE: this rebinds `results`, which previously held the eval items.
batch_results_path = model_result_path.replace(".jsonl", "_eval_requests.results.jsonl")
results = load_jsonl(batch_results_path)
len(results)

729

In [14]:
# Convert every batch output line into an eval record. A failed request
# (no response body / no choices) is recorded with its error instead of
# aborting the whole loop.
l_eval_results = []
for item in tqdm(results, total=len(results), desc=f"Evaluating: {args.eval_output_file} "):
    response_body = (item.get("response") or {}).get("body") or {}
    choices = response_body.get("choices") or []
    content = choices[0]["message"]["content"] if choices else None

    # Only parse when the judge actually returned text.
    parsed_result = parse_result(content, eval_mode=args.mode) if content is not None else None

    l_eval_results.append({
        # `custom_id` has the form "run-<model_name>-<session_id>".
        "session_id": item["custom_id"].rsplit("-", 1)[-1].strip(),
        # "N/A" is the sentinel the request-building cell treats as "no result yet".
        "result": content if content is not None else "N/A",
        "parsed_result": parsed_result,
        "parsed": parsed_result is not None,
        # NOTE(review): successful lines carry a null error here, while the
        # request-building cell compares against "N/A" -- confirm how
        # `placeholder_generation` normalizes this field.
        "error": item.get("error") or response_body.get("error"),
    })


Evaluating: /mnt/SAE-Reasoning/output/prompt_0_temp_0p6/WildBench_DeepSeek-R1-Distill-Llama-8B_withoutR_finished_eval.jsonl : 100%|██████████| 729/729 [00:00<00:00, 157952.66it/s]


In [15]:
# Write all eval records as ONE pretty-printed JSON array. Despite the
# ".jsonl" suffix of the target path, this is not JSON Lines.
with open(args.eval_output_file, "w") as eval_file:
    json.dump(obj=l_eval_results, fp=eval_file, indent=2)