In [3]:
import math
import numpy as np
import google.protobuf
import sentencepiece

6.30.0
0.2.0


In [4]:
# --- Imports (standard library, then third-party) ---
import gc
import math
import os
import re
from collections import defaultdict

import accelerate
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import login
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList, MinLengthLogitsProcessor

# --- Notebook-wide configuration ---
pd.set_option('display.max_rows', 200)
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# --- Hugging Face authentication ---
# The token is read from the environment instead of being written into the
# notebook, so the secret never ends up in the saved file or its outputs.
hf_token = os.environ.get('HF_TOKEN')
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export your Hugging Face access token in the "
        "environment before starting the kernel; do not hard-code it here."
    )
login(hf_token)

# --- Load the MMLU test split ---
dataset = load_dataset("cais/mmlu", "all", split="test")

# Keep one row per distinct question text. The index is NOT reset, so the
# original dataset row labels are preserved on the remaining rows.
mmlu_df = dataset.to_pandas()
mmlu_df = mmlu_df.drop_duplicates(subset=['question'])

# Displayed result: number of remaining questions per subject, as one wide row.
mmlu_df["question"].groupby(mmlu_df["subject"]).count().to_frame().transpose()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Hugging Face Token: hf_***REDACTED***


subject,abstract_algebra,anatomy,astronomy,business_ethics,clinical_knowledge,college_biology,college_chemistry,college_computer_science,college_mathematics,college_medicine,...,professional_accounting,professional_law,professional_medicine,professional_psychology,public_relations,security_studies,sociology,us_foreign_policy,virology,world_religions
question,100,135,145,100,264,144,99,100,99,95,...,281,1534,272,608,108,244,201,99,166,171


In [5]:
# Maps each MMLU subject name to a one-element list holding its subcategory.
# Only element [0] of each list is read when the "subcategories" column is built.
# NOTE(review): this looks like the subject grouping shipped with the original
# MMLU evaluation code - confirm the source before editing individual entries.
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

# Maps each top-level category label to the subcategory names it contains.
# Every subcategory value used in `subcategories` appears in exactly one list here.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

# Invert `categories`: each subcategory name points at its top-level category.
# (Despite the variable name, the keys are subcategory names, not MMLU subjects.)
subject_to_category = {
    subcategory: category
    for category, members in categories.items()
    for subcategory in members
}

# Tag every question with its subcategory (KeyError on an unknown subject) and
# its top-level category.
mmlu_df["subcategories"] = mmlu_df["subject"].apply(lambda x: subcategories[x][0])
mmlu_df["categories"] = mmlu_df["subcategories"].apply(lambda x: subject_to_category.get(x))

# `.get` returns None for a subcategory missing from `categories`; surface that
# immediately instead of silently exporting rows without a category.
_unmapped = sorted(mmlu_df.loc[mmlu_df["categories"].isna(), "subcategories"].unique())
if _unmapped:
    raise KeyError(f"Subcategories without a top-level category: {_unmapped}")

# Export location. The default is the path this notebook has always used; set
# the MMLU_DF_PATH environment variable to write somewhere else (e.g. on
# another machine) without editing the notebook.
file_path = os.environ.get("MMLU_DF_PATH", '/u/home/t/toz015/Agent/mmlu_df.csv')
mmlu_df.to_csv(file_path, index=False)

In [6]:
# Per-subcategory count of non-null values in every remaining column.
mmlu_df.groupby("subcategories").count()

Unnamed: 0_level_0,question,subject,choices,answer,categories
subcategories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
biology,453,453,453,453,453
business,436,436,436,436,436
chemistry,297,297,297,297,297
computer science,410,410,410,410,410
culture,332,332,332,332,332
economics,730,730,730,730,730
engineering,145,145,145,145,145
geography,198,198,198,198,198
health,1554,1554,1554,1554,1554
history,928,928,928,928,928


In [9]:
################################
# 1. INITIAL GENERATOR POLICIES
################################

def format_subject(subject):
    """Turn an underscore-separated subject id into space-separated words.

    Every word is prefixed with a space, so the result always starts with a
    leading space (e.g. "abstract_algebra" -> " abstract algebra").
    """
    return "".join(" " + word for word in subject.split("_"))


def build_generator_prompt(
    subject,
    target_question,
    target_choices,
    get_correct
):
    """Assemble the generator prompt for one multiple-choice question.

    The prompt is a subject header, the question, one line per entry of
    `target_choices` (used verbatim), and a final cue: "Answer:" when
    `get_correct` is truthy, otherwise "Incorrect Answer:".
    """
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(
        format_subject(subject))
    option_lines = "".join("\n{}".format(option) for option in target_choices)
    cue = "\nAnswer:" if get_correct else "\nIncorrect Answer:"
    return header + f"{target_question}" + option_lines + cue


def get_generator_answer_probs(model, tokenizer, prompt_text, choices_list):
    """Score each answer letter by the model's next-token logits after the prompt.

    Args:
        model: callable as `model(input_ids=...)`, returning an object with a
            `.logits` tensor indexed as `[0, -1]`, and exposing `.device`.
        tokenizer: callable as `tokenizer(text, return_tensors="pt")`, returning
            an object with an `input_ids` tensor.
        prompt_text: full prompt whose continuation is being scored.
        choices_list: the answer options; only their count is used, to produce
            the letters "A", "B", ... in order.

    Returns:
        dict mapping each letter to its probability (numpy scalar), obtained by
        a softmax restricted to the logits of the letter tokens.
    """
    letters = [chr(65 + position) for position, _ in enumerate(choices_list)]

    # Inference only: without no_grad the forward pass records an autograd
    # graph for every prompt, which costs GPU memory for no benefit.
    with torch.no_grad():
        input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids.to(model.device)
        next_token_logits = model(input_ids=input_ids).logits[0, -1]

    # Each letter is represented by the LAST token id the tokenizer emits for
    # it, which skips any special tokens the tokenizer prepends.
    letter_logits = []
    for letter in letters:
        token_id = tokenizer(letter, return_tensors="pt").input_ids[0, -1].item()
        letter_logits.append(next_token_logits[token_id].item())

    letter_logits = torch.tensor(letter_logits, device=model.device).float()
    probs = torch.nn.functional.softmax(letter_logits, dim=0).detach().cpu().numpy()

    return {letter: prob for letter, prob in zip(letters, probs)}



def generator_probs(subject, question, choices_list, get_correct, model, tokenizer):
    """Return the generator's letter probabilities for one question.

    Options are labelled "A. ...", "B. ..." before being placed in the prompt;
    `get_correct` selects the "Answer:" or "Incorrect Answer:" cue.
    """
    lettered_options = []
    for position, text in enumerate(choices_list):
        lettered_options.append(f"{chr(65+position)}. {text}")

    prompt = build_generator_prompt(subject, question, lettered_options, get_correct)
    return get_generator_answer_probs(model, tokenizer, prompt, choices_list)



def get_initial_generator_probs(row, model, tokenizer):
    """Build the generator's initial policy for one dataset row.

    Returns {"correct": {letter: prob}, "incorrect": {letter: prob}}; the
    "correct" entry is computed first, then the "incorrect" one.
    """
    question, options, subject = row["question"], row["choices"], row["subject"]
    return {
        label: dict(generator_probs(subject, question, options, wants_correct, model, tokenizer))
        for label, wants_correct in (("correct", True), ("incorrect", False))
    }


In [1]:
###################################
# 2. INITIAL DISCRIMINATOR POLICIES
###################################

def build_discriminator_prompt(
    subject: str,
    question: str,
    proposed_answer: str
) -> str:
    """Build the prompt asking whether `proposed_answer` is correct (A) or not (B)."""
    # The first line deliberately keeps its trailing space, and the empty final
    # entry makes the prompt end with a newline after "Answer:".
    # NOTE(review): the generator prompt ends at "Answer:" with no newline;
    # confirm the trailing newline here is intended, since it changes which
    # next token the model is asked to predict.
    prompt_lines = [
        f"You are an expert evaluator of questions about {format_subject(subject)}. ",
        "Determine if the proposed answer is correct. Output ONLY 'A' or 'B'.",
        f"Question: {question}",
        f"Proposed Answer: {proposed_answer}",
        "",
        "Is this answer correct? Respond ONLY with:",
        "A. Correct",
        "B. Incorrect",
        "",
        "Answer:",
        "",
    ]
    return "\n".join(prompt_lines)


    
def get_discriminator_probs(
    model,
    tokenizer,
    prompt_text,
    choices_list
):
    """Return the discriminator's verdict probabilities for one prompt.

    Args:
        model: callable as `model(input_ids=...)`, returning an object with a
            `.logits` tensor indexed as `[0, -1]`, and exposing `.device`.
        tokenizer: callable both as `tokenizer(text, return_tensors="pt")` and
            as `tokenizer(text)`; the latter's `input_ids[-1]` is taken as the
            token id of the letter.
        prompt_text: discriminator prompt ending in the A/B question.
        choices_list: accepted for backward compatibility; it no longer
            influences the result.

    Returns:
        {"correct": p_A, "incorrect": p_B}, a softmax over the next-token
        logits of the tokens "A" and "B" only (numpy scalars).
    """
    # Inference only: avoid recording an autograd graph for every prompt.
    with torch.no_grad():
        input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids.to(model.device)
        next_token_logits = model(input_ids=input_ids).logits[0, -1]

    verdict_token_ids = [
        tokenizer("A").input_ids[-1],  # "A. Correct"
        tokenizer("B").input_ids[-1],  # "B. Incorrect"
    ]
    verdict_logits = next_token_logits[verdict_token_ids].detach().float().cpu()
    probs = torch.nn.functional.softmax(verdict_logits, dim=0).numpy()

    # Always emit both verdicts. Previously the keys were derived by zipping
    # over `choices_list`, so a question with a single option lost "incorrect".
    return {"correct": probs[0], "incorrect": probs[1]}


def evaluate_answer_correctness(
    row,
    model,
    tokenizer
):
    """Ask the discriminator about every answer option of one question.

    Returns a dict keyed by option letter ("A", "B", ...) whose values are the
    dicts produced by `get_discriminator_probs` for that option's text.
    """
    subject, question, options = row["subject"], row["question"], row["choices"]

    verdicts = {}
    for position, option_text in enumerate(options):
        prompt = build_discriminator_prompt(
            subject=subject,
            question=question,
            proposed_answer=f"{option_text}"
        )
        # The option at position 0 is letter "A", position 1 is "B", and so on.
        verdicts[chr(65 + position)] = get_discriminator_probs(model, tokenizer, prompt, options)

    return verdicts

def get_initial_discriminator_probs(
    row,
    model,
    tokenizer
):
    """Return the discriminator's initial policy for one row.

    Thin wrapper that forwards to `evaluate_answer_correctness` unchanged.
    """
    return evaluate_answer_correctness(row, model, tokenizer)




In [2]:
def pick_answer(gen, disc, candidates, method="generator"):
    """
    Select the candidate with the highest "correct" score.

    method='generator': score of y is gen["correct"][y]
    any other method:   score of y is disc[y]["correct"]

    Ties go to the earliest candidate. Returns None when `candidates` is empty
    or no score exceeds -1.0.
    """
    if method == "generator":
        def score_of(y):
            return gen["correct"][y]
    else:
        def score_of(y):
            return disc[y]["correct"]

    winner, top_score = None, -1.0
    for option in candidates:
        score = score_of(option)
        if score > top_score:
            winner, top_score = option, score
    return winner

    

def softmax(arr):
    """Softmax of a 1D numpy array, shifted by its maximum before exponentiating."""
    shifted_exp = np.exp(arr - np.max(arr))
    return shifted_exp / np.sum(shifted_exp)


def equilibrium_search(gen_init, disc_init, 
                       candidates, 
                       T=5000, 
                       eta_G=0.1, eta_D=0.1, 
                       lam_G=0.1, lam_D=0.01):
    """
    Iteratively update generator and discriminator policies for T rounds.

    gen_init:   {"correct": {y: p}, "incorrect": {y: p}}
    disc_init:  {y: {"correct": p, "incorrect": p}}
    candidates: the answer keys y to iterate over.

    Each round (t = 1..T):
      1. every Q value grows by 1/(2t) times the other player's current policy;
      2. the generator policy for each label becomes a softmax over candidates
         of (Q + lam_G * log(initial prob + 1e-12)) / (1/eta_G + lam_G);
      3. the discriminator is updated the same way with lam_D / eta_D.

    Returns (gen, disc) in the same dictionary layout as the inputs; the inputs
    themselves are copied and never modified.
    """
    labels = ("correct", "incorrect")

    # Working copies of the initial policies.
    gen = {label: dict(gen_init[label]) for label in labels}
    disc = {y: dict(disc_init[y]) for y in candidates}

    # Accumulated payoffs, all starting at zero.
    q_gen = {label: {y: 0.0 for y in candidates} for label in labels}
    q_disc = {y: {label: 0.0 for label in labels} for y in candidates}

    for step in range(1, T + 1):
        # 1) Accumulate payoffs from the policies left by the previous round.
        #    Both tables are updated before either policy changes.
        # NOTE(review): this is a 1/(2t)-weighted running sum, not a running
        # average of past policies - confirm this is the intended Q definition.
        weight = 1.0 / (2.0 * step)
        for y in candidates:
            for label in labels:
                q_gen[label][y] += weight * disc[y][label]
                q_disc[y][label] += weight * gen[label][y]

        # 2) Generator: for each label, a distribution over candidates.
        for label in labels:
            gen_scores = [
                (q_gen[label][y] + lam_G * math.log(gen_init[label][y] + 1e-12)) / (1/eta_G + lam_G)
                for y in candidates
            ]
            for y, prob in zip(candidates, softmax(np.array(gen_scores))):
                gen[label][y] = prob

        # 3) Discriminator: the softmax is taken ACROSS candidates for each
        #    label, so disc[y]["correct"] + disc[y]["incorrect"] need not sum
        #    to 1 after this step.
        # NOTE(review): confirm per-label normalisation across candidates is
        # intended rather than per-candidate normalisation over the two labels.
        for label in labels:
            disc_scores = [
                (q_disc[y][label] + lam_D * math.log(disc_init[y][label] + 1e-12)) / (1/eta_D + lam_D)
                for y in candidates
            ]
            for y, prob in zip(candidates, softmax(np.array(disc_scores))):
                disc[y][label] = prob

    return gen, disc


In [3]:

def load_model(model_name):
    """Load a causal language model and its tokenizer from `model_name`.

    The weights are loaded in float16 with `device_map="cuda"`; no 4-bit or
    8-bit quantization is applied (the previous docstring said otherwise).

    Returns:
        (model, tokenizer)
    """
    # NOTE(review): trust_remote_code=True allows code shipped inside the model
    # repository to run locally. Keep it only for repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="cuda",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return model, tokenizer




In [14]:
def subcategory_df_function(model_d, tokenizer_d, df=None):
    """Run the generator/discriminator pipeline over every question.

    For each row this computes the initial discriminator and generator
    policies with the same model, records each player's initial pick, runs
    `equilibrium_search` (T=20, eta=0.1, lambda=0.1 for both players) and
    records each player's final pick.

    Args:
        model_d: model used for both generator and discriminator scoring.
        tokenizer_d: tokenizer matching `model_d`.
        df: optional DataFrame with "question", "choices", "subject" and
            "answer" columns. Defaults to the notebook-level `mmlu_df`, which
            keeps existing two-argument calls working unchanged.

    Returns:
        A copy of the input frame with the added columns "gen_init_answer",
        "corr_answer", "disc_answer", "gen_answer" and "disc_init_answer".
        "corr_answer" holds the TEXT of the gold choice; the other four hold
        answer letters.
    """
    category_df = (mmlu_df if df is None else df).copy()

    gen_answer = []
    disc_answer = []

    gen_init_answer = []
    disc_init_answer = []
    corr_answer = []

    for _, row in tqdm(category_df.iterrows(), total=len(category_df)):

        disc_init = get_initial_discriminator_probs(row, model_d, tokenizer_d)

        gc.collect()
        torch.cuda.empty_cache()
        gen_init = get_initial_generator_probs(row, model_d, tokenizer_d)

        gc.collect()
        torch.cuda.empty_cache()

        # `answer` is a 0-based index into `choices` (0 -> "A", ... 3 -> "D"),
        # so it is used directly. The previous `answer - 1` picked the wrong
        # choice (and wrapped around to the last one for answer 0).
        choice_texts = list(row["choices"])
        corr_answer.append(choice_texts[row["answer"]])

        gen_init_answer.append(max(gen_init["correct"], key=gen_init["correct"].get))
        disc_init_answer.append(max(disc_init, key=lambda choice: disc_init[choice]["correct"]))

        # Letters actually scored for this row: ["A", "B", "C", "D"] for the
        # four-option MMLU questions.
        candidates = list(gen_init["correct"])

        gen_final, disc_final = equilibrium_search(
            gen_init, disc_init, candidates,
            T=20, eta_G=0.1, eta_D=0.1, lam_G=0.1, lam_D=0.1
        )

        best_answer_g = pick_answer(gen_final, disc_final, candidates, method="generator")
        best_answer_d = pick_answer(gen_final, disc_final, candidates, method="discriminator")

        gen_answer.append(best_answer_g)
        disc_answer.append(best_answer_d)

    # Column order is kept as before so exported CSVs keep the same layout.
    category_df["gen_init_answer"] = gen_init_answer
    category_df["corr_answer"] = corr_answer
    category_df["disc_answer"] = disc_answer
    category_df["gen_answer"] = gen_answer
    category_df["disc_init_answer"] = disc_init_answer

    return category_df

def subcategory_df_result_function(df):
    """Summarise generator/discriminator accuracy and agreement.

    Expects the columns "answer" (0-3), "gen_init_answer", "gen_answer",
    "disc_init_answer" and "disc_answer". As a side effect, an
    "answer_letter" column (0->A ... 3->D) is written onto `df` itself.

    Returns a DataFrame with one row per metric and the columns "Metric",
    "Count" and "Percentage" (count / number of rows, formatted with no
    decimals).
    """
    index_to_letter = {0:"A", 1:"B", 2: "C", 3: "D"}
    n_rows = df.shape[0]

    df["answer_letter"] = df["answer"].map(index_to_letter)
    gold = df["answer_letter"]

    gen_first, gen_last = df["gen_init_answer"], df["gen_answer"]
    disc_first, disc_last = df["disc_init_answer"], df["disc_answer"]

    # Row-wise boolean masks reused by several metrics.
    agree_first = gen_first == disc_first
    agree_last = gen_last == disc_last
    gen_first_right = gen_first == gold
    gen_last_right = gen_last == gold

    tally = [
        ("Total Question #", n_rows),
        ("Initial correct answers for G", gen_first_right.sum()),
        ("Final correct answers for G", gen_last_right.sum()),
        ("Initial correct answers for D", (disc_first == gold).sum()),
        ("Final correct answers for D", (disc_last == gold).sum()),
        ("Initial combined agreement", agree_first.sum()),
        ("Final combined agreement", agree_last.sum()),
        ("Initial combined correct agreement", (agree_first & gen_first_right).sum()),
        ("Final combined correct agreement", (agree_last & gen_last_right).sum()),
    ]

    results_df = pd.DataFrame({
        "Metric": [label for label, _ in tally],
        "Count": [count for _, count in tally],
    })
    results_df["Percentage"] = (results_df["Count"] / n_rows).apply("{:.0%}".format)

    return results_df



In [16]:
# Drop any previously loaded model FIRST, then collect and empty the CUDA
# cache; running the cleanup before the deletion cannot release that model.
if 'model_d' in globals():
    del model_d

if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

# Make sure the output directory exists before the long evaluation starts, so
# the run cannot fail at save time.
# NOTE(review): the file name says "deepseek" but the model loaded below is
# Llama-2-13b - the name is kept so existing files still match.
file_path = 'Data/mmlu_df_deepseek_Llama2_13b.csv'
os.makedirs(os.path.dirname(file_path), exist_ok=True)

model_d, tokenizer_d = load_model("meta-llama/Llama-2-13b-hf")

temp_df_llama2 = subcategory_df_function(model_d, tokenizer_d)
temp_df_llama2.to_csv(file_path, index=False)


llama2_df_result = subcategory_df_result_function(temp_df_llama2)
display(llama2_df_result)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100% 13869/13869 [1:55:37<00:00,  2.00it/s] 


Unnamed: 0,Metric,Count,Percentage
0,Total Question #,13869,100%
1,Initial correct answers for G,6978,50%
2,Final correct answers for G,7000,50%
3,Initial correct answers for D,6411,46%
4,Final correct answers for D,6966,50%
5,Initial combined agreement,7393,53%
6,Final combined agreement,13339,96%
7,Initial combined correct agreement,4613,33%
8,Final combined correct agreement,6809,49%


In [17]:

# Drop the previous model first (guarded so the cell also runs on a fresh
# kernel), then collect and empty the CUDA cache.
if 'model_d' in globals():
    del model_d

if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

# The stray double quote in the old file name ('...Llama2_7b".csv') is removed.
# The directory is created up front so the long run cannot fail at save time.
file_path = 'Data/mmlu_df_Llama2_7b.csv'
os.makedirs(os.path.dirname(file_path), exist_ok=True)

model_d, tokenizer_d = load_model("meta-llama/Llama-2-7b-hf")

# NOTE(review): the variables are called "*_qwen" but this cell evaluates
# Llama-2-7b. The names are kept because later cells read `temp_df_qwen`.
temp_df_qwen = subcategory_df_function(model_d, tokenizer_d)
temp_df_qwen.to_csv(file_path, index=False)

qwen_df_result = subcategory_df_result_function(temp_df_qwen)
display(qwen_df_result)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100% 13869/13869 [1:54:43<00:00,  2.01it/s]


Unnamed: 0,Metric,Count,Percentage
0,Total Question #,13869,100%
1,Initial correct answers for G,7404,53%
2,Final correct answers for G,7453,54%
3,Initial correct answers for D,5878,42%
4,Final correct answers for D,7430,54%
5,Initial combined agreement,6119,44%
6,Final combined agreement,13175,95%
7,Initial combined correct agreement,4063,29%
8,Final combined correct agreement,7226,52%


In [16]:

# Drop the previous model first (guarded so the cell also runs on a fresh
# kernel), then collect and empty the CUDA cache.
if 'model_d' in globals():
    del model_d

if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

# Create the output directory up front so the long run cannot fail at save time.
file_path = 'Data/mmlu_df_oqwen_7B.csv'
os.makedirs(os.path.dirname(file_path), exist_ok=True)

model_d, tokenizer_d = load_model("Qwen/Qwen2.5-7B-Instruct")

temp_df_oqwen = subcategory_df_function(model_d, tokenizer_d)
temp_df_oqwen.to_csv(file_path, index=False)

oqwen_df_result = subcategory_df_result_function(temp_df_oqwen)
display(oqwen_df_result)



  warn(
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100% 13869/13869 [1:52:28<00:00,  2.06it/s]


Unnamed: 0,Metric,Count,Percentage
0,Total Question #,13869,100%
1,Initial correct answers for G,9849,71%
2,Final correct answers for G,9838,71%
3,Initial correct answers for D,8698,63%
4,Final correct answers for D,9761,70%
5,Initial combined agreement,9515,69%
6,Final combined agreement,13370,96%
7,Initial combined correct agreement,7667,55%
8,Final combined correct agreement,9607,69%


In [31]:

# Cross-model comparison of the INITIAL discriminator picks.
dict_map = {0:"A", 1:"B", 2: "C", 3: "D"}

# Gold answer letter on each of the three result frames.
temp_df_qwen["answer_letter"] = temp_df_qwen["answer"].map(dict_map)
temp_df_oqwen["answer_letter"] = temp_df_oqwen["answer"].map(dict_map)
temp_df_llama2["answer_letter"] = temp_df_llama2["answer"].map(dict_map)

total_n = temp_df_oqwen.shape[0]

qwen_pick = temp_df_qwen["disc_init_answer"]
oqwen_pick = temp_df_oqwen["disc_init_answer"]
llama2_pick = temp_df_llama2["disc_init_answer"]

# Row-wise "this model picked the gold letter" masks.
oqwen_right = oqwen_pick == temp_df_oqwen["answer_letter"]
llama2_right = llama2_pick == temp_df_llama2["answer_letter"]

qwen_corr = (qwen_pick == temp_df_qwen["answer_letter"]).sum()
oqwen_corr = oqwen_right.sum()
llama2_corr = llama2_right.sum()

# Rows where the named models agree AND the shared pick is the gold letter.
qwen_oqwen = ((qwen_pick == oqwen_pick) & oqwen_right).sum()
qwen_llama2 = ((qwen_pick == llama2_pick) & llama2_right).sum()
oqwen_llama2 = ((oqwen_pick == llama2_pick) & llama2_right).sum()
qwen_llama2_oqwen = ((oqwen_pick == llama2_pick) & llama2_right & (qwen_pick == oqwen_pick)).sum()

results_d = pd.DataFrame({
    "Metric": [
        "Total Question #",
        "oqwen correct answers",
        "qwen correct answers",
        "llama2 correct answers",
        "qwen llama2 oqwen",
        "qwen oqwen",
        "qwen llama2",
        "oqwen llama2",
    ],
    "Count": [
        total_n,
        oqwen_corr,
        qwen_corr,
        llama2_corr,
        qwen_llama2_oqwen,
        qwen_oqwen,
        qwen_llama2,
        oqwen_llama2,
    ],
})
results_d["Accuracy"] = (results_d["Count"] / total_n).apply("{:.0%}".format)
results_d

Unnamed: 0,Metric,Count,Accuracy
0,Total Question #,13869,100%
1,oqwen correct answers,8698,63%
2,qwen correct answers,5878,42%
3,llama2 correct answers,6411,46%
4,qwen llama2 oqwen,2957,21%
5,qwen oqwen,4443,32%
6,qwen llama2,3264,24%
7,oqwen llama2,5245,38%


In [32]:
# Cross-model comparison of the INITIAL generator picks.
# Relies on the "answer_letter" column already present on each frame.
total_n = temp_df_oqwen.shape[0]

qwen_pick = temp_df_qwen["gen_init_answer"]
oqwen_pick = temp_df_oqwen["gen_init_answer"]
llama2_pick = temp_df_llama2["gen_init_answer"]

# Row-wise "this model picked the gold letter" masks.
oqwen_right = oqwen_pick == temp_df_oqwen["answer_letter"]
llama2_right = llama2_pick == temp_df_llama2["answer_letter"]

qwen_corr = (qwen_pick == temp_df_qwen["answer_letter"]).sum()
oqwen_corr = oqwen_right.sum()
llama2_corr = llama2_right.sum()

# Rows where the named models agree AND the shared pick is the gold letter.
qwen_oqwen = ((qwen_pick == oqwen_pick) & oqwen_right).sum()
qwen_llama2 = ((qwen_pick == llama2_pick) & llama2_right).sum()
oqwen_llama2 = ((oqwen_pick == llama2_pick) & llama2_right).sum()
qwen_llama2_oqwen = ((oqwen_pick == llama2_pick) & llama2_right & (qwen_pick == oqwen_pick)).sum()

results_d = pd.DataFrame({
    "Metric": [
        "Total Question #",
        "oqwen correct answers",
        "qwen correct answers",
        "llama2 correct answers",
        "qwen llama2 oqwen",
        "qwen oqwen",
        "qwen llama2",
        "oqwen llama2",
    ],
    "Count": [
        total_n,
        oqwen_corr,
        qwen_corr,
        llama2_corr,
        qwen_llama2_oqwen,
        qwen_oqwen,
        qwen_llama2,
        oqwen_llama2,
    ],
})
results_d["Accuracy"] = (results_d["Count"] / total_n).apply("{:.0%}".format)
results_d

Unnamed: 0,Metric,Count,Accuracy
0,Total Question #,13869,100%
1,oqwen correct answers,9849,71%
2,qwen correct answers,7404,53%
3,llama2 correct answers,6978,50%
4,qwen llama2 oqwen,4308,31%
5,qwen oqwen,6482,47%
6,qwen llama2,4610,33%
7,oqwen llama2,5840,42%
