In [1]:
import math
import numpy as np
import google.protobuf
import sentencepiece

In [2]:
import pandas as pd
import os
from huggingface_hub import login
from datasets import load_dataset
import gc
import torch
from collections import defaultdict
from tqdm import tqdm

In [None]:
# Never hard-code the Hugging Face token in the notebook (it ends up in version
# control and in shared copies).  Read it from the environment and fall back to
# an interactive prompt when it is not set.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    from getpass import getpass

    hf_token = getpass("Hugging Face token: ")
    # Export it so later cells / libraries that read HF_TOKEN keep working.
    os.environ["HF_TOKEN"] = hf_token
login(hf_token)

In [3]:
# Load the ARC-Challenge test split and keep one row per distinct question text.
arc_data = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")
arc_df = arc_data.to_pandas().drop_duplicates(subset=['question'])

# Quick sanity check of what was loaded.
print(arc_df.shape)
print(arc_df.columns)

(1170, 4)
Index(['id', 'question', 'choices', 'answerKey'], dtype='object')


In [4]:
# Keep the raw choices dict in `choices_dic` and expose only the answer texts in
# `choices`.  `choices_dic` is created once and `choices` is always derived from
# it, so re-running this cell neither overwrites the raw dicts nor fails on
# `x["text"]` being applied to an already-flattened list.
if "choices_dic" not in arc_df.columns:
    arc_df["choices_dic"] = arc_df["choices"]
arc_df["choices"] = arc_df["choices_dic"].apply(lambda x: x["text"])
# Every ARC row gets the same subject label; it is interpolated into the prompts.
arc_df["subject"] = "science"
arc_df.head(2)

Unnamed: 0,id,question,choices,answerKey,choices_dic,subject
0,Mercury_7175875,An astronomer observes that a planet rotates f...,"[Planetary density will decrease., Planetary y...",C,"{'text': ['Planetary density will decrease.', ...",science
1,Mercury_SC_409171,A group of engineers wanted to know how differ...,"[buildings will be built faster, buildings wil...",B,"{'text': ['buildings will be built faster', 'b...",science


In [6]:
# Release any cached CUDA memory held by PyTorch (note: this call does not
# check whether CUDA is available).
torch.cuda.empty_cache()
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList, MinLengthLogitsProcessor
import accelerate
print(accelerate.__version__)  # Should show ≥0.26.0
# Tuning hint for PyTorch's CUDA caching allocator.
# NOTE(review): this is set after torch has been imported and used above —
# confirm that the allocator still picks it up in this environment.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

import re
import math

1.4.0.dev0


In [7]:
################################
# 1. INITIAL GENERATOR POLICIES
################################

def format_subject(subject):
    """Turn an underscore-separated subject id into space-separated words.

    Every word, including the first one, is prefixed with a single space, so
    ``"high_school_physics"`` becomes ``" high school physics"``.
    """
    return "".join(" " + word for word in subject.split("_"))


def build_generator_prompt(subject, target_question, target_choices, get_correct):
    """Assemble the multiple-choice prompt shown to the generator.

    Args:
        subject: Subject id; rendered through ``format_subject``.
        target_question: Question text.
        target_choices: Iterable of already formatted option lines
            (one line is emitted per element).
        get_correct: If truthy the prompt ends with ``Answer:``, otherwise
            with ``Incorrect Answer:``.

    Returns:
        The prompt string (no trailing newline).
    """
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(
        format_subject(subject))
    option_block = "".join("\n{}".format(option) for option in target_choices)
    tail = "\nAnswer:" if get_correct else "\nIncorrect Answer:"
    return header + f"{target_question}" + option_block + tail


def get_generator_answer_probs(model, tokenizer, prompt_text, choices_list):
    """Score the answer letters for one prompt with a single forward pass.

    The logits of the token following ``prompt_text`` are read at the token id
    of each candidate letter ("A", "B", ... one per element of
    ``choices_list``) and normalised with a softmax over those letters only.

    Args:
        model: Causal LM exposing ``.device`` and returning ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        prompt_text: Full prompt; the next-token distribution after it is used.
        choices_list: The answer options; only its length is used.

    Returns:
        dict mapping each letter to its probability (numpy float32 scalar).
    """
    input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids.to(model.device)

    letters = [chr(65 + i) for i in range(len(choices_list))]
    # The last token of the tokenised letter is used so that a leading
    # BOS/special token added by the tokenizer is skipped.
    letter_token_ids = [
        tokenizer(letter, return_tensors="pt").input_ids[0, -1].item()
        for letter in letters
    ]

    # Pure inference: without no_grad the forward pass keeps activations alive
    # for a backward pass that never happens, wasting GPU memory.
    with torch.no_grad():
        next_token_logits = model(input_ids=input_ids).logits[0, -1]

    index = torch.tensor(letter_token_ids, dtype=torch.long, device=next_token_logits.device)
    # Softmax in float32 regardless of the model's compute dtype.
    choice_logits = next_token_logits[index].float()
    probs = torch.nn.functional.softmax(choice_logits, dim=0).detach().cpu().numpy()

    return {letter: prob for letter, prob in zip(letters, probs)}



def generator_probs(subject, question, choices_list, get_correct, model, tokenizer):
    """Return the generator's per-letter probabilities for one question.

    The options are rendered as ``"A. <text>"``, ``"B. <text>"``, ... before
    being placed in the prompt; ``get_correct`` selects which prompt ending
    ``build_generator_prompt`` uses.
    """
    lettered_options = []
    for position, option_text in enumerate(choices_list):
        lettered_options.append(f"{chr(65+position)}. {option_text}")

    return get_generator_answer_probs(
        model,
        tokenizer,
        build_generator_prompt(subject, question, lettered_options, get_correct),
        choices_list,
    )



def get_initial_generator_probs(row, model, tokenizer):
    """Build the generator's initial policy for one dataset row.

    Reads ``row["question"]``, ``row["choices"]`` and ``row["subject"]`` and
    queries the model twice: once with the "correct answer" prompt and once
    with the "incorrect answer" prompt.

    Returns:
        ``{"correct": {letter: prob}, "incorrect": {letter: prob}}``.
    """
    question, options, subject = row["question"], row["choices"], row["subject"]

    gen_init = {"correct": {}, "incorrect": {}}
    # Same query order as the flag order: correct first, then incorrect.
    for label, want_correct in (("correct", True), ("incorrect", False)):
        letter_probs = generator_probs(subject, question, options, want_correct, model, tokenizer)
        gen_init[label].update(letter_probs)

    return gen_init


In [8]:
###################################
# 2. INITIAL DISCRIMINATOR POLICIES
###################################

def build_discriminator_prompt(subject: str, question: str, proposed_answer: str) -> str:
    """Build the prompt asking whether ``proposed_answer`` answers ``question``.

    The prompt offers two options, ``A. Correct`` and ``B. Incorrect``, and
    ends with ``Answer:`` followed by a newline.
    """
    prompt_lines = [
        # The first line keeps a trailing space after the period.
        f"You are an expert evaluator of questions about {format_subject(subject)}. ",
        "Determine if the proposed answer is correct. Output ONLY 'A' or 'B'.",
        f"Question: {question}",
        f"Proposed Answer: {proposed_answer}",
        "",
        "Is this answer correct? Respond ONLY with:",
        "A. Correct",
        "B. Incorrect",
        "",
        "Answer:",
        "",  # yields the final newline after "Answer:"
    ]
    return "\n".join(prompt_lines)


    
def get_discriminator_probs(
    model,
    tokenizer,
    prompt_text,
    choices_list
):
    """Return the discriminator's correct/incorrect probabilities for a prompt.

    The next-token logits after ``prompt_text`` are read at the token ids of
    "A" (meaning correct) and "B" (meaning incorrect) and normalised with a
    two-way softmax.

    Args:
        model: Causal LM exposing ``.device`` and returning ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        prompt_text: Discriminator prompt.
        choices_list: Unused; kept so existing callers keep working.  The
            result no longer depends on how many answer options the question
            has, so a single-option row still gets both keys.

    Returns:
        ``{"correct": p, "incorrect": 1 - p}`` as numpy float32 scalars.
    """
    input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids.to(model.device)

    # Last token of each tokenised letter, skipping any leading special token.
    verdict_token_ids = [
        tokenizer("A").input_ids[-1],
        tokenizer("B").input_ids[-1],
    ]

    # Pure inference: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits[0, -1]

    index = torch.tensor(verdict_token_ids, dtype=torch.long, device=logits.device)
    choice_logits = logits[index].float()
    probs = torch.nn.functional.softmax(choice_logits, dim=0).detach().cpu().numpy()

    return {"correct": probs[0], "incorrect": probs[1]}


def evaluate_answer_correctness(row, model, tokenizer):
    """Ask the discriminator about every answer option of one row.

    Reads ``row["subject"]``, ``row["question"]`` and ``row["choices"]``.

    Returns:
        dict keyed by option letter ("A", "B", ...) whose values are whatever
        ``get_discriminator_probs`` returns for that option's prompt.
    """
    subject, question, options = row["subject"], row["question"], row["choices"]

    verdicts = {}
    for position, option_text in enumerate(options):
        prompt = build_discriminator_prompt(
            subject=subject,
            question=question,
            proposed_answer=f"{option_text}",
        )
        # Options are labelled by position: 0 -> "A", 1 -> "B", ...
        verdicts[chr(65 + position)] = get_discriminator_probs(model, tokenizer, prompt, options)

    return verdicts

def get_initial_discriminator_probs(row, model, tokenizer):
    """Return the discriminator's initial policy for one row.

    Thin wrapper around ``evaluate_answer_correctness``; the result is passed
    through unchanged.
    """
    return evaluate_answer_correctness(row, model, tokenizer)




In [9]:
def pick_answer(gen, disc, candidates, method="generator"):
    """Select the candidate with the highest "correct" score.

    Args:
        gen: Generator policy; ``gen["correct"][y]`` is the score of ``y``.
        disc: Discriminator policy; ``disc[y]["correct"]`` is the score of ``y``.
        candidates: Candidate labels, scanned in order.
        method: ``"generator"`` scores with ``gen``; any other value scores
            with ``disc``.

    Returns:
        The first candidate attaining the strictly largest score above -1.0,
        or ``None`` when no candidate qualifies (e.g. ``candidates`` is empty).
    """
    if method == "generator":
        def score(label):
            return gen["correct"][label]
    else:
        def score(label):
            return disc[label]["correct"]

    winner, top_score = None, -1.0
    for label in candidates:
        current = score(label)
        # Strict comparison: on ties the earliest candidate is kept.
        if current > top_score:
            winner, top_score = label, current
    return winner

    

def softmax(arr):
    """Softmax of a 1D numpy array, shifted by its maximum for numerical stability."""
    shifted_exp = np.exp(arr - np.max(arr))
    total = np.sum(shifted_exp)
    return shifted_exp / total


def equilibrium_search(gen_init, disc_init,
                       candidates,
                       T=5000,
                       eta_G=0.1, eta_D=0.1,
                       lam_G=0.1, lam_D=0.01):
    """Run regularised no-regret updates towards a generator/discriminator equilibrium.

    Both players start from their initial policies and are updated for ``T``
    rounds.  At round ``t``:

    * ``Q_G(y | v)`` is half the running average of the discriminator's past
      policies, ``1/(2t) * sum_{tau<=t} pi_D^tau(v | y)``, and ``Q_D(v | y)``
      is the same quantity built from the generator's past policies;
    * each player's new policy is
      ``softmax((Q + lam * log(pi_init)) / (1/(eta * t) + lam))``.

    The generator policy ``gen[v]`` is a distribution over candidates ``y``
    (normalised over ``y``); the discriminator policy ``disc[y]`` is a
    distribution over ``{"correct", "incorrect"}`` (normalised over ``v``),
    matching how ``disc_init`` is normalised.

    Args:
        gen_init: ``{"correct": {y: p}, "incorrect": {y: p}}``.
        disc_init: ``{y: {"correct": p, "incorrect": p}}``.
        candidates: Sequence of candidate labels ``y``.
        T: Number of update rounds.
        eta_G, eta_D: Learning rates of generator / discriminator.
        lam_G, lam_D: Strength of the pull towards the initial policies.

    Returns:
        ``(gen, disc)`` in the same dict layout as the inputs.  The inputs are
        not modified.
    """
    labels = ("correct", "incorrect")
    eps = 1e-12  # keeps log() finite when an initial probability is exactly 0

    gen = {v: dict(gen_init[v]) for v in labels}
    disc = {y: dict(disc_init[y]) for y in candidates}  # copy

    # Nothing to normalise over without candidates.
    if len(candidates) == 0:
        return gen, disc

    # Log-priors are loop-invariant; compute them once.
    log_gen_init = {v: {y: math.log(gen_init[v][y] + eps) for y in candidates} for v in labels}
    log_disc_init = {y: {v: math.log(disc_init[y][v] + eps) for v in labels} for y in candidates}

    # Running sums of the opponent's policies; divided by 2t below to get Q.
    disc_sum = {v: {y: 0.0 for y in candidates} for v in labels}
    gen_sum = {y: {v: 0.0 for v in labels} for y in candidates}

    for t in range(1, T + 1):
        # 1) Accumulate the current policies of both players (simultaneously,
        #    i.e. before either policy is updated this round).
        for v in labels:
            for y in candidates:
                disc_sum[v][y] += disc[y][v]
                gen_sum[y][v] += gen[v][y]

        avg_scale = 1.0 / (2.0 * t)
        denom_G = 1.0 / (eta_G * t) + lam_G
        denom_D = 1.0 / (eta_D * t) + lam_D

        # 2) Generator: for each correctness label, a distribution over candidates.
        for v in labels:
            logits = np.array([
                (avg_scale * disc_sum[v][y] + lam_G * log_gen_init[v][y]) / denom_G
                for y in candidates
            ])
            new_probs = softmax(logits)
            for i, y in enumerate(candidates):
                gen[v][y] = new_probs[i]

        # 3) Discriminator: for each candidate, a distribution over the labels.
        for y in candidates:
            logits = np.array([
                (avg_scale * gen_sum[y][v] + lam_D * log_disc_init[y][v]) / denom_D
                for v in labels
            ])
            new_probs = softmax(logits)
            for i, v in enumerate(labels):
                disc[y][v] = new_probs[i]

    return gen, disc


In [10]:

def load_model(model_name):
    """Load a causal LM in float16 onto the GPU together with its tokenizer.

    No quantization is applied: 8-bit loading is explicitly switched off and
    the weights are requested as ``torch.float16``.

    Args:
        model_name: Hugging Face model id or local path.

    Returns:
        ``(model, tokenizer)``.
    """
    # NOTE(review): trust_remote_code=True runs code shipped with the checkpoint;
    # confirm it is really required for the models loaded here.
    model_options = dict(
        torch_dtype=torch.float16,
        load_in_8bit=False,
        low_cpu_mem_usage=True,
        device_map="cuda",
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_options)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return model, tokenizer




In [13]:

def subcategory_df_function(model_d, tokenizer_d, df,
                            T=20, eta_G=0.1, eta_D=0.1, lam_G=0.1, lam_D=0.1):
    """Run the full generator/discriminator pipeline on every row of ``df``.

    For each row the initial discriminator and generator policies are queried
    from the model, ``equilibrium_search`` is run on them, and the argmax
    answers before and after the search are recorded.

    Args:
        model_d: Model used for both the generator and the discriminator.
        tokenizer_d: Tokenizer matching ``model_d``.
        df: Frame with ``question``, ``choices`` and ``subject`` columns.
        T, eta_G, eta_D, lam_G, lam_D: Hyper-parameters forwarded to
            ``equilibrium_search``; the defaults are the values this function
            previously hard-coded.

    Returns:
        A copy of ``df`` with the columns ``gen_init_answer``, ``disc_answer``,
        ``gen_answer``, ``disc_init_answer``, ``disc_final_policy_consensus``,
        ``disc_init_policy``, ``gen_init_policy`` and
        ``gen_final_policy_consensus`` appended (in that order).
    """
    category_df = df.copy()

    # Output columns in the order they are appended to the frame.
    output_columns = (
        "gen_init_answer",
        "disc_answer",
        "gen_answer",
        "disc_init_answer",
        "disc_final_policy_consensus",
        "disc_init_policy",
        "gen_init_policy",
        "gen_final_policy_consensus",
    )
    collected = {name: [] for name in output_columns}

    for _, row in tqdm(category_df.iterrows(), total=len(category_df)):

        disc_init = get_initial_discriminator_probs(row, model_d, tokenizer_d)
        # Free cached GPU memory between the two groups of forward passes.
        gc.collect()
        torch.cuda.empty_cache()

        gen_init = get_initial_generator_probs(row, model_d, tokenizer_d)
        gc.collect()
        torch.cuda.empty_cache()

        # Candidate labels are positional letters: "A", "B", ...
        candidates = [chr(65 + i) for i in range(len(row["choices"]))]

        gen_final, disc_final = equilibrium_search(
            gen_init, disc_init, candidates,
            T=T, eta_G=eta_G, eta_D=eta_D, lam_G=lam_G, lam_D=lam_D
        )

        collected["gen_init_policy"].append(gen_init)
        collected["disc_init_policy"].append(disc_init)
        collected["gen_final_policy_consensus"].append(gen_final)
        collected["disc_final_policy_consensus"].append(disc_final)

        # Answers before the search: argmax of the initial policies.
        collected["gen_init_answer"].append(max(gen_init["correct"], key=gen_init["correct"].get))
        collected["disc_init_answer"].append(max(disc_init, key=lambda choice: disc_init[choice]["correct"]))

        # Answers after the search.
        collected["gen_answer"].append(
            pick_answer(gen_final, disc_final, candidates, method="generator"))
        collected["disc_answer"].append(
            pick_answer(gen_final, disc_final, candidates, method="discriminator"))

    for name in output_columns:
        category_df[name] = collected[name]

    return category_df


In [3]:
# Drop any previously loaded model/tokenizer first: collecting garbage and
# emptying the CUDA cache only helps once nothing references the old weights.
if 'model_d' in globals():
    del model_d

if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

model_d, tokenizer_d = load_model("meta-llama/Llama-2-13b-hf")

In [21]:

# Run the full pipeline on ARC-Challenge with the currently loaded model.
temp_df_llama2 = subcategory_df_function(model_d, tokenizer_d, arc_df)

file_path = 'Data/arc_policy_df_Llama2_13b.csv'
# Make sure the output directory exists so the long run above is not lost to
# a missing-directory error when saving.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
temp_df_llama2.to_csv(file_path, index=False)

100% 1170/1170 [09:23<00:00,  2.08it/s]


In [22]:

# Drop the previous model/tokenizer (if any) *before* collecting garbage and
# emptying the CUDA cache; the guards keep the cell runnable on a fresh kernel.
if 'model_d' in globals():
    del model_d
if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

model_d, tokenizer_d = load_model("meta-llama/Llama-2-7b-hf")

# NOTE(review): despite its name, `temp_df_qwen` holds the Llama-2-7b results;
# the name is kept so that any later use of it keeps working.
temp_df_qwen = subcategory_df_function(model_d, tokenizer_d, arc_df)

file_path = 'Data/arc_policy_df_Llama2_7b.csv'
# Ensure the output directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
temp_df_qwen.to_csv(file_path, index=False)


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100% 1170/1170 [09:12<00:00,  2.12it/s]


In [23]:

# Drop the previous model/tokenizer (if any) *before* collecting garbage and
# emptying the CUDA cache; the guards keep the cell runnable on a fresh kernel.
if 'model_d' in globals():
    del model_d
if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

model_d, tokenizer_d = load_model("Qwen/Qwen2.5-7B-Instruct")

temp_df_oqwen = subcategory_df_function(model_d, tokenizer_d, arc_df)

file_path = 'Data/arc_policy_df_oqwen_7B.csv'
# Ensure the output directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
temp_df_oqwen.to_csv(file_path, index=False)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100% 1170/1170 [09:15<00:00,  2.11it/s]


In [15]:
from datasets import load_dataset

# Load the ARC-Easy test split, keep one row per distinct question text, and
# derive the helper columns used by the prompts.
arc_data_easy = load_dataset("allenai/ai2_arc", "ARC-Easy", split = "test")

arc_df_easy = (
    arc_data_easy.to_pandas()
    .drop_duplicates(subset=['question'])
    .assign(
        # raw choices dict, kept for reference
        choices_dic=lambda frame: frame["choices"],
        # answer texts only
        choices=lambda frame: frame["choices"].apply(lambda x: x["text"]),
        subject="science",
    )
)

(2371, 4)
Index(['id', 'question', 'choices', 'answerKey'], dtype='object')


Unnamed: 0,id,question,choices,answerKey,choices_dic,subject
0,Mercury_417466,Which statement best explains why photosynthes...,[Sunlight is the source of energy for nearly a...,A,{'text': ['Sunlight is the source of energy fo...,science
1,Mercury_7081673,Which piece of safety equipment is used to kee...,"[safety goggles, breathing mask, rubber gloves...",B,"{'text': ['safety goggles', 'breathing mask', ...",science


In [16]:
# Drop any previously loaded model/tokenizer first: collecting garbage and
# emptying the CUDA cache only helps once nothing references the old weights.
if 'model_d' in globals():
    del model_d

if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

model_d, tokenizer_d = load_model("meta-llama/Llama-2-7b-hf")
temp_df_llama2_arc_easy = subcategory_df_function(model_d, tokenizer_d, arc_df_easy)

file_path = 'Data/arc_policy_df_easy_Llama2_7b.csv'
# Ensure the output directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
temp_df_llama2_arc_easy.to_csv(file_path, index=False)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100% 2371/2371 [18:36<00:00,  2.12it/s]


In [19]:


# Drop the previous model/tokenizer (if any) *before* collecting garbage and
# emptying the CUDA cache; the guards keep the cell runnable on a fresh kernel.
if 'model_d' in globals():
    del model_d
if 'tokenizer_d' in globals():
    del tokenizer_d

gc.collect()
torch.cuda.empty_cache()

model_d, tokenizer_d = load_model("Qwen/Qwen2.5-7B-Instruct")

temp_df_oqwen_arc_easy = subcategory_df_function(model_d, tokenizer_d, arc_df_easy)

file_path = 'Data/arc_policy_df_easy_oqwen_7B.csv'
# Ensure the output directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
temp_df_oqwen_arc_easy.to_csv(file_path, index=False)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100% 2371/2371 [19:32<00:00,  2.02it/s]
