In [64]:
from matplotlib import pyplot as plt
import os
import numpy as np
import json


# Root directory of the evaluation outputs. Metrics files are looked up under
# <file_dir>/<model>/<prompt_type>/<dataset>/ by the loading cell.
# The default is the original absolute path; set the environment variable
# TIME_CONSTRAINED_COT_OUTPUT_DIR to point the notebook at another location
# without editing the code.
file_dir = os.environ.get(
    "TIME_CONSTRAINED_COT_OUTPUT_DIR",
    "/data03/sunyi/time_constrained_cot/outputs/2_6",
)

# Models to load and tabulate, in the order their rows appear in the tables.
# Every entry needs a matching key in MODEL_SERIES_MAP.
model_list = [
    "NovaSky-AI/Sky-T1-32B-Preview",
    "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-32B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-7B-Instruct",
    "mistralai/Mistral-Small-Instruct-2409",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "mistralai/Ministral-8B-Instruct-2410",
    "mistralai/Mathstral-7B-v0.1",
    "google/gemma-2-27b-it",
    "google/gemma-2-9b-it",
    "google/gemma-2-2b-it",
    "microsoft/phi-4",
    "microsoft/Phi-3-medium-128k-instruct",
    "microsoft/Phi-3-small-128k-instruct",
    "microsoft/Phi-3-mini-128k-instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct"
]

In [65]:
# Prompt-strategy suffixes appended to a series name to form a prompt type
# (e.g. "qwen" + "-aav" -> "qwen-aav").
PROMP_LIST = ["-aav", "-c2f", "-sbs"]
# PROMP_LIST = ["-sbs-hard"]


# (series, member models) pairs, listed in the order the models should be
# inserted into MODEL_SERIES_MAP. "qwen" appears twice on purpose so that the
# Sky-T1 entry keeps its position after the phi models.
_SERIES_MEMBERS = [
    ("qwen", [
        "Qwen/QwQ-32B-Preview",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-3B-Instruct",
        "Qwen/Qwen2.5-1.5B-Instruct",
    ]),
    ("qwen-math", [
        "Qwen/Qwen2.5-Math-1.5B-Instruct",
        "Qwen/Qwen2.5-Math-7B-Instruct",
    ]),
    ("internlm", [
        "internlm/internlm2_5-1_8b-chat",
        "internlm/internlm2_5-7b-chat",
        "internlm/internlm2_5-20b-chat",
    ]),
    ("gemma", [
        "google/gemma-2-2b-it",
        "google/gemma-2-9b-it",
        "google/gemma-2-27b-it",
    ]),
    ("mistral", [
        "mistralai/Mathstral-7B-v0.1",
        "mistralai/Ministral-8B-Instruct-2410",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "mistralai/Mistral-Small-Instruct-2409",
    ]),
    ("phi4", ["microsoft/phi-4"]),
    ("phi3medium", ["microsoft/Phi-3-medium-128k-instruct"]),
    ("phi3small", ["microsoft/Phi-3-small-128k-instruct"]),
    ("phi3mini", [
        "microsoft/Phi-3.5-mini-instruct",
        "microsoft/Phi-3-mini-128k-instruct",
    ]),
    ("qwen", ["NovaSky-AI/Sky-T1-32B-Preview"]),
    ("deepseek-r1-distill", [
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    ]),
    ("llama", [
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct",
    ]),
]

# Full model identifier -> name of the prompt-template series it uses.
MODEL_SERIES_MAP = {
    member: series_name
    for series_name, members in _SERIES_MEMBERS
    for member in members
}


# Series name -> prompt types for that series, one per suffix in PROMP_LIST
# and in the same order as PROMP_LIST.
MODEL_SERIES_PROMPT_TYPE_MAP = {
    series_name: [series_name + suffix for suffix in PROMP_LIST]
    for series_name in (
        "qwen",
        "qwen-math",
        "internlm",
        "mistral",
        "gemma",
        "phi3mini",
        "phi3small",
        "phi3medium",
        "phi4",
        "deepseek-r1-distill",
        "llama",
    )
}

In [66]:
def _stepped_budgets(*segments):
    """Concatenate ``range(start, stop, step)`` segments into one flat list."""
    return [
        value
        for start, stop, step in segments
        for value in range(start, stop, step)
    ]


def gen_budget_list(budget, data_name, model):
    """Return the list of token budgets evaluated for a model on a dataset.

    Args:
        budget: Mode selector. ``-1`` yields the single sentinel budget
            ``[-1]``; ``1`` yields the full budget sweep.
        data_name: Dataset name. The sweep is defined for ``"gsm8k"``,
            ``"math"`` and ``"math500"``.
        model: Full model identifier. Models in the o1-like list below get a
            longer sweep than the others.

    Returns:
        A list of ints. Sweeps start at 25 and use 25-token steps at first,
        then coarser 50- and 100-token steps (see the segments below).
        For any ``budget`` other than ``-1`` or ``1`` the function returns
        ``None``, as the original implementation implicitly did.

    Raises:
        ValueError: if ``budget == 1`` and ``data_name`` is not supported.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    if budget == -1:
        return [-1]
    if budget != 1:
        # Kept for backward compatibility: other modes produce no list.
        return None

    o1_like_models = (
        "Qwen/QwQ-32B-Preview",
        "Skywork/Skywork-o1-Open-Llama-3.1-8B",
        "PowerInfer/SmallThinker-3B-Preview",
        "NovaSky-AI/Sky-T1-32B-Preview",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    )
    # maybe should extend to longer sequence (4096 / 8192 are not included)
    is_o1_like = model in o1_like_models

    if data_name == "gsm8k":
        if is_o1_like:
            # 25..275, 300..550, 600..1200
            return _stepped_budgets((25, 300, 25), (300, 600, 50), (600, 1201, 100))
        # 25..275, 300..600
        return _stepped_budgets((25, 300, 25), (300, 601, 50))

    if data_name in ("math", "math500"):
        if is_o1_like:
            # 25..575, 600..2400
            return _stepped_budgets((25, 600, 25), (600, 2401, 100))
        # 25..275, 300..550, 600..1200
        return _stepped_budgets((25, 300, 25), (300, 600, 50), (600, 1201, 100))

    raise ValueError(
        f"Unsupported data_name {data_name!r}; expected 'gsm8k', 'math' or 'math500'"
    )

In [67]:
# Dataset whose metrics are loaded. It is passed to gen_budget_list and is
# also a directory component of every metrics file path.
# gen_budget_list only builds budget sweeps for "gsm8k", "math" and "math500".
dataset = "gsm8k"   
# Fixed fragment of every metrics file name, inserted between the prompt type
# and the "_b<budget>" suffix by the loading cell.
# NOTE(review): presumably encodes run settings (seed, temperature, ...) -
# confirm against the script that writes these files.
something = "_-1_seed0_t0.0_s0_e-1"

In [68]:
# model -> {prompt_type -> [accuracy at each budget of that model's sweep]}
model_acc_dicts = {}

# Models treated as o1-like; the same names are listed inside gen_budget_list.
o1_like_models = [
    "Qwen/QwQ-32B-Preview",
    "Skywork/Skywork-o1-Open-Llama-3.1-8B",
    "PowerInfer/SmallThinker-3B-Preview",
    "NovaSky-AI/Sky-T1-32B-Preview",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
]

for model in model_list:
    budget_list = gen_budget_list(1, dataset, model)
    series_prompt_types = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
    per_prompt_acc = {}
    for prompt_type in series_prompt_types:
        # All metrics files of one (model, prompt type, dataset) share a directory.
        metrics_dir = os.path.join(file_dir, model, prompt_type, dataset)
        accuracies = []
        for budget in budget_list:
            metrics_name = f"test_{prompt_type}{something}_b{int(budget)}_metrics.json"
            with open(os.path.join(metrics_dir, metrics_name), "r") as handle:
                metrics = json.load(handle)
            # Only the "acc" field of each metrics file is kept.
            accuracies.append(metrics["acc"])
        per_prompt_acc[prompt_type] = accuracies
    model_acc_dicts[model] = per_prompt_acc

model_acc_dicts

{'NovaSky-AI/Sky-T1-32B-Preview': {'qwen-aav': [39.3,
   46.6,
   59.2,
   69.5,
   77.9,
   83.7,
   88.5,
   91.1,
   92.6,
   93.8,
   94.6,
   94.5,
   94.9,
   95.0,
   95.1,
   94.8,
   95.1,
   95.1,
   94.9,
   94.8,
   95.0,
   95.0,
   95.0,
   95.1],
  'qwen-c2f': [38.7,
   44.6,
   50.2,
   56.0,
   62.9,
   70.0,
   76.1,
   82.6,
   85.4,
   87.6,
   90.0,
   90.8,
   92.6,
   93.6,
   94.5,
   94.7,
   94.6,
   94.8,
   95.3,
   95.1,
   95.2,
   95.4,
   95.5,
   95.5],
  'qwen-sbs': [37.3,
   38.4,
   40.7,
   44.0,
   50.9,
   58.5,
   65.0,
   72.6,
   77.6,
   81.4,
   85.8,
   89.0,
   92.6,
   94.2,
   95.6,
   96.0,
   95.9,
   95.8,
   95.9,
   95.9,
   96.1,
   96.2,
   96.1,
   96.1]},
 'Qwen/QwQ-32B-Preview': {'qwen-aav': [38.5,
   40.3,
   45.6,
   49.5,
   53.1,
   60.9,
   66.9,
   73.3,
   78.1,
   81.0,
   84.8,
   88.4,
   91.1,
   93.1,
   94.0,
   94.6,
   94.9,
   94.8,
   95.1,
   95.5,
   95.6,
   95.2,
   95.3,
   95.5],
  'qwen-c2f': [36.4,
   41

In [69]:


import pandas as pd

# Create dataframes for each budget point
# `budgets` lists the token budgets to tabulate; `results` maps each budget to
# the pandas DataFrame built for it further down in this cell.
budgets = [50, 150]
# budgets = [4096, 8192]
results = {}

# for budget in budgets:
#     data = []
#     for model in model_list:
#         model_prompt_list = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
#         check_exist = False
#         if model in o1_like_models:
#             for prompt_type in model_prompt_list:
#                 acc = model_acc_dicts[model][prompt_type][int(budget/4096 - 3)]
#                 model_name = model.split('/')[-1]
#                 if not check_exist:
#                     data.append({
#                         'Model': model_name,
#                         # 'Prompt Type': prompt_type,
#                         'Budget': budget,
#                         'Accuracy' + " " + PROMP_LIST[model_prompt_list.index(prompt_type)]: acc
#                     })
#                 else:
#                     data[-1]['Accuracy' + " " + PROMP_LIST[model_prompt_list.index(prompt_type)]]= acc
#                 check_exist = True
#             results[budget] = pd.DataFrame(data)
#         else:
#             for prompt_type in model_prompt_list:
#                 acc = model_acc_dicts[model][prompt_type][-1]
#                 model_name = model.split('/')[-1]
#                 if not check_exist:
#                     data.append({
#                         'Model': model_name,
#                         # 'Prompt Type': prompt_type,
#                         'Budget': budget,
#                         'Accuracy' + " " + PROMP_LIST[model_prompt_list.index(prompt_type)]: acc
#                     })
#                 else:
#                     data[-1]['Accuracy' + " " + PROMP_LIST[model_prompt_list.index(prompt_type)]]= acc
#                 check_exist = True
#             results[budget] = pd.DataFrame(data)

# Build one accuracy table per requested budget: one row per model, one
# "Accuracy <suffix>" column per prompt strategy in PROMP_LIST.
for budget in budgets:
    data = []

    for model in model_list:
        model_name = model.split('/')[-1]

        # The accuracy lists in model_acc_dicts were filled in the order of
        # gen_budget_list(1, dataset, model), so the position of a budget must
        # be looked up in that list. The sweep is not uniformly spaced (25-,
        # then 50-, then 100-token steps), so arithmetic such as
        # budget / 25 - 1 only works for budgets up to 300.
        model_budgets = gen_budget_list(1, dataset, model)
        if budget not in model_budgets:
            raise ValueError(
                f"Budget {budget} was not evaluated for {model} on {dataset}; "
                f"available budgets: {model_budgets}"
            )
        budget_index = model_budgets.index(budget)

        row = {'Model': model_name, 'Budget': budget}
        model_prompt_list = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
        # model_prompt_list is built from PROMP_LIST in order, so zip pairs each
        # prompt type with its suffix.
        for prompt_suffix, prompt_type in zip(PROMP_LIST, model_prompt_list):
            row['Accuracy' + " " + prompt_suffix] = model_acc_dicts[model][prompt_type][budget_index]

        data.append(row)

    results[budget] = pd.DataFrame(data)

# Print each budget's table as plain text, then export it to
# accuracy_table_<budget>.csv in the current working directory.
for budget in budgets:
    table = results[budget]
    heading = f"\nAccuracy Table for {budget} tokens:"
    print(heading)
    print(table.to_string(index=False))

    csv_name = f'accuracy_table_{budget}.csv'
    table.to_csv(csv_name, index=False)


Accuracy Table for 50 tokens:
                        Model  Budget  Accuracy -aav  Accuracy -c2f  Accuracy -sbs
           Sky-T1-32B-Preview      50           46.6           44.6           38.4
              QwQ-32B-Preview      50           40.3           41.0           36.2
         Qwen2.5-32B-Instruct      50           57.6           47.8           36.2
         Qwen2.5-14B-Instruct      50           46.3           40.4           24.3
          Qwen2.5-7B-Instruct      50           46.4           35.0           20.5
          Qwen2.5-3B-Instruct      50           13.4           21.5           12.4
        Qwen2.5-1.5B-Instruct      50           21.2           19.8            9.8
   Qwen2.5-Math-1.5B-Instruct      50           16.7           15.0           15.9
     Qwen2.5-Math-7B-Instruct      50           29.4           30.9           29.6
  Mistral-Small-Instruct-2409      50           31.8           23.1            6.6
   Mistral-Nemo-Instruct-2407      50           24.2    

In [70]:
# for prompt in ["-quick", "-direct", "-sbs", "-c2f", "-aav"]:
    
    
#     # Create a 3x6 grid of subplots (one panel per model)
#     fig, axs = plt.subplots(3, 6, figsize=(20, 10))
#     marker_size = 5  # Set the desired marker size
#     line_width = 2   # Set the desired line width

#     # Flatten the axes array for easy iteration
#     axs = axs.flatten()

#     # Plot all lines first without labels
#     for index, model in enumerate(model_list):
#         budget_list = gen_budget_list(1, dataset, model)
#         model_prompt_list = [MODEL_SERIES_MAP[model] + prompt, MODEL_SERIES_MAP[model] + prompt + "-hard"]
#         for prompt_type in model_prompt_list:
#             if "sbs-hard" in prompt_type:
#                 axs[index].plot(budget_list[1:], model_acc_dicts[model][prompt_type][1:], linestyle='--', color="red", label="sbs-hard", linewidth=line_width)
#             elif "direct-hard" in prompt_type:
#                 axs[index].plot(budget_list[1:], model_acc_dicts[model][prompt_type][1:], linestyle='--', color="orange", label="direct-hard", linewidth=line_width)
#             elif "quick-hard" in prompt_type:
#                 axs[index].plot(budget_list[1:], model_acc_dicts[model][prompt_type][1:], linestyle='--', color="pink", label="quick-hard", linewidth=line_width)
#             elif "c2f-hard" in prompt_type:
#                 axs[index].plot(budget_list[1:], model_acc_dicts[model][prompt_type][1:], linestyle='--', color="green", label="c2f-hard", linewidth=line_width)
#             elif "aav-hard" in prompt_type:
#                 axs[index].plot(budget_list[1:], model_acc_dicts[model][prompt_type][1:], linestyle='--', color="purple", label="aav-hard", linewidth=line_width)
#             elif "sbs" in prompt_type:
#                 axs[index].plot([budget + 25 for budget in budget_list], model_acc_dicts[model][prompt_type], linestyle='-', color="red", label="sbs", linewidth=line_width)
#             elif "direct" in prompt_type:
#                 axs[index].plot([budget + 25 for budget in budget_list], model_acc_dicts[model][prompt_type], linestyle='-', color="orange", label="direct", linewidth=line_width)
#             elif "quick" in prompt_type:
#                 axs[index].plot([budget + 25 for budget in budget_list], model_acc_dicts[model][prompt_type], linestyle='-', color="pink", label="quick", linewidth=line_width)
#             elif "c2f" in prompt_type:
#                 axs[index].plot([budget + 25 for budget in budget_list], model_acc_dicts[model][prompt_type], linestyle='-', color="green", label="c2f", linewidth=line_width)
#             # elif "kf" in prompt_type:
#             #     axs[index].plot([budget + 25 for budget in budget_list], model_acc_dicts[model][prompt_type], linestyle='-', color="blue", label="kf", linewidth=line_width)
#             elif "aav" in prompt_type:
#                 axs[index].plot([budget + 25 for budget in budget_list], model_acc_dicts[model][prompt_type], linestyle='-', color="purple", label="aav", linewidth=line_width)
            
#             axs[index].set_xlabel("Token Budget")
#             axs[index].set_ylabel("Accuracy")
#             axs[index].set_title(f"{model.split('/')[-1]}")
#             axs[index].legend()
            
#             # Set y-axis and x-axis ticks
#             axs[index].set_yticks(range(0, 91, 10))  # Set y-axis interval to 10
#             # axs[index].set_xticks(range(0, 551, 50))  # Set x-axis interval to 50

#     # set the title
#     plt.suptitle(f"{dataset}".upper())
#     # Adjust layout
#     plt.tight_layout()