In [185]:
from matplotlib import pyplot as plt
import os
import numpy as np
import json
import pandas as pd

# Root directory of the evaluation outputs; metrics files are looked up below it
# as <file_dir>/<model>/<prompt_type>/<dataset>/<file_name>.
file_dir = "/data03/sunyi/time_constrained_cot/outputs/1_10"

# Reasoning ("o1-like") models included in this analysis.
# Currently excluded (re-add an entry to bring the model back):
#   Skywork/Skywork-o1-Open-Llama-3.1-8B
#   PowerInfer/SmallThinker-3B-Preview
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
#   deepseek-ai/DeepSeek-R1-Distill-Llama-8B
o1_like_models = [
    "Qwen/QwQ-32B-Preview",
    "NovaSky-AI/Sky-T1-32B-Preview",
]

# Instruction-tuned (non-o1) models, grouped by vendor. The concatenation order
# below fixes the row order of the exported tables.
qwen_instruct_models = [
    "Qwen/Qwen2.5-32B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-7B-Instruct",
]
mistral_instruct_models = [
    "mistralai/Mistral-Small-Instruct-2409",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "mistralai/Ministral-8B-Instruct-2410",
    "mistralai/Mathstral-7B-v0.1",
]
gemma_instruct_models = [
    "google/gemma-2-27b-it",
    "google/gemma-2-9b-it",
    "google/gemma-2-2b-it",
]
phi_instruct_models = [
    "microsoft/phi-4",
    "microsoft/Phi-3-medium-128k-instruct",
    "microsoft/Phi-3-small-128k-instruct",
    "microsoft/Phi-3-mini-128k-instruct",
    "microsoft/Phi-3.5-mini-instruct",
]
no_o1_models = [
    *qwen_instruct_models,
    *mistral_instruct_models,
    *gemma_instruct_models,
    *phi_instruct_models,
]

# Every model handled by the notebook: o1-like models first, then the rest.
model_list = [*o1_like_models, *no_o1_models]

# Prompt-type suffixes. Each prompting style exists in a plain and a "-hard"
# variant; the suffix is appended to a model-series name to form a prompt type.
# (For a quick run, restrict prompt_styles to ["sbs"].)
prompt_styles = ["aav", "c2f", "sbs", "direct", "quick"]
PROMP_HARD_LIST = [f"-{style}-hard" for style in prompt_styles]
PROMP_LIST = [f"-{style}" for style in prompt_styles] + PROMP_HARD_LIST

In [186]:
# Models grouped by the prompt-template series they use. The series name is the
# prefix of every prompt type built for these models.
series_to_models = {
    "qwen": [
        "NovaSky-AI/Sky-T1-32B-Preview",
        "Qwen/QwQ-32B-Preview",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-3B-Instruct",
        "Qwen/Qwen2.5-1.5B-Instruct",
    ],
    "qwen-math": [
        "Qwen/Qwen2.5-Math-1.5B-Instruct",
        "Qwen/Qwen2.5-Math-7B-Instruct",
    ],
    "gemma": [
        "google/gemma-2-2b-it",
        "google/gemma-2-9b-it",
        "google/gemma-2-27b-it",
    ],
    "mistral": [
        "mistralai/Mathstral-7B-v0.1",
        "mistralai/Ministral-8B-Instruct-2410",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "mistralai/Mistral-Small-Instruct-2409",
    ],
    "phi4": ["microsoft/phi-4"],
    "phi3medium": ["microsoft/Phi-3-medium-128k-instruct"],
    "phi3small": ["microsoft/Phi-3-small-128k-instruct"],
    "phi3mini": [
        "microsoft/Phi-3.5-mini-instruct",
        "microsoft/Phi-3-mini-128k-instruct",
    ],
    "deepseek-r1-distill": [
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    ],
}

# Model identifier -> prompt-template series (the inverse of the grouping above).
MODEL_SERIES_MAP = {
    model_name: series_name
    for series_name, series_models in series_to_models.items()
    for model_name in series_models
}


# Series -> every prompt type available for it, i.e. the series name followed
# by each suffix in PROMP_LIST (for example "qwen" + "-sbs" -> "qwen-sbs").
MODEL_SERIES_PROMPT_TYPE_MAP = {
    series_name: [series_name + prompt_suffix for prompt_suffix in PROMP_LIST]
    for series_name in (
        "qwen",
        "qwen-math",
        "mistral",
        "gemma",
        "phi3mini",
        "phi3small",
        "phi3medium",
        "phi4",
        "deepseek-r1-distill",
    )
}

In [187]:

def gen_budget_list(budget, data_name, model, prompt_type=""):
    """Return the token budgets to evaluate for one model on one dataset.

    Args:
        budget: Mode flag. ``-1`` returns ``[-1]``. ``1`` returns the sweep
            described below. Any other value returns ``None``, as before.
        data_name: Dataset name; ``"gsm8k"``, ``"math"`` and ``"math500"``
            are supported.
        model: Model identifier. Models in the o1-like list below get a
            longer sweep and extra large budgets for hard prompts.
        prompt_type: Prompt-type name. When it contains ``"hard"`` the large
            budgets are appended. Defaults to ``""`` (plain sweep only) so
            that callers passing only three arguments keep working.

    Returns:
        A new list of int budgets in ascending order, ``[-1]``, or ``None``.

    Raises:
        ValueError: If ``budget == 1`` and ``data_name`` is not supported
            (previously this surfaced as an UnboundLocalError).
    """
    if budget == -1:
        return [-1]
    if budget != 1:
        # Unknown mode: keep the historical behaviour of returning None.
        return None

    # Deliberately local: this is the full set of o1-like models, independent
    # of which of them the notebook currently enables.
    o1_like_models = [
        "Qwen/QwQ-32B-Preview",
        "Skywork/Skywork-o1-Open-Llama-3.1-8B",
        "PowerInfer/SmallThinker-3B-Preview",
        "NovaSky-AI/Sky-T1-32B-Preview",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    ]
    is_o1_like = model in o1_like_models  # maybe should extend to longer sequence

    # Last budget of the sweep for each (model kind, dataset) combination.
    if is_o1_like:
        sweep_end_by_dataset = {"gsm8k": 1200, "math": 2400, "math500": 2400}
    else:
        sweep_end_by_dataset = {"gsm8k": 600, "math": 1200, "math500": 1200}
    if data_name not in sweep_end_by_dataset:
        raise ValueError(
            f"unsupported data_name {data_name!r}; expected one of {sorted(sweep_end_by_dataset)}"
        )
    sweep_end = sweep_end_by_dataset[data_name]

    # Steps of 25 below 600, then steps of 50 from 600 up to and including
    # sweep_end (when sweep_end is 600 the second range is just [600]).
    budget_list = list(range(25, 600, 25)) + list(range(600, sweep_end + 1, 50))

    if "hard" in prompt_type:
        budget_list.append(4096)
        if is_o1_like:
            budget_list.extend([8192, 12288, 16384])

    return budget_list

In [188]:
# Dataset analysed by the cells below: passed to gen_budget_list as data_name
# and used as a path component for both the metrics files and the exported CSVs.
dataset = "gsm8k"
# Fixed middle part of every metrics file name, which is assembled as
# "test_" + prompt_type + something + "_b" + budget + "_metrics.json".
# NOTE(review): presumably encodes the run settings (seed, temperature,
# start/end index) — confirm against the evaluation script.
something = "_-1_seed0_t0.0_s0_e-1"

In [189]:
# Load the accuracy of every (model, prompt type) pair at every budget of the sweep.
model_acc_dicts = {}   # model -> {prompt type -> [accuracy per budget]}
prompt_acc_dicts = {}  # prompt suffix without leading "-" -> {model -> [accuracy per budget]}

for model in model_list:
    model_acc_dict = {}
    # gen_budget_list takes four arguments; the former three-argument call
    # raised TypeError. prompt_type="" selects the plain sweep (no extra
    # "hard" budgets), so every prompt type of a model shares one budget list
    # and the lists can later be stacked into a single table.
    budget_list = gen_budget_list(1, dataset, model, prompt_type="")
    model_prompt_list = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
    for prompt_type in model_prompt_list:
        acc_list = []
        for budget in budget_list:
            file_name = "test_" + prompt_type + something + "_b" + str(int(budget)) + "_metrics.json"
            file_path = os.path.join(file_dir, model, prompt_type, dataset, file_name)
            # A missing metrics file raises FileNotFoundError here, which
            # points directly at the run that has not finished.
            with open(file_path, "r") as f:
                acc_list.append(json.load(f)["acc"])
        model_acc_dict[prompt_type] = acc_list
    model_acc_dicts[model] = model_acc_dict

# Re-index the same numbers by prompt suffix so models can be compared per prompt.
for prompt_type in PROMP_LIST:
    prompt_acc_dict = {}
    for model in model_list:
        prompt_acc_dict[model] = model_acc_dicts[model][MODEL_SERIES_MAP[model] + prompt_type]
    prompt_acc_dicts[prompt_type[1:]] = prompt_acc_dict

display(model_acc_dicts)
display(prompt_acc_dicts)

In [190]:
# Export the budget-sweep accuracies as CSV tables, split into o1-like and
# non-o1 models:
#   different-prompts/<group>/<prompt>.csv  rows = models,       columns = budgets
#   different-models/<group>/<model>.csv    rows = prompt types, columns = budgets
dir1 = "o1-like-models"
dir2 = "no-o1-models"
for table_kind in ("different-prompts/", "different-models/"):
    for group_dir in (dir1, dir2):
        os.makedirs(os.path.join(file_dir, dataset, table_kind + group_dir), exist_ok=True)

# (output sub-directory, models in that group); o1-like models are written first.
model_groups = [(dir1, o1_like_models), (dir2, no_o1_models)]

# One table per prompt suffix and model group.
for prompt_type in PROMP_LIST:
    file_name = prompt_type[1:] + ".csv"
    for group_dir, group_models in model_groups:
        # gen_budget_list takes four arguments; the former three-argument call
        # raised TypeError. prompt_type="" selects the plain sweep, matching
        # the budgets the accuracies were loaded for. The first model of the
        # group stands in for all of them, so every model in a group must
        # share the same sweep.
        budget_list = gen_budget_list(1, dataset, group_models[0], prompt_type="")
        rows = [prompt_acc_dicts[prompt_type[1:]][model] for model in group_models]
        df = pd.DataFrame(rows,
                          index=[model.split("/")[-1] for model in group_models],  # model names as the row index
                          columns=budget_list)                                     # budget values as the column names
        out_path = os.path.join(file_dir, dataset, "different-prompts/" + group_dir, file_name)
        # Existing files are never overwritten; delete a CSV to regenerate it.
        if not os.path.exists(out_path):
            df.to_csv(out_path)


# One table per model.
for group_dir, group_models in model_groups:
    for model in group_models:
        file_name = model.split("/")[-1] + ".csv"
        prompt_types = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
        budget_list = gen_budget_list(1, dataset, model, prompt_type="")

        rows = [model_acc_dicts[model][prompt_type] for prompt_type in prompt_types]

        df = pd.DataFrame(rows,
                          index=prompt_types,    # prompt types as the row index
                          columns=budget_list)   # budget values as the column names
        out_path = os.path.join(file_dir, dataset, "different-models/" + group_dir, file_name)
        if not os.path.exists(out_path):
            df.to_csv(out_path)



In [191]:
# Collect each model's final accuracy under the "hard" prompts at the large budgets.
model_final_acc_dicts = {}   # model -> {hard prompt type -> [accuracy per large budget]}
prompt_final_acc_dicts = {}  # hard prompt suffix without leading "-" -> {model -> [accuracy per large budget]}

# Large budgets evaluated for non-o1 models and for o1-like models respectively.
budget_list_no_o1 = [4096]
budget_list_o1 = [4096, 8192, 16384]

for model in model_list:
    budget_list = budget_list_o1 if model in o1_like_models else budget_list_no_o1
    model_prompt_list = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
    model_final_acc_dict = {}
    for prompt_type in model_prompt_list:
        # Only the "hard" prompt variants are read here.
        if "hard" in prompt_type:
            acc_final_list = []
            for budget in budget_list:
                file_name = f"test_{prompt_type}{something}_b{int(budget)}_metrics.json"
                file_path = os.path.join(file_dir, model, prompt_type, dataset, file_name)
                with open(file_path, "r") as metrics_file:
                    metrics = json.load(metrics_file)
                acc_final_list.append(metrics["acc"])
            model_final_acc_dict[prompt_type] = acc_final_list
    model_final_acc_dicts[model] = model_final_acc_dict

# Re-index the same numbers by hard prompt suffix.
for prompt_type in PROMP_HARD_LIST:
    prompt_final_acc_dicts[prompt_type[1:]] = {
        model_name: model_final_acc_dicts[model_name][MODEL_SERIES_MAP[model_name] + prompt_type]
        for model_name in model_list
    }

display(model_final_acc_dicts)
display(prompt_final_acc_dicts)

{'Qwen/QwQ-32B-Preview': {'qwen-aav-hard': [95.3, 95.6, 95.2],
  'qwen-c2f-hard': [96.1, 95.4, 96.1],
  'qwen-sbs-hard': [95.9, 95.7, 95.9],
  'qwen-direct-hard': [95.5, 95.7, 95.5],
  'qwen-quick-hard': [95.5, 95.8, 95.5]},
 'NovaSky-AI/Sky-T1-32B-Preview': {'qwen-aav-hard': [94.8, 93.9, 94.7],
  'qwen-c2f-hard': [95.5, 95.1, 95.5],
  'qwen-sbs-hard': [95.8, 95.8, 95.8],
  'qwen-direct-hard': [96.1, 96.0, 96.1],
  'qwen-quick-hard': [95.8, 95.9, 95.8]},
 'Qwen/Qwen2.5-32B-Instruct': {'qwen-aav-hard': [94.5],
  'qwen-c2f-hard': [94.5],
  'qwen-sbs-hard': [96.1],
  'qwen-direct-hard': [96.1],
  'qwen-quick-hard': [95.6]},
 'Qwen/Qwen2.5-14B-Instruct': {'qwen-aav-hard': [94.2],
  'qwen-c2f-hard': [93.4],
  'qwen-sbs-hard': [94.7],
  'qwen-direct-hard': [94.4],
  'qwen-quick-hard': [88.2]},
 'Qwen/Qwen2.5-7B-Instruct': {'qwen-aav-hard': [89.0],
  'qwen-c2f-hard': [89.2],
  'qwen-sbs-hard': [91.9],
  'qwen-direct-hard': [91.7],
  'qwen-quick-hard': [79.7]},
 'Qwen/Qwen2.5-3B-Instruct': {'q

{'aav-hard': {'Qwen/QwQ-32B-Preview': [95.3, 95.6, 95.2],
  'NovaSky-AI/Sky-T1-32B-Preview': [94.8, 93.9, 94.7],
  'Qwen/Qwen2.5-32B-Instruct': [94.5],
  'Qwen/Qwen2.5-14B-Instruct': [94.2],
  'Qwen/Qwen2.5-7B-Instruct': [89.0],
  'Qwen/Qwen2.5-3B-Instruct': [81.5],
  'Qwen/Qwen2.5-1.5B-Instruct': [47.2],
  'Qwen/Qwen2.5-Math-1.5B-Instruct': [85.2],
  'Qwen/Qwen2.5-Math-7B-Instruct': [95.5],
  'mistralai/Mistral-Small-Instruct-2409': [88.9],
  'mistralai/Mistral-Nemo-Instruct-2407': [81.0],
  'mistralai/Ministral-8B-Instruct-2410': [82.0],
  'mistralai/Mathstral-7B-v0.1': [78.6],
  'google/gemma-2-27b-it': [88.2],
  'google/gemma-2-9b-it': [85.9],
  'google/gemma-2-2b-it': [53.2],
  'microsoft/phi-4': [93.9],
  'microsoft/Phi-3-medium-128k-instruct': [84.8],
  'microsoft/Phi-3-small-128k-instruct': [85.7],
  'microsoft/Phi-3-mini-128k-instruct': [76.7],
  'microsoft/Phi-3.5-mini-instruct': [82.3]},
 'c2f-hard': {'Qwen/QwQ-32B-Preview': [96.1, 95.4, 96.1],
  'NovaSky-AI/Sky-T1-32B-Previ

In [None]:
# Export the final ("hard" prompt, large budget) accuracies as CSV tables.
dir1 = "o1-like-models"
dir2 = "no-o1-models"
for table_kind in ("different-prompts/", "different-models/"):
    for group_dir in (dir1, dir2):
        os.makedirs(os.path.join(file_dir, dataset, table_kind + group_dir), exist_ok=True)

# (output sub-directory, models in that group, budgets the final accuracies
# were read at). The columns must be these large budgets: the rows in
# prompt_final_acc_dicts hold one value per large budget, so using the full
# budget sweep as columns made pd.DataFrame raise a shape-mismatch ValueError.
final_groups = [
    (dir1, o1_like_models, budget_list_o1),
    (dir2, no_o1_models, budget_list_no_o1),
]

for prompt_type in PROMP_HARD_LIST:
    # NOTE(review): this is the same path the budget-sweep export uses for the
    # hard prompts (e.g. "aav-hard.csv"). Because existing files are never
    # overwritten, nothing is written here once that export has run — consider
    # a distinct file name or directory for the final-accuracy tables.
    file_name = prompt_type[1:] + ".csv"

    for group_dir, group_models, final_budgets in final_groups:
        rows = [prompt_final_acc_dicts[prompt_type[1:]][model] for model in group_models]
        df = pd.DataFrame(rows,
                          index=[model.split("/")[-1] for model in group_models],  # model names as the row index
                          columns=final_budgets)                                   # budget values as the column names
        out_path = os.path.join(file_dir, dataset, "different-prompts/" + group_dir, file_name)
        if not os.path.exists(out_path):
            df.to_csv(out_path)


# For different models
# NOTE(review): this part still exports the budget-sweep accuracies
# (model_acc_dicts), exactly like the earlier export cell, and is a no-op when
# those CSVs already exist — presumably it was meant to use
# model_final_acc_dicts; confirm the intent.
for group_dir, group_models in ((dir1, o1_like_models), (dir2, no_o1_models)):
    for model in group_models:
        file_name = model.split("/")[-1] + ".csv"
        prompt_types = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
        # gen_budget_list takes four arguments; the former three-argument call
        # raised TypeError. prompt_type="" selects the plain sweep.
        budget_list = gen_budget_list(1, dataset, model, prompt_type="")

        rows = [model_acc_dicts[model][prompt_type] for prompt_type in prompt_types]

        df = pd.DataFrame(rows,
                          index=prompt_types,    # prompt types as the row index
                          columns=budget_list)   # budget values as the column names
        out_path = os.path.join(file_dir, dataset, "different-models/" + group_dir, file_name)
        if not os.path.exists(out_path):
            df.to_csv(out_path)

