In [29]:
from matplotlib import pyplot as plt
import os
import numpy as np
import json
from sklearn.linear_model import LinearRegression
import pandas as pd


# Root directory of the evaluation outputs. Per-budget metrics JSON files are read from
# <file_dir>/<model>/<prompt_type>/<dataset>/.
# NOTE(review): this and latency_path are absolute, machine-specific paths; adjust
# them before running the notebook on another host.
file_dir = "/data03/sunyi/time_constrained_cot/outputs/2_6"
# Directory holding one latency CSV per model, named "<model name without org>_a800.csv".
latency_path = "/home/sunyi/CoT/Time-Constrained-CoT/latency/0210"
# Models to aggregate, as "org/name" identifiers. Every entry is looked up in
# MODEL_SERIES_MAP and needs a matching latency CSV under latency_path.
# Commented-out entries are excluded from the run.
model_list = [
    "Qwen/QwQ-32B-Preview",
    # "Skywork/Skywork-o1-Open-Llama-3.1-8B", 
    # "PowerInfer/SmallThinker-3B-Preview",
    "NovaSky-AI/Sky-T1-32B-Preview",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "Qwen/Qwen2.5-Math-1.5B-Instruct",
    "Qwen/Qwen2.5-Math-7B-Instruct",
    "mistralai/Mathstral-7B-v0.1",
    "Qwen/Qwen2.5-32B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "mistralai/Mistral-Small-Instruct-2409",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "mistralai/Ministral-8B-Instruct-2410",
    "google/gemma-2-27b-it",
    "google/gemma-2-9b-it",
    "google/gemma-2-2b-it",
    "microsoft/Phi-3-medium-128k-instruct",
    "microsoft/Phi-3-small-128k-instruct",
    "microsoft/Phi-3-mini-128k-instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "microsoft/phi-4",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
]

In [30]:
# Suffixes appended to a series name to form that series' prompt-type identifiers.
PROMP_LIST = ["-aav", "-c2f", "-sbs"]


# Model identifier ("org/name") -> name of the prompt series it uses.
# Insertion order is kept as-is in case anything iterates over this mapping.
MODEL_SERIES_MAP = {
    # Qwen general-purpose models
    "Qwen/QwQ-32B-Preview": "qwen",
    "Qwen/Qwen2.5-32B-Instruct": "qwen",
    "Qwen/Qwen2.5-14B-Instruct": "qwen",
    "Qwen/Qwen2.5-7B-Instruct": "qwen",
    "Qwen/Qwen2.5-3B-Instruct": "qwen",
    "Qwen/Qwen2.5-1.5B-Instruct": "qwen",
    # Qwen math models
    "Qwen/Qwen2.5-Math-1.5B-Instruct": "qwen-math",
    "Qwen/Qwen2.5-Math-7B-Instruct": "qwen-math",
    # InternLM
    "internlm/internlm2_5-1_8b-chat": "internlm",
    "internlm/internlm2_5-7b-chat": "internlm",
    "internlm/internlm2_5-20b-chat": "internlm",
    # Gemma
    "google/gemma-2-2b-it": "gemma",
    "google/gemma-2-9b-it": "gemma",
    "google/gemma-2-27b-it": "gemma",
    # Mistral
    "mistralai/Mathstral-7B-v0.1": "mistral",
    "mistralai/Ministral-8B-Instruct-2410": "mistral",
    "mistralai/Mistral-Nemo-Instruct-2407": "mistral",
    "mistralai/Mistral-Small-Instruct-2409": "mistral",
    # Phi (each size has its own series; both "mini" variants share one)
    "microsoft/phi-4": "phi4",
    "microsoft/Phi-3-medium-128k-instruct": "phi3medium",
    "microsoft/Phi-3-small-128k-instruct": "phi3small",
    "microsoft/Phi-3.5-mini-instruct": "phi3mini",
    "microsoft/Phi-3-mini-128k-instruct": "phi3mini",
    # Sky-T1 is mapped to the "qwen" series
    "NovaSky-AI/Sky-T1-32B-Preview": "qwen",
    # DeepSeek-R1 distilled models
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-r1-distill",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": "deepseek-r1-distill",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": "deepseek-r1-distill",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": "deepseek-r1-distill",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": "deepseek-r1-distill",
    # Llama
    "meta-llama/Llama-3.2-3B-Instruct": "llama",
    "meta-llama/Llama-3.2-1B-Instruct": "llama",
    "meta-llama/Llama-3.1-8B-Instruct": "llama",
}


# Series name -> its prompt types, i.e. the series name followed by each suffix
# in PROMP_LIST (e.g. "qwen" -> ["qwen-aav", "qwen-c2f", "qwen-sbs"]).
MODEL_SERIES_PROMPT_TYPE_MAP = {
    series: [series + prompt for prompt in PROMP_LIST]
    for series in (
        "qwen",
        "qwen-math",
        "internlm",
        "mistral",
        "gemma",
        "phi3mini",
        "phi3small",
        "phi3medium",
        "phi4",
        "deepseek-r1-distill",
        "llama",
    )
}

In [31]:
def _token_budget_schedule(data_name, long_schedule):
    """Build the ascending list of token budgets swept for one dataset.

    Args:
        data_name: "gsm8k", "math" or "math500".
        long_schedule: True selects the extended sweep that
            ``gen_budget_latency_list`` applies to its ``o1_like_models``.

    Returns:
        A list of ints; step size grows (25 -> 50 -> 100) as budgets get larger.

    Raises:
        ValueError: if ``data_name`` is not a supported dataset.
    """
    if data_name == "gsm8k":
        if long_schedule:
            segments = [range(25, 300, 25), range(300, 600, 50), range(600, 1201, 100)]
        else:
            segments = [range(25, 300, 25), range(300, 601, 50)]
    elif data_name in ("math", "math500"):
        if long_schedule:
            segments = [range(25, 600, 25), range(600, 2401, 100)]
        else:
            segments = [range(25, 300, 25), range(300, 600, 50), range(600, 1201, 100)]
    else:
        # Previously this case left the schedule undefined and surfaced later
        # as an UnboundLocalError; fail early with an actionable message.
        raise ValueError(
            f"Unsupported data_name {data_name!r}; expected 'gsm8k', 'math' or 'math500'"
        )
    return [tokens for segment in segments for tokens in segment]


def gen_budget_latency_list(budget, data_name, model, prompt_type):
    """Return the token budgets to evaluate and the latency predicted for each.

    Latency is estimated with a linear regression fitted on the measurements
    stored in ``<latency_path>/<model name without org>_a800.csv`` for
    64/128/256/512/1024 tokens (``latency_path`` is a notebook-level global).

    Args:
        budget: Mode selector. ``-1`` means "no budget" and yields ``[-1]``;
            ``1`` generates the budget sweep. Any other value yields ``None``
            (no schedule is defined for it).
        data_name: "gsm8k", "math" or "math500".
        model: Model identifier in "org/name" form; models listed in
            ``o1_like_models`` get a longer sweep.
        prompt_type: Prompt type name. Unless it contains "hard", latency is
            predicted at ``budget + 25`` tokens while the nominal budgets are
            returned.

    Returns:
        ``[-1]`` when ``budget == -1``; ``None`` when ``budget`` is neither -1
        nor 1; otherwise a tuple ``(budget_list, latency_list)`` where
        ``budget_list`` holds the nominal budgets (a list, or a 1-D ndarray
        when "hard" is in ``prompt_type``) and ``latency_list`` is a 1-D
        ndarray of predicted latencies in the same order.

    Raises:
        ValueError: if ``budget == 1`` and ``data_name`` is not supported.

    Side effects:
        Reads the latency CSV and prints the fitted slope/intercept and R^2.
    """
    if budget == -1:
        return [-1]
    if budget != 1:
        # No schedule exists for other selectors; keep the historical result.
        return None

    o1_like_models = [
        "Qwen/QwQ-32B-Preview",
        "Skywork/Skywork-o1-Open-Llama-3.1-8B",
        "PowerInfer/SmallThinker-3B-Preview",
        "NovaSky-AI/Sky-T1-32B-Preview",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    ]
    # maybe should extend to longer sequence (for the o1-like models)
    nominal_budgets = _token_budget_schedule(data_name, model in o1_like_models)

    # Non-"hard" prompts are timed at budget + 25 tokens.
    # NOTE(review): presumably these prompts let generation run ~25 tokens past
    # the nominal budget -- confirm against the generation code.
    extra_tokens = 0 if "hard" in prompt_type else 25
    timed_tokens = np.array(nominal_budgets) + extra_tokens

    measured_tokens = np.array([64, 128, 256, 512, 1024]).reshape(-1, 1)
    csv_name = model.split("/")[-1] + "_a800.csv"
    df = pd.read_csv(os.path.join(latency_path, csv_name))
    # iloc[1] is the SECOND data row; the first column is skipped.
    # NOTE(review): the original comment said "first row" -- confirm which row
    # of the CSV is the intended measurement.
    latencies = df.iloc[1, 1:].values

    # Fit latency as a linear function of token count on the measured points.
    linear_model = LinearRegression()
    linear_model.fit(measured_tokens, latencies)
    print(f"File: {csv_name} - Linear Fit: Slope = {linear_model.coef_[0]:.2f}, Intercept = {linear_model.intercept_:.2f}")
    print(f"File: {csv_name} - R^2 Score: {linear_model.score(measured_tokens, latencies):.2f}")

    # Turn the (possibly shifted) budgets into predicted latencies.
    latency_list = linear_model.predict(timed_tokens.reshape(-1, 1)).reshape(-1)

    # Report the nominal budgets, undoing the timing shift where one was applied.
    if extra_tokens:
        budget_list = [tokens - extra_tokens for tokens in timed_tokens]
    else:
        budget_list = timed_tokens

    return budget_list, latency_list

In [32]:
# Benchmark to aggregate; used as the output sub-directory name and passed to
# gen_budget_latency_list to select the budget sweep.
dataset = "gsm8k"
# Fixed middle part of every metrics file name, placed between the prompt type
# and the "_b<budget>_metrics.json" suffix.
# NOTE(review): looks like it encodes run settings (seed, temperature, ...) -- confirm.
something = "_-1_seed0_t0.0_s0_e-1"

In [33]:
# Stage 1: for every model and each of its prompt types, read the accuracy
# recorded at each token budget and keep it next to the predicted latencies.
# Both results are nested as {model: {prompt_type: [value per budget]}}.
model_acc_dicts = {}
model_latency_dicts = {}

for model in model_list:
    model_prompt_list = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
    model_acc_dict = {}
    model_latency_dict = {}

    for prompt_type in model_prompt_list:
        budget_list, latency_list = gen_budget_latency_list(1, dataset, model, prompt_type)

        acc_list = []
        for budget in budget_list:
            # One metrics file per budget, stored under <model>/<prompt_type>/<dataset>/.
            file_name = f"test_{prompt_type}{something}_b{int(budget)}_metrics.json"
            file_path = os.path.join(file_dir, model, prompt_type, dataset, file_name)
            with open(file_path, "r") as f:
                metrics = json.load(f)
            acc_list.append(metrics["acc"])

        model_acc_dict[prompt_type] = acc_list
        model_latency_dict[prompt_type] = latency_list

    model_acc_dicts[model] = model_acc_dict
    model_latency_dicts[model] = model_latency_dict

# Stage 2: flatten the nested results into one record per
# (model, prompt type, budget position) and export them as CSV.
data = []

for model, acc_dict in model_acc_dicts.items():
    latency_dict = model_latency_dicts[model]
    short_name = model.split('/')[-1]  # drop the "org/" prefix

    for prompt_type, acc_list in acc_dict.items():
        latency_list = latency_dict[prompt_type]

        # Accuracies and latencies share the budget order, so pair them by position.
        for idx, (acc, latency) in enumerate(zip(acc_list, latency_list)):
            data.append({
                'Model': short_name,
                'Prompt_Type': prompt_type,
                'Budget_Index': idx,
                'Latency': latency,
                'Accuracy': acc
            })

# Convert to DataFrame and save
df = pd.DataFrame(data)
df.to_csv('model_performance_data.csv', index=False)

File: QwQ-32B-Preview_a800.csv - Linear Fit: Slope = 0.05, Intercept = -0.29
File: QwQ-32B-Preview_a800.csv - R^2 Score: 1.00
File: QwQ-32B-Preview_a800.csv - Linear Fit: Slope = 0.05, Intercept = -0.29
File: QwQ-32B-Preview_a800.csv - R^2 Score: 1.00
File: QwQ-32B-Preview_a800.csv - Linear Fit: Slope = 0.05, Intercept = -0.29
File: QwQ-32B-Preview_a800.csv - R^2 Score: 1.00
File: Sky-T1-32B-Preview_a800.csv - Linear Fit: Slope = 0.05, Intercept = -0.30
File: Sky-T1-32B-Preview_a800.csv - R^2 Score: 1.00
File: Sky-T1-32B-Preview_a800.csv - Linear Fit: Slope = 0.05, Intercept = -0.30
File: Sky-T1-32B-Preview_a800.csv - R^2 Score: 1.00
File: Sky-T1-32B-Preview_a800.csv - Linear Fit: Slope = 0.05, Intercept = -0.30
File: Sky-T1-32B-Preview_a800.csv - R^2 Score: 1.00
File: DeepSeek-R1-Distill-Qwen-1.5B_a800.csv - Linear Fit: Slope = 0.02, Intercept = 0.06
File: DeepSeek-R1-Distill-Qwen-1.5B_a800.csv - R^2 Score: 1.00
File: DeepSeek-R1-Distill-Qwen-1.5B_a800.csv - Linear Fit: Slope = 0.02, 

In [34]:
# dataset = "math500"
# something = "_-1_seed0_t0.0_s0_e-1"

In [35]:
# model_acc_dicts = {}
# model_latency_dicts = {}

# for model in model_list:
#     model_acc_dict = {}

#     model_prompt_list = MODEL_SERIES_PROMPT_TYPE_MAP[MODEL_SERIES_MAP[model]]
#     for prompt_type in model_prompt_list:
#         budget_list, latency_list = gen_budget_latency_list(1, dataset, model, prompt_type)
#         acc_list = []
#         for budget in budget_list:
#             file_name = "test_" + prompt_type + something + "_b" + str(int(budget)) + "_metrics.json"
#             file_path = os.path.join(file_dir, model, prompt_type, dataset, file_name)
#             with open(file_path, "r") as f:
#                 acc_list.append(json.load(f)["acc"])
#         model_acc_dict[prompt_type] = acc_list
#     model_acc_dicts[model] = model_acc_dict
#     model_latency_dicts[model] = latency_list

# model_latency_dicts