In [None]:
from glob import glob
import os
import re
import pandas as pd
# Disable column-width truncation so the long result-file paths stored in the
# "file" column are displayed in full.
pd.set_option("display.max_colwidth", None)

In [None]:
# Root folder that holds every results sub-folder read by the cells below.
# The trailing slash matters: later cells build glob patterns by plain string
# concatenation (data_path + "<folder>/*.csv").
data_path = "/content/drive/MyDrive/CS 159/"
assert os.path.isdir(data_path), (
    f"Results folder not found: {data_path!r} "
    "(is the drive mounted and the path spelled correctly?)"
)

In [None]:
# internal representations
# Builds one summary row per CSV in "internal_representations/".
# Expected basename layout:
#   type-<task>_size-<size>_steps-<steps>_seed-<...>_<model>_<prompt>.csv
internal_rep_files = glob(data_path + "internal_representations/*.csv")
internal_rep_pattern = re.compile(
    r"type-(?P<task_type>.+?)_size-(?P<task_size>[^_]+)"
    r"_steps-(?P<task_steps>[^_]+)_seed-"
)
internal_rep_columns = [
    "file", "created_at", "task_type", "task_size", "task_steps",
    "num_questions", "model", "prompt_type", "accuracy"
]

# Collect plain records and build the frame once: growing a DataFrame row by
# row is quadratic and leaves every column (including accuracy) as `object`.
internal_rep_records = []
for file in internal_rep_files:
  # Parse the basename only, so directory names can never be mistaken for
  # filename fields.
  stem = os.path.splitext(os.path.basename(file))[0]
  task_match = internal_rep_pattern.search(stem)
  name_parts = stem.rsplit("_", 2)  # [<task description>, <model>, <prompt>]
  if task_match is None or len(name_parts) != 3:
    raise ValueError(f"Unexpected internal-representation filename: {file}")
  model_name, prompt_type = name_parts[1], name_parts[2]

  this_df = pd.read_csv(file)
  internal_rep_records.append({
      "file": file,
      "created_at": os.path.getmtime(file),  # seconds since the epoch
      "task_type": task_match.group("task_type"),
      "task_size": task_match.group("task_size"),    # kept as string
      "task_steps": task_match.group("task_steps"),  # kept as string
      "num_questions": this_df.shape[0],
      # Fraction of rows whose "Is Correct" value casts to 1.
      "accuracy": this_df["Is Correct"].astype(int).mean(),
      "model": model_name,
      "prompt_type": prompt_type,
  })

internal_rep_df = pd.DataFrame(internal_rep_records, columns=internal_rep_columns)
# getmtime returns seconds; state the unit instead of hand-scaling to ns.
internal_rep_df["created_at"] = pd.to_datetime(
    internal_rep_df["created_at"], unit="s"
)

internal_rep_df

Unnamed: 0,file,created_at,task_type,task_size,task_steps,num_questions,model,prompt_type,accuracy
0,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama8b_internalrep-baseline.csv,2024-05-24 01:03:38,square,3,4,100,Llama8b,internalrep-baseline,0.19
1,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama70b_internalrep-baseline.csv,2024-05-24 01:11:04,square,3,4,100,Llama70b,internalrep-baseline,0.63
2,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama70b_base.csv,2024-05-24 04:14:28,square,3,4,100,Llama70b,base,0.8
3,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama70b_grid.csv,2024-05-24 04:21:47,square,3,4,100,Llama70b,grid,0.53
4,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama70b_csv.csv,2024-05-24 04:26:28,square,3,4,100,Llama70b,csv,0.56
5,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama70b_coord.csv,2024-05-24 04:35:40,square,3,4,100,Llama70b,coord,0.58
6,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama70b_colbycol.csv,2024-05-24 04:40:38,square,3,4,100,Llama70b,colbycol,0.49
7,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama8b_base.csv,2024-05-24 04:44:00,square,3,4,100,Llama8b,base,0.27
8,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama8b_grid.csv,2024-05-24 04:49:55,square,3,4,100,Llama8b,grid,0.17
9,/content/drive/MyDrive/CS 159/internal_representations/type-square_size-3_steps-4_seed-3_n-100_Llama8b_csv.csv,2024-05-24 04:54:05,square,3,4,100,Llama8b,csv,0.21


In [9]:
# other baselines
# Builds one summary row per evaluation CSV found in the per-model folders.
# Keys are the results folder names; values are the model labels used in the
# summary tables.
model_mappings = {
    "AnthropicClient": "Claude",
    "GPT3Client": "GPT3",
    "GPT4Client": "GPT4",
    "Llama70b": "Llama70b",
    "Llama8b": "Llama8b",
    "OpenAIClient": "`OpenAI`"
}
folders = list(model_mappings.keys())

# Basename layout: type-<task>_size-<size>[_steps-<steps>]_seed-...
# The steps component is optional; task_steps is None when it is absent.
task_pattern = re.compile(
    r"type-(?P<task_type>.+?)_size-(?P<task_size>[^_]+)"
    r"(?:_steps-(?P<task_steps>[^_]+))?_seed-"
)
other_results_columns = [
    "file", "filetype", "task_type", "task_size", "task_steps",
    "num_questions", "created_at", "model", "use_code", "prompt_type",
    "accuracy"
]

# Collect plain records and build the frame once: growing a DataFrame row by
# row is quadratic and leaves every column (including accuracy) as `object`.
other_results_records = []
for folder in folders:
  for file in glob(data_path + folder + "/*.csv"):
    # Parse the basename only, so directory names can never be mistaken for
    # filename fields (e.g. a folder containing "code_" or "no_code").
    stem = os.path.splitext(os.path.basename(file))[0]

    # task type stuff
    task_match = task_pattern.search(stem)
    if task_match is None or "_evaluation" not in stem:
      raise ValueError(f"Unexpected results filename: {file}")
    # Everything before the first "_evaluation" identifies the task file.
    filetype = stem.split("_evaluation")[0]

    # results analysis
    this_df = pd.read_csv(file)

    # code, prompting flags
    # Prompt type is whatever follows the first "code_" in the name, so
    # "..._no_code_baseline" -> "baseline"; None when "code_" is absent.
    prompt_match = re.search(r"code_(.*)$", stem)
    prompting = prompt_match.group(1) if prompt_match else None

    # add to master records
    other_results_records.append({
        "file": file,
        "filetype": filetype,
        "task_type": task_match.group("task_type"),
        "task_size": task_match.group("task_size"),    # kept as string
        "task_steps": task_match.group("task_steps"),  # string or None
        "num_questions": this_df.shape[0],
        "created_at": os.path.getmtime(file),  # seconds since the epoch
        "model": model_mappings[folder],
        "use_code": "no_code" not in stem,
        "prompt_type": prompting,
        # Fraction of rows whose "Is Correct" value casts to 1.
        "accuracy": this_df["Is Correct"].astype(int).mean(),
    })

other_results_df = pd.DataFrame(
    other_results_records, columns=other_results_columns
)
# getmtime returns seconds; state the unit instead of hand-scaling to ns.
other_results_df["created_at"] = pd.to_datetime(
    other_results_df["created_at"], unit="s"
)

other_results_df.head(5)

Unnamed: 0,file,filetype,task_type,task_size,task_steps,num_questions,created_at,model,use_code,prompt_type,accuracy
0,/content/drive/MyDrive/CS 159/AnthropicClient/type-ring_size-12_steps-8_seed-12_n-100_evaluation_no_code_baseline.csv,type-ring_size-12_steps-8_seed-12_n-100,ring,12,8,100,2024-05-23 16:01:30,Claude,False,baseline,0.08
1,/content/drive/MyDrive/CS 159/AnthropicClient/type-ring_size-9_steps-4_seed-9_n-100_evaluation_no_code_baseline.csv,type-ring_size-9_steps-4_seed-9_n-100,ring,9,4,100,2024-05-23 16:04:21,Claude,False,baseline,0.13
2,/content/drive/MyDrive/CS 159/AnthropicClient/type-square_size-3_steps-4_seed-3_n-100_evaluation_no_code_baseline.csv,type-square_size-3_steps-4_seed-3_n-100,square,3,4,100,2024-05-23 16:07:05,Claude,False,baseline,0.5
3,/content/drive/MyDrive/CS 159/AnthropicClient/type-square_size-3_steps-8_seed-3_n-100_evaluation_no_code_baseline.csv,type-square_size-3_steps-8_seed-3_n-100,square,3,8,100,2024-05-23 16:09:51,Claude,False,baseline,0.31
4,/content/drive/MyDrive/CS 159/AnthropicClient/type-square_size-3_steps-8_seed-3_n-100_special_order-random_order_evaluation_no_code_baseline.csv,type-square_size-3_steps-8_seed-3_n-100_special_order-random_order,square,3,8,100,2024-05-23 16:12:55,Claude,False,baseline,0.09


In [None]:
# comparing results
# Mean accuracy for each (prompt_type, model) pair, kept as a one-column frame
# indexed by those two keys.
# NOTE(review): only other_results_df is aggregated here; internal_rep_df is
# not part of this table -- confirm that is intended.
accuracy_by_prompt_and_model = (
    other_results_df
    .groupby(["prompt_type", "model"])["accuracy"]
    .mean()
)
compare_df = accuracy_by_prompt_and_model.to_frame()
compare_df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy
prompt_type,model,Unnamed: 2_level_1
base,GPT4,0.98
base,Llama70b,0.8
base,Llama8b,0.27
baseline,Claude,0.362143
baseline,GPT3,0.095
baseline,GPT4,0.357143
baseline,Llama70b,0.257143
baseline,Llama8b,0.146429
chain_of_thought,`OpenAI`,0.92
code_use,GPT4,0.87


In [None]:
# Average the per-model means within each prompt type. Every model counts
# equally here, regardless of how many result files it contributed.
compare_df.groupby(level="prompt_type").mean()

Unnamed: 0_level_0,accuracy
prompt_type,Unnamed: 1_level_1
base,0.683333
baseline,0.243571
chain_of_thought,0.92
code_use,0.82
colbycol,0.335
coord,0.613333
csv,0.556667
grid,0.55
indexing,0.422257
internalrep-baseline,0.41
