In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import json
import os
from math import log10

import pandas as pd

In [3]:
# Root folder containing all experiment runs (relative to this notebook).
runs_base = "../runs/"
# Folder of the hyperparameter-search runs; not referenced again in the cells visible here.
hyperpar_dir = os.path.join(runs_base, "hyperparameter_search")

# Search-space definitions, indexed as search_spaces_str[optimizer][argument]
# and holding a [distribution_name, [a, b]] entry each (see dist_to_string).
with open("../data/search_spaces.json", "r") as spaces_file:
    search_spaces_str = json.loads(spaces_file.read())

In [4]:
# Example search-space entry ([dist_name, [a, b]]) used to try out dist_to_string below.
ss = search_spaces_str["AugHC"]["learning_rate"]


def dist_to_string(search_space):
    """Render a search-space specification as a short, LaTeX-compatible label.

    Parameters
    ----------
    search_space : sequence
        ``(dist_name, (a, b))``: the distribution name followed by its two
        bounds.

    Returns
    -------
    str
        For ``"loguniform"`` the bounds are written as powers of ten in math
        mode, e.g. ``loguniform($10^{-4}, 10^{-3}$)``. Any other distribution
        is rendered verbatim as ``name(a, b)``.

    Notes
    -----
    Exponents are shown with at most two decimals. Bounds that are exact
    powers of ten keep their integer exponent, while a bound such as ``3e-4``
    is shown as ``10^{-3.52}`` instead of being silently rounded to
    ``10^{-4}``.
    """
    dist_name, (a, b) = search_space
    if dist_name != "loguniform":
        return f"{dist_name}({a}, {b})"

    def exponent(bound):
        # Adding 0.0 turns a rounded "-0.0" into "0.0" so that "-0" is never printed;
        # the "g" format drops the trailing ".0" of integral exponents.
        return f"{round(log10(bound), 2) + 0.0:g}"

    # use latex 10^{} notation
    return f"{dist_name}($10^{{{exponent(a)}}}, 10^{{{exponent(b)}}}$)"


# Render the example entry; the bare expression is shown as the cell output.
dist_to_string(ss)

'loguniform($10^{-4}, 10^{-3}$)'

In [5]:
def load_configs(limit_dir):
    """Collect the ``config.json`` of the selected runs below ``limit_dir``.

    Parameters
    ----------
    limit_dir : str
        Directory whose direct children are run directories, each holding a
        ``config.json`` file.

    Returns
    -------
    pandas.DataFrame
        One row per loaded config with the columns ``optimizer_name``,
        ``scoring_function_name`` and ``optimizer_args``, sorted by optimizer
        and scoring function. Rows whose optimizer is ``"VS"`` are dropped.
        If no config is found, an empty frame with these three columns is
        returned instead of failing inside the sort.

    Notes
    -----
    The number of loaded configs is printed as a quick sanity check.
    """
    columns = ["optimizer_name", "scoring_function_name", "optimizer_args"]

    # Only paths ending in "0" are kept (presumably one specific run/seed index
    # per configuration -- TODO confirm). sorted() makes the load order
    # independent of the order in which the file system lists the entries.
    run_dirs = sorted(r for r in glob.glob(os.path.join(limit_dir, "*")) if r.endswith("0"))

    # load config files for the run_dirs
    configs = []
    for run_dir in run_dirs:
        with open(os.path.join(run_dir, "config.json"), "r") as f:
            configs.append(json.load(f))
    print(f"Loaded {len(configs)} configs")

    if not configs:
        # Nothing to sort or select; keep the schema so callers can still add columns.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(configs).sort_values(by=["optimizer_name", "scoring_function_name"])
    df = df[columns]
    # drop VS rows
    return df[df["optimizer_name"] != "VS"]


# Display labels for the two budget types, keyed by the suffix of the run folder name.
limit_name_dict = {"samples": "Samples", "time": "Time"}

# Load the selected configurations of every budget type and tag each row with its label.
dfs = []
for limit_name, limit_label in limit_name_dict.items():
    limit_dir = os.path.join(runs_base, f"best_variance_{limit_name}/")
    limit_df = load_configs(limit_dir)
    limit_df["limit_name"] = limit_label
    dfs.append(limit_df)

# Single frame holding the configurations of all budget types.
df_best = pd.concat(dfs)

Loaded 39 configs
Loaded 39 configs


In [6]:
def float_to_latex_sci_notation(number):
    """
    Format a number as LaTeX math in scientific notation.

    The mantissa keeps two decimals and the exponent is written without sign
    padding or leading zeros, e.g. ``3.55e-04`` becomes
    ``$3.55 \\times 10^{-4}$``. Zero is returned as the plain string ``"0"``.
    """
    if number == 0:
        return "0"

    # ".2e" yields e.g. "3.55e-04"; int() strips the "+" sign and the zero padding.
    mantissa, _, power = format(number, ".2e").partition("e")
    return "$" + mantissa + r" \times 10^{" + str(int(power)) + "}$"

In [7]:
def _format_selected(dist_name, value):
    """Format a selected hyperparameter value for the LaTeX table.

    Log-uniform parameters are written in scientific notation with two
    decimals and wrapped in ``\\num{...}`` (presumably siunitx -- the LaTeX
    preamble is not visible here). Integral values are printed via ``str``,
    everything else with two decimals.
    """
    if dist_name == "loguniform":
        return f"\\num{{{value:.2e}}}"
    if value == int(value):
        return str(value)
    return f"{value:.2f}"


# One table row per (configuration, tuned argument). Arguments that have no
# entry in the search-space definition were not tuned and are skipped.
arg_table_rows = []
for _, row in df_best.iterrows():
    optimizer_name = row["optimizer_name"]
    optimizer_spaces = search_spaces_str[optimizer_name]

    for arg_name, best in row["optimizer_args"].items():
        if arg_name not in optimizer_spaces:
            continue
        search_space = optimizer_spaces[arg_name]
        arg_table_rows.append(
            {
                "Optimizer": optimizer_name,
                "Task": row["scoring_function_name"],
                "Parameter": arg_name,
                "Search Space": dist_to_string(search_space),
                "Selected": _format_selected(search_space[0], best),
                "Limit": row["limit_name"],
            }
        )
df_raw = pd.DataFrame(arg_table_rows)

# Capitalize the distribution names. All replacements are literal
# (regex=False), so the result does not depend on the pandas default.
# "loguniform" must be handled before "uniform", which is a substring of it.
df_raw["Search Space"] = (
    df_raw["Search Space"]
    .str.replace("loguniform", "LogUniform", regex=False)
    .str.replace("uniform", "Uniform", regex=False)
    .str.replace("randint", "RandInt", regex=False)
)

# Typeset parameter names in monospace and escape underscores for LaTeX.
df_raw["Parameter"] = (
    df_raw["Parameter"].apply(lambda x: f"\\texttt{{{x}}}").str.replace("_", "\\_", regex=False)
)
df_raw.head()

Unnamed: 0,Optimizer,Task,Parameter,Search Space,Selected,Limit
0,AugHC,DRD2,\texttt{batch\_size},"RandInt(128, 512)",482,Samples
1,AugHC,DRD2,\texttt{sigma},"Uniform(100.0, 500.0)",432.90,Samples
2,AugHC,DRD2,\texttt{topk},"Uniform(0.15, 0.35)",0.16,Samples
3,AugHC,DRD2,\texttt{learning\_rate},"LogUniform($10^{-4}, 10^{-3}$)",\num{3.55e-04},Samples
4,AugHC,GSK3,\texttt{batch\_size},"RandInt(128, 512)",305,Samples


In [8]:
# Reshape to wide form: one row per (optimizer, parameter, search space) and
# one column per (limit, task) pair, holding the selected value.
row_keys = ["Optimizer", "Parameter", "Search Space"]
column_keys = ["Limit", "Task"]
df_pivot = df_raw.pivot(index=row_keys, columns=column_keys, values="Selected")

In [10]:
# Convert to a LaTeX table. Repeated index labels are merged into \multirow
# cells. The column format and the \cmidrule ranges further down assume three
# index columns followed by two groups of three value columns.
latex_table = df_pivot.to_latex(
    escape=False, multicolumn_format="c", multicolumn=True, multirow=True, index_names=False, column_format="lllrrrrrr"
)

# Re-rule the table: drop the rules pandas emitted (\midrule, \cline) and put a
# single \midrule in front of every optimizer group instead.
lines = latex_table.split("\n")
new_lines = []
in_body = False  # True between the (dropped) header \midrule and \bottomrule

for line in lines:
    if line.startswith(r"\midrule"):
        in_body = True
        continue
    if "cline" in line:
        continue
    if line.startswith(r"\bottomrule"):
        in_body = False

    # A new optimizer group starts on a \multirow line or, for an optimizer
    # with a single parameter, on a body row whose first cell is not empty
    # (continuation rows of a group start with an empty cell).
    first_cell = line.split("&", 1)[0].strip() if "&" in line else ""
    if "multirow" in line or (in_body and first_cell):
        new_lines.append(r"\midrule")

    new_lines.append(line)


# Line 3 of the pandas output is the second header row (task names); its three
# leading cells are empty because index_names=False, so label them here.
column_names_line = lines[3].split(" & ")
column_names_line[:3] = [r"Optimizer", r"Parameter", r"Search Space"]
new_lines[3] = " & ".join(column_names_line)

# Partial rules below the two multi-column group headers.
new_lines.insert(3, r"\cmidrule(lr){4-6} \cmidrule(lr){7-9}")

# File-level directive, presumably silencing ChkTeX warning 36 for the generated file.
new_lines.insert(0, r"%chktex-file 36")
# Combine modified lines
modified_latex_table = "\n".join(new_lines)

modified_latex_table = modified_latex_table.replace("GSK3", r"GSK3$\beta$")

# Create the output folder on demand so a fresh checkout does not fail on the write.
output_path = "tables/hyperparameter_table.tex"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    f.write(modified_latex_table)