## n projections: SortFeature and ScanSplits time per projection

In [1]:
import subprocess
import re
import pandas as pd

# --- Step 1: run the trainer on synthetic data and keep everything it prints ---
# The executable path is relative to the notebook's working directory.
cmd = ["../bazel-bin/examples/train_oblique_forest"]
cmd.extend(
    [
        "--input_mode=synthetic",
        "--max_num_projections=10",
        "--num_trees=1",
        "--num_threads=1",
        "--tree_depth=2",
        "--rows=524288",
        "--cols=1024",
    ]
)
result = subprocess.run(
    cmd,
    check=True,                # a non-zero exit status raises CalledProcessError
    text=True,                 # decode the captured output to str
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # stderr is folded into the same stream as stdout
)
log_output = result.stdout

# --- Step 2: collect every reported SortFeature / ScanSplits duration ---
# The captured group includes the trailing "s", so the values stay strings.
sort_times = re.compile(r"SortFeature took:\s*([\d\.eE+-]+s)").findall(log_output)
scan_times = re.compile(r"ScanSplits took:\s*([\d\.eE+-]+s)").findall(log_output)

# --- Step 3: one table row per projection, numbered from 1 ---
df = pd.DataFrame(
    dict(
        projection_id=range(1, len(sort_times) + 1),
        SortFeature=sort_times,
        ScanSplits=scan_times,
    )
)

# --- Step 4: show the result ---
print(df)

# Optional: persist the table as CSV
# df.to_csv("projection_timings.csv", index=False)


   projection_id SortFeature   ScanSplits
0              1   0.052177s  0.00987493s
1              2  0.0532176s  0.00992014s
2              3  0.0528513s  0.00987339s
3              4  0.0527124s  0.00984357s
4              5  0.0523294s  0.00991106s
5              6  0.0519505s  0.00984887s
6              7  0.0523428s  0.00989952s
7              8  0.0519321s  0.00985933s
8              9  0.0525285s  0.00991317s
9             10  0.0523465s  0.00988332s


## 1 projection: per-function timings for each tree

In [5]:
import subprocess

# Size of the synthetic dataset handed to the trainer.
# Another column count that has been used here: cols = 1024
rows, cols = 524288, 4096

# Trainer invocation: a single projection, 20 trees, one thread, depth 2.
cmd = ["../bazel-bin/examples/train_oblique_forest"]
cmd.extend(
    [
        "--input_mode=synthetic",
        "--max_num_projections=1",
        "--num_trees=20",
        "--num_threads=1",
        "--tree_depth=2",
        f"--rows={rows}",
        f"--cols={cols}",
    ]
)

# stdout and stderr are captured as one combined text stream.
result = subprocess.run(
    cmd,
    check=True,         # a non-zero exit status raises CalledProcessError
    text=True,          # decode the captured output to str
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)

log_output = result.stdout

In [6]:
# Get Training Time, to add to filename
import re

# The reported wall-clock training time is kept as the raw string (including
# the trailing "s") because it is later used as part of the output file name.
m = re.search(r"Training wall-time:\s*([\d\.]+s)", log_output)
if m is None:
    raise LookupError("Training Time string couldn't be found")
training_time = m.group(1)  # e.g. "2.71642s"

print(training_time)

2.70682s


In [7]:
# Root directory (relative to the notebook's working directory) under which
# the per-function timing spreadsheets are written.
file_dir = "../ariel_results/per_function_timing/"

import os, re, pandas as pd

TIMING_RE = re.compile(
    r"^\s*(?:-\s*)*"                 # leading “- ” blocks
    r"(?P<name>[^:]+?)\s+"           # function name
    r"(?:Took|took):\s+"             # “took:” (any case)
    r"(?P<secs>[0-9.eE+-]+)s",
    re.IGNORECASE,
)

def _parse(log: str) -> pd.DataFrame:
    rows, cur_tree = [], -1
    for line in log.splitlines():
        if "Selecting Bootstrapped Samples Took" in line:
            cur_tree += 1
        m = TIMING_RE.match(line)
        if m and cur_tree >= 0:
            rows.append(
                {
                    "tree": cur_tree + 1,
                    "function": m.group("name").strip(),
                    "time_s": float(m.group("secs")),
                }
            )
    return pd.DataFrame(rows)

# ── desired column order & renames ────────────────────────────────
# Columns of the per-tree table, in display order.  A function that never
# shows up in the log still gets its column (filled with 0.0); a function that
# is not listed here is left out of the table.
ORDER = [
    "Selecting Bootstrapped Samples",
    "Initialization of FindBestCondOblique",
    "SampleProjection",
    "ApplyProjection",
    "Bucket Allocation & Initialization=0",
    "Filling & Finalizing the Buckets",
    "SortFeature",
    "ScanSplits",
    "Post-processing after Training all Trees",
    "EvaluateProjection",
    "FillExampleBucketSet (next 3 calls)",
]
# Function names as they may appear in the log -> the names used in ORDER.
RENAMES = {
    "Post-processing after Train": "Post-processing after Training all Trees",
    "FillExampleBucketSet (calls 3 above)": "FillExampleBucketSet (next 3 calls)",
}

def tree_table(log: str) -> pd.DataFrame:
    """Build a wide per-tree timing table from a training log.

    The log is parsed with ``_parse``; the time of every (tree, function) pair
    is summed, and the result is pivoted to one row per tree with one column
    per entry of ``ORDER``.

    Parameters
    ----------
    log : str
        Full text output of the training run.

    Returns
    -------
    pd.DataFrame
        A ``tree`` column followed by the ``ORDER`` columns, holding summed
        seconds (0.0 where a function was not reported for a tree).  If the
        log contains no timing rows, an empty frame with the same columns is
        returned.
    """
    timings = _parse(log)
    if timings.empty:
        # Nothing to group or pivot; hand back an empty table with the usual
        # layout instead of failing on missing columns.
        return pd.DataFrame(columns=pd.Index(["tree", *ORDER], name="function"))

    # Normalise the names *before* aggregating.  Renaming the pivoted columns
    # afterwards would create duplicate column labels whenever a log contains
    # both the old and the new spelling of a function, and reindexing a frame
    # with duplicate labels raises.
    canonical = timings.assign(
        function=timings["function"].map(lambda name: RENAMES.get(name, name))
    )
    long = canonical.groupby(["tree", "function"], as_index=False)["time_s"].sum()

    wide = (
        long.pivot(index="tree", columns="function", values="time_s")
            .fillna(0.0)
            # ensure every requested column exists, then apply the order
            .reindex(columns=ORDER, fill_value=0.0)
            # make "tree" a visible column instead of the index
            .reset_index()
    )

    return wide

In [9]:
# Build the per-tree timing table from the captured training log.
tbl = tree_table(log_output)

# Preview the rows for the first three trees.
tbl.head(3)

function,tree,Selecting Bootstrapped Samples,Initialization of FindBestCondOblique,SampleProjection,ApplyProjection,Bucket Allocation & Initialization=0,Filling & Finalizing the Buckets,SortFeature,ScanSplits,Post-processing after Training all Trees,EvaluateProjection,FillExampleBucketSet (next 3 calls)
0,1,0.057083,0.003213,1.5e-05,0.004156,0.002205,3.6e-08,0.055822,0.010408,0.0,0.069582,0.058764
1,2,0.057229,0.002362,4e-06,0.003567,0.001898,3.4e-08,0.053892,0.009857,0.0,0.066622,0.056452
2,3,0.054531,0.002704,6e-06,0.003788,0.001932,3.5e-08,0.053437,0.010074,0.0,0.066394,0.056023


In [None]:
# Save to file
# One sub-directory per problem size, e.g. ".../524288_x_4096".
dir_path = os.path.join(file_dir, f"{rows}_x_{cols}")

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without a separate existence check that could race with another
# process creating the same directory.
os.makedirs(dir_path, exist_ok=True)

# The workbook is named after the measured training wall-time.
full_path = os.path.join(dir_path, f"{training_time}.xlsx")
print("Saving to", full_path)

tbl.to_excel(full_path, index=False)

Saving to ../ariel_results/per_function_timing/524288_x_4096/2.70682s.xlsx
