In [37]:
from io import StringIO

import numpy as np
import pandas as pd
import yaml

In [16]:
def calculate_sgm(data_points, sh=10):
    """Return the shifted geometric mean (SGM) of ``data_points``.

    Each value is shifted up by ``sh`` and clamped from below at 1, the
    geometric mean of the shifted values is taken through the mean of their
    logarithms, and the shift is subtracted again from the result.
    """
    shifted = np.maximum(1, data_points + sh)
    mean_of_logs = np.mean(np.log(shifted))
    return np.exp(mean_of_logs) - sh


# Quick sanity check of calculate_sgm on two small samples.
(
    calculate_sgm(np.array([5, 6, 7])),
    calculate_sgm(np.array([3, 8, 60])),
)

(5.979139480908028, 15.396349734808222)

In [58]:
# Minimal fixture: two solvers on three benchmarks. sol-a has status "ok" on
# every row; sol-b has a "TO" row on ben-c with a recorded runtime of 0 and a
# Timeout of 60.
rawdata = """Benchmark,Size,Solver,Solver Version,Status,Runtime (s),Timeout
ben-a,1-1,sol-a,1.0,ok,5,60
ben-b,1-1,sol-a,1.0,ok,6,60
ben-c,1-1,sol-a,1.0,ok,7,60
ben-a,1-1,sol-b,1.0,ok,3,60
ben-b,1-1,sol-b,1.0,ok,8,60
ben-c,1-1,sol-b,1.0,TO,0,60
"""
# Parse the inline CSV text into a DataFrame and display it.
test_df = pd.read_csv(StringIO(rawdata))
test_df

Unnamed: 0,Benchmark,Size,Solver,Solver Version,Status,Runtime (s),Timeout
0,ben-a,1-1,sol-a,1.0,ok,5,60
1,ben-b,1-1,sol-a,1.0,ok,6,60
2,ben-c,1-1,sol-a,1.0,ok,7,60
3,ben-a,1-1,sol-b,1.0,ok,3,60
4,ben-b,1-1,sol-b,1.0,ok,8,60
5,ben-c,1-1,sol-b,1.0,TO,0,60


In [59]:
def sgm_using_TO_values(df):
    """Rank solvers by the SGM of their runtimes, charging failures the timeout.

    Rows are grouped by ``(Solver, Solver Version)``. Within a group, every row
    whose ``Status`` is not ``"ok"`` contributes its ``Timeout`` value instead
    of its ``Runtime (s)``; the resulting values are passed to
    ``calculate_sgm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Solver``, ``Solver Version``, ``Status``,
        ``Runtime (s)`` and ``Timeout``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per solver/version with the columns ``Solver``, ``Version``,
        ``SGM Runtime`` and ``Norm. SGM Runtime`` (the SGM divided by the
        smallest SGM), sorted by ``SGM Runtime`` ascending. When ``df`` yields
        no groups, an empty frame with these columns is returned.

    Raises
    ------
    AssertionError
        If the solver/version groups do not all contain the same number of rows.
    """
    columns = ["Solver", "Version", "SGM Runtime", "Norm. SGM Runtime"]
    sgm_runtime_data = []
    num_benchs = None
    for (solver, version), group in df.groupby(["Solver", "Solver Version"]):
        # Keep the measured runtime for "ok" rows and use the Timeout for every
        # other status. Series.where builds a new Series (upcasting the dtype
        # if needed) instead of assigning into a copy of the runtime column.
        runtime_values = group["Runtime (s)"].where(
            group["Status"] == "ok", group["Timeout"]
        )

        sgm_runtime = calculate_sgm(runtime_values)

        # SGMs are only comparable when every solver ran the same number of instances.
        assert num_benchs is None or num_benchs == len(group), (
            f"{solver} {version} has {len(group)} rows, expected {num_benchs}"
        )
        num_benchs = len(group)

        sgm_runtime_data.append(
            {
                "Solver": solver,
                "Version": version,
                "SGM Runtime": sgm_runtime,
            }
        )

    if not sgm_runtime_data:
        # Nothing to rank: return an empty table rather than failing on min().
        return pd.DataFrame(columns=columns)

    result = pd.DataFrame(sgm_runtime_data)
    # Normalize so that the best (smallest) SGM is 1.0.
    result["Norm. SGM Runtime"] = result["SGM Runtime"] / result["SGM Runtime"].min()
    return result.sort_values(by="SGM Runtime")


# On the fixture, sol-b's TO row on ben-c is charged its Timeout value of 60.
sgm_using_TO_values(test_df)

Unnamed: 0,Solver,Version,SGM Runtime,Norm. SGM Runtime
0,sol-a,1.0,5.979139,1.0
1,sol-b,1.0,15.39635,2.575011


In [None]:
def sgm_penalizing_TO_by(df, factor=5):
    """Rank solvers by SGM runtime, charging failures ``factor`` times the timeout.

    Rows are grouped by ``(Solver, Solver Version)``. Within a group, every row
    whose ``Status`` is not ``"ok"`` contributes ``Timeout * factor`` instead
    of its ``Runtime (s)``; the resulting values are passed to
    ``calculate_sgm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Solver``, ``Solver Version``, ``Status``,
        ``Runtime (s)`` and ``Timeout``. It is not modified.
    factor : int or float, default 5
        Multiplier applied to ``Timeout`` for rows that are not ``"ok"``.

    Returns
    -------
    pandas.DataFrame
        One row per solver/version with the columns ``Solver``, ``Version``,
        ``SGM Runtime`` and ``Norm. SGM Runtime`` (the SGM divided by the
        smallest SGM), sorted by ``SGM Runtime`` ascending. When ``df`` yields
        no groups, an empty frame with these columns is returned.

    Raises
    ------
    AssertionError
        If the solver/version groups do not all contain the same number of rows.
    """
    columns = ["Solver", "Version", "SGM Runtime", "Norm. SGM Runtime"]
    sgm_runtime_data = []
    num_benchs = None
    for (solver, version), group in df.groupby(["Solver", "Solver Version"]):
        # Keep the measured runtime for "ok" rows and use the penalized Timeout
        # for every other status. Series.where builds a new Series and upcasts
        # as needed, so a fractional penalty also works on integer runtimes.
        runtime_values = group["Runtime (s)"].where(
            group["Status"] == "ok", group["Timeout"] * factor
        )

        sgm_runtime = calculate_sgm(runtime_values)

        # SGMs are only comparable when every solver ran the same number of instances.
        assert num_benchs is None or num_benchs == len(group), (
            f"{solver} {version} has {len(group)} rows, expected {num_benchs}"
        )
        num_benchs = len(group)

        sgm_runtime_data.append(
            {
                "Solver": solver,
                "Version": version,
                "SGM Runtime": sgm_runtime,
            }
        )

    if not sgm_runtime_data:
        # Nothing to rank: return an empty table rather than failing on min().
        return pd.DataFrame(columns=columns)

    result = pd.DataFrame(sgm_runtime_data)
    # Normalize so that the best (smallest) SGM is 1.0.
    result["Norm. SGM Runtime"] = result["SGM Runtime"] / result["SGM Runtime"].min()
    return result.sort_values(by="SGM Runtime")


sgm_pen_2 = sgm_penalizing_TO_by(test_df, factor=2)
sgm_pen_4 = sgm_penalizing_TO_by(test_df, factor=4)
print(sgm_pen_2)
# Compare with a tolerance: the table values are computed from a pandas Series
# and the references from a NumPy array, so exact float equality is fragile.
# Row 0 is sol-a (no TO, unaffected by the factor); row 1 is sol-b, whose TO
# row is charged 60 * factor.
np.testing.assert_allclose(
    sgm_pen_2.iloc[0]["SGM Runtime"], calculate_sgm(np.array([5, 6, 7]))
)
np.testing.assert_allclose(
    sgm_pen_2.iloc[1]["SGM Runtime"], calculate_sgm(np.array([3, 8, 60 * 2]))
)
np.testing.assert_allclose(
    sgm_pen_4.iloc[0]["SGM Runtime"], calculate_sgm(np.array([5, 6, 7]))
)
np.testing.assert_allclose(
    sgm_pen_4.iloc[1]["SGM Runtime"], calculate_sgm(np.array([3, 8, 60 * 4]))
)

  Solver  Version  SGM Runtime  Norm. SGM Runtime
0  sol-a      1.0     5.979139           1.000000
1  sol-b      1.0    21.216658           3.548447


In [63]:
def sgm_on_intersection(df):
    """Rank solvers by SGM runtime on the instances where no run failed.

    A ``(Benchmark, Size)`` instance is kept only when every row recorded for
    it has ``Status == "ok"``. The remaining rows are grouped by
    ``(Solver, Solver Version)`` and their ``Runtime (s)`` values are passed to
    ``calculate_sgm``. The number of kept instances is printed.

    Note that an instance for which some solver has no row at all is not
    excluded by the filter; such data is caught by the group-size assertion.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Benchmark``, ``Size``, ``Solver``,
        ``Solver Version``, ``Status`` and ``Runtime (s)``. It is not modified
        (no helper column is added to it).

    Returns
    -------
    pandas.DataFrame
        One row per solver/version with the columns ``Solver``, ``Version``,
        ``SGM Runtime`` and ``Norm. SGM Runtime`` (the SGM divided by the
        smallest SGM), sorted by ``SGM Runtime`` ascending. When no instance
        survives the filter, an empty frame with these columns is returned.

    Raises
    ------
    AssertionError
        If the solver/version groups of the filtered data do not all contain
        the same number of rows.
    """
    columns = ["Solver", "Version", "SGM Runtime", "Norm. SGM Runtime"]

    # First, find the benchmark-size instances solved by all solvers. Keys are
    # (benchmark, size) tuples, so distinct pairs can never collide the way
    # "-"-joined strings could.
    solved_by_all = {
        key
        for key, group in df.groupby(["Benchmark", "Size"])
        if (group["Status"] == "ok").all()
    }
    # Filter with a boolean mask instead of writing a helper column into the
    # caller's DataFrame.
    keep = np.fromiter(
        (pair in solved_by_all for pair in zip(df["Benchmark"], df["Size"])),
        dtype=bool,
        count=len(df),
    )
    solved = df[keep]
    print(f"Filtered to {len(solved_by_all)} benchmark instances")

    sgm_runtime_data = []
    num_benchs = None
    for (solver, version), group in solved.groupby(["Solver", "Solver Version"]):
        assert (group["Status"] == "ok").all()

        sgm_runtime = calculate_sgm(group["Runtime (s)"])

        # SGMs are only comparable when every solver ran the same number of instances.
        assert num_benchs is None or num_benchs == len(group), (
            f"{solver} {version} has {len(group)} rows, expected {num_benchs}"
        )
        num_benchs = len(group)

        sgm_runtime_data.append(
            {
                "Solver": solver,
                "Version": version,
                "SGM Runtime": sgm_runtime,
            }
        )

    if not sgm_runtime_data:
        # Empty intersection: return an empty table rather than failing on min().
        return pd.DataFrame(columns=columns)

    result = pd.DataFrame(sgm_runtime_data)
    # Normalize so that the best (smallest) SGM is 1.0.
    result["Norm. SGM Runtime"] = result["SGM Runtime"] / result["SGM Runtime"].min()
    return result.sort_values(by="SGM Runtime")


sgm_on_int = sgm_on_intersection(test_df)
# Only ben-a and ben-b are solved by both solvers. Compare with a tolerance:
# the table values come from a pandas Series and the references from a NumPy
# array, so exact float equality is fragile.
np.testing.assert_allclose(
    sgm_on_int.iloc[1]["SGM Runtime"], calculate_sgm(np.array([5, 6]))
)
np.testing.assert_allclose(
    sgm_on_int.iloc[0]["SGM Runtime"], calculate_sgm(np.array([3, 8]))
)

Filtered to 2 benchmark instances


### Test on synthetic data

In [70]:
# Synthetic check of the TO penalty: increasing the factor must leave the
# unnormalized SGM unchanged for solvers without a TO, and must rank solvers
# that solve more problems higher.
rawdata = """Benchmark,Size,Solver,Solver Version,Status,Runtime (s),Timeout
ben-a,1-1,sol-a,1.0,ok,5,60
ben-b,1-1,sol-a,1.0,ok,6,60
ben-c,1-1,sol-a,1.0,ok,7,60
ben-a,1-1,sol-b,1.0,ok,59,60
ben-b,1-1,sol-b,1.0,ok,8,60
ben-c,1-1,sol-b,1.0,TO,0,60
ben-a,1-1,sol-c,1.0,ok,3,60
ben-b,1-1,sol-c,1.0,ER,0,60
ben-c,1-1,sol-c,1.0,TO,0,60
"""
df = pd.read_csv(StringIO(rawdata))
# Plain timeout first, then the timeout penalized by 5 and by 10.
print(sgm_using_TO_values(df))
print(sgm_penalizing_TO_by(df, factor=5))
print(sgm_penalizing_TO_by(df, factor=10))

  Solver  Version  SGM Runtime  Norm. SGM Runtime
0  sol-a      1.0     5.979139           1.000000
2  sol-c      1.0    29.937402           5.006975
1  sol-b      1.0    34.300288           5.736660
  Solver  Version  SGM Runtime  Norm. SGM Runtime
0  sol-a      1.0     5.979139           1.000000
1  sol-b      1.0    62.749123          10.494675
2  sol-c      1.0    97.701623          16.340415
  Solver  Version  SGM Runtime  Norm. SGM Runtime
0  sol-a      1.0     5.979139           1.000000
1  sol-b      1.0    81.162692          13.574310
2  sol-c      1.0   159.122351          26.612918


### Test on only 3 L benchmarks

In commit `TODO` I modified `results/benchmark_results.csv` to only have 3 benchmarks and 3 solvers. Here is what I think the SGM numbers should be. Unfortunately, it looks like the website is showing different numbers:

In [75]:
# SGM tables for the results file: timeout used as-is, then penalized by a
# factor of 5 and of 10.
data = pd.read_csv("../results/benchmark_results.csv")
print(sgm_using_TO_values(data))
print(sgm_penalizing_TO_by(data, factor=5))
print(sgm_penalizing_TO_by(data, factor=10))

   Solver  Version   SGM Runtime  Norm. SGM Runtime
1  gurobi   12.0.0    641.951176           1.000000
2   highs   1.10.0  12812.341733          19.958436
0     cbc  2.10.12  34752.747556          54.136123
   Solver  Version    SGM Runtime  Norm. SGM Runtime
1  gurobi   12.0.0     641.951176           1.000000
2   highs   1.10.0   21914.272134          34.136976
0     cbc  2.10.12  101621.835308         158.301502
   Solver  Version    SGM Runtime  Norm. SGM Runtime
1  gurobi   12.0.0     641.951176           1.000000
2   highs   1.10.0   27612.596210          43.013546
0     cbc  2.10.12  161317.494833         251.292467


For me, in the case "Compute SGM using TO values", the unnormalized SGM runtimes for gurobi and highs are correct, but the one for cbc is incorrect. Could the bug be in the way the ER result for CBC is handled? Or in the fact that the ER result has `None` in the `Runtime (s)` field?

In the case "Penalize TO by a factor of", with a factor of `5` or `10` the values of gurobi and highs are again correct, but cbc is incorrect.

### Test on all benchmarks from `prelim-results-v1` branch commit `4f6ec48`

In [71]:
# Test all L benchs -- why is cbc better than highs even with high factor?
data = pd.read_csv("../results/benchmark_results.csv")
# Use a context manager so the metadata file handle is closed after parsing.
with open("../results/metadata.yaml") as metadata_file:
    meta = yaml.safe_load(metadata_file)

# Collect "<benchmark>-<size name>" keys for every size entry marked "L".
l_benchs = set()
for n, b in meta["benchmarks"].items():
    for s in b["Sizes"]:
        if s["Size"] == "L":
            l_benchs.add(n + "-" + s["Name"])

# Build the same key on the results and keep only the L instances.
data["bench-size"] = data["Benchmark"] + "-" + data["Size"]
df = data[data["bench-size"].isin(l_benchs)]
df.shape

(60, 18)

In [66]:
# The two counts agree when every L instance from the metadata appears in the
# filtered results.
len(l_benchs), len(df["bench-size"].unique())

(15, 15)

In [None]:
# TODO (Jacek): why do these values not match those on the website? In particular, the SGM value for HiGHS is wrong.
# My suspicion is that it has something to do with the way rows with status != "ok" are being handled.
# They should also be given a runtime value equal to the Timeout value (multiplied by a factor in the case of "Penalizing TO by a factor of").

print(sgm_using_TO_values(df))
print(sgm_penalizing_TO_by(df, 10))
print(sgm_penalizing_TO_by(df, 100))

   Solver  Version   SGM Runtime  Norm. SGM Runtime
1  gurobi   12.0.0   1147.928707           1.000000
2   highs   1.10.0  25793.128234          22.469277
3    scip    9.2.2  31847.480540          27.743431
0     cbc  2.10.12  35747.019886          31.140453
   Solver  Version    SGM Runtime  Norm. SGM Runtime
1  gurobi   12.0.0    1147.928707           1.000000
2   highs   1.10.0  119737.555344         104.307484
3    scip    9.2.2  273166.565091         237.964748
0     cbc  2.10.12  306604.952113         267.094072
   Solver  Version   SGM Runtime  Norm. SGM Runtime
1  gurobi   12.0.0  1.147929e+03           1.000000
2   highs   1.10.0  5.557997e+05         484.176107
3    scip    9.2.2  2.342960e+06        2041.032263
0     cbc  2.10.12  2.629753e+06        2290.867449


In [74]:
# Rows of the filtered results whose Status is neither "ok" nor "TO".
df[~df["Status"].isin({"ok", "TO"})]

Unnamed: 0,Benchmark,Size,Solver,Solver Version,Solver Release Year,Status,Termination Condition,Runtime (s),Memory Usage (MB),Objective Value,Max Integrality Violation,Duality Gap,Reported Runtime (s),Timeout,Hostname,Run ID,Timestamp,bench-size
94,genx-elec_co2,15-168h,scip,9.2.2,2025.0,ER,,,2452.24,,,,,36000.0,benchmark-instance-z2-l03,20250503_040245_benchmark-instance-z2-l03,2025-05-04 02:41:33.829433,genx-elec_co2-15-168h
149,TIMES-GEO-global-netzero,31-20ts,highs,1.10.0,2025.0,warning,infeasible,55.557134,11152.216,,,,3.297731,36000.0,benchmark-instance-z-l10,20250429_202803_benchmark-instance-z-l10,2025-04-30 07:34:08.075578,TIMES-GEO-global-netzero-31-20ts
466,genx-elec_trex_co2,15-168h,scip,9.2.2,2025.0,ER,,,2481.776,,,,,36000.0,benchmark-instance-z-l01,20250429_202634_benchmark-instance-z-l01,2025-04-30 20:59:30.244298,genx-elec_trex_co2-15-168h
575,times-etimeseu-europe-elec+heat-multi_stage,29-64ts,cbc,2.10.12,2024.0,ER,,,1606.736,,,,,36000.0,benchmark-instance-z-l11,20250429_202659_benchmark-instance-z-l11,2025-04-29 20:28:11.990920,times-etimeseu-europe-elec+heat-multi_stage-29...
605,times-ireland-noco2,40-1h,cbc,2.10.12,2024.0,ER,,,2245.248,,,,,36000.0,benchmark-instance-z-l13,20250429_202636_benchmark-instance-z-l13,2025-04-29 20:28:19.450745,times-ireland-noco2-40-1h
613,genx-elec_trex_uc,15-24h,cbc,2.10.12,2024.0,ER,,,1411.336,,,,,36000.0,benchmark-instance-z-l02,20250429_202644_benchmark-instance-z-l02,2025-04-29 20:28:53.376783,genx-elec_trex_uc-15-24h
616,genx-elec_trex_uc,15-24h,scip,9.2.2,2025.0,ER,,,705.84,,,,,36000.0,benchmark-instance-z-l02,20250429_202644_benchmark-instance-z-l02,2025-04-30 06:57:10.258235,genx-elec_trex_uc-15-24h
708,times-etimeseu-europe-elec+heat-co2-multi_stage,29-64ts,cbc,2.10.12,2024.0,ER,,,1648.216,,,,,36000.0,benchmark-instance-z-l12,20250429_203556_benchmark-instance-z-l12,2025-04-29 20:38:34.015624,times-etimeseu-europe-elec+heat-co2-multi_stag...
770,TIMES-GEO-global-base,31-20ts,highs,1.10.0,2025.0,warning,infeasible,67.883251,13335.652,,,,3.879138,36000.0,benchmark-instance-z2-l09,20250503_040206_benchmark-instance-z2-l09,2025-05-03 14:39:02.781736,TIMES-GEO-global-base-31-20ts
984,times-ireland-noco2-counties,26-1h,cbc,2.10.12,2024.0,ER,,,581.368,,,,,36000.0,benchmark-instance-z-l14,20250429_202713_benchmark-instance-z-l14,2025-04-29 20:28:44.939077,times-ireland-noco2-counties-26-1h
