In [1]:
import numpy as np
import pandas as pd
from numpy.random import default_rng
rng = default_rng()

def bootstrap(perfRuntimes1: pd.DataFrame, perfRuntimes2: pd.DataFrame):
  numberOfIterations = 5
  instanceRunsNumber = 3
  instanceRuns = range(1, instanceRunsNumber)
  suiteRunsNumber = 3
  suiteRuns = range(1,suiteRunsNumber)
  numberOfSamples = 10000
  allRuntimes1 = np.ndarray((instanceRunsNumber, suiteRunsNumber, numberOfIterations))
  allRuntimes2 = np.ndarray((instanceRunsNumber, suiteRunsNumber, numberOfIterations))

  for instanceRun in instanceRuns:
    for suiteRun in suiteRuns:
        prefix = f"{instanceRun}-{suiteRun}-"
        allRuntimes1[instanceRun][suiteRun] = perfRuntimes1.loc[(perfRuntimes1['R-S-I'].str.startswith(prefix)),'sec/op'].to_numpy()
        allRuntimes2[instanceRun][suiteRun] = perfRuntimes2.loc[(perfRuntimes2['R-S-I'].str.startswith(prefix)),'sec/op'].to_numpy()

  #Generate random arrays
  currentInstanceRun = rng.choice(instanceRuns, size=(instanceRunsNumber, numberOfSamples))
  currentSuiteRun = rng.choice(suiteRuns, size=(suiteRunsNumber, instanceRunsNumber, numberOfSamples))
  currentRuntimes1 = rng.integers(numberOfIterations, size=(numberOfIterations, suiteRunsNumber, instanceRunsNumber, numberOfSamples))
  currentRuntimes2 = rng.integers(numberOfIterations, size=(numberOfIterations, suiteRunsNumber, instanceRunsNumber, numberOfSamples))

  #Bulk selection
  tmp1 = allRuntimes1[currentInstanceRun, currentSuiteRun, currentRuntimes1]
  tmp1 = np.stack(tmp1, axis=3).reshape((numberOfSamples, suiteRunsNumber * instanceRunsNumber * numberOfIterations))
  tmp2 = allRuntimes2[currentInstanceRun, currentSuiteRun, currentRuntimes2]
  tmp2 = np.stack(tmp2, axis=3).reshape((numberOfSamples, suiteRunsNumber * instanceRunsNumber * numberOfIterations))

  # Get median for both lists
  med1 = np.median(tmp1, axis=1)
  med2 = np.median(tmp2, axis=1)
  R = med2/med1
  R.sort()

  CIsmall = 1 # 99% confidence interval
  small = int((numberOfSamples * CIsmall) / 100 / 2)
  if small == 0: small = 1
  minSmall = R[small-1]
  minSmall = (minSmall - 1) * 100
  maxSmall = R[numberOfSamples-small-1]
  maxSmall = (maxSmall - 1) * 100
  instability = maxSmall - minSmall
  return minSmall, maxSmall, instability

In [2]:
def compareVersionsForBenchmark(fnName: str):
    foundBenchmark = df[df["package.BenchmarkFunction"] == fnName]
    runtimes1 = foundBenchmark[foundBenchmark["Version"] == 1]
    runtimes2 = foundBenchmark[foundBenchmark["Version"] == 2]
    assert runtimes1.shape[0] == 45
    assert runtimes2.shape[0] == 45
    median1 = runtimes1['sec/op'].median()
    median2 = runtimes2['sec/op'].median()
    change = ((median2/median1) - 1) * 100
    minci, maxci, instability = bootstrap(runtimes1, runtimes2)
    assert maxci > change
    assert change > minci
    print(f"[{fnName}] performance change: {change:.2f}% [{minci:.2f} - {maxci:.2f}] ({instability:.2f}%)")

In [19]:
columnNames = ["R-S-I", "package.BenchmarkFunction", "Version", "Directory", "Iterations", "sec/op", "B/op", "allocs/op"]

#df = pd.read_csv("../results/fbs/mb-main-perf-issue-clean-path-2022-08-23T18:23:34+02:00.csv", names=columnNames)
df = pd.read_csv("../results/fbs/mb-main-perf-issue-request-id-2022-08-23T19:44:15+02:00.csv", names=columnNames)
functionNames = df["package.BenchmarkFunction"].unique()
functionNames.sort()

for fnName in functionNames:
    compareVersionsForBenchmark(fnName)

[database.BenchmarkGet] performance change: -1.49% [-2.73 - 2.16] (4.89%)
[database.BenchmarkGetGenerics] performance change: -0.55% [-4.03 - 3.58] (7.62%)
[database.BenchmarkPut] performance change: 0.15% [-3.73 - 2.76] (6.49%)
[database.BenchmarkRawGet] performance change: -0.11% [-4.00 - 1.13] (5.13%)
[database.BenchmarkRawValues] performance change: 0.07% [-0.40 - 3.99] (4.39%)
[database.BenchmarkValues] performance change: -0.03% [-1.16 - 1.26] (2.42%)
[database.BenchmarkValuesGenerics] performance change: 0.28% [-1.92 - 1.64] (3.56%)
[service.BenchmarkHandlerCreateBooking] performance change: 0.84% [-1.61 - 2.85] (4.46%)
[service.BenchmarkHandlerGetBookings] performance change: -0.21% [-1.65 - 1.43] (3.08%)
[service.BenchmarkHandlerGetDestinations] performance change: -0.10% [-2.28 - 1.30] (3.58%)
[service.BenchmarkHandlerGetFlight] performance change: 0.88% [-2.28 - 8.15] (10.43%)
[service.BenchmarkHandlerGetFlightSeats] performance change: -0.76% [-2.30 - 2.58] (4.88%)
[service