In [1]:
import numpy as np
import pandas as pd
from numpy.random import default_rng
rng = default_rng()

def bootstrap(perfRuntimes1: pd.DataFrame, perfRuntimes2: pd.DataFrame):
  numberOfIterations = 5
  instanceRunsNumber = 3
  instanceRuns = range(1, instanceRunsNumber)
  suiteRunsNumber = 3
  suiteRuns = range(1,suiteRunsNumber)
  numberOfSamples = 10000
  allRuntimes1 = np.ndarray((instanceRunsNumber, suiteRunsNumber, numberOfIterations))
  allRuntimes2 = np.ndarray((instanceRunsNumber, suiteRunsNumber, numberOfIterations))

  for instanceRun in instanceRuns:
    for suiteRun in suiteRuns:
        prefix = f"{instanceRun}-{suiteRun}-"
        allRuntimes1[instanceRun][suiteRun] = perfRuntimes1.loc[(perfRuntimes1['R-S-I'].str.startswith(prefix)),'sec/op'].to_numpy()
        allRuntimes2[instanceRun][suiteRun] = perfRuntimes2.loc[(perfRuntimes2['R-S-I'].str.startswith(prefix)),'sec/op'].to_numpy()

  #Generate random arrays
  currentInstanceRun = rng.choice(instanceRuns, size=(instanceRunsNumber, numberOfSamples))
  currentSuiteRun = rng.choice(suiteRuns, size=(suiteRunsNumber, instanceRunsNumber, numberOfSamples))
  currentRuntimes1 = rng.integers(numberOfIterations, size=(numberOfIterations, suiteRunsNumber, instanceRunsNumber, numberOfSamples))
  currentRuntimes2 = rng.integers(numberOfIterations, size=(numberOfIterations, suiteRunsNumber, instanceRunsNumber, numberOfSamples))

  #Bulk selection
  tmp1 = allRuntimes1[currentInstanceRun, currentSuiteRun, currentRuntimes1]
  tmp1 = np.stack(tmp1, axis=3).reshape((numberOfSamples, suiteRunsNumber * instanceRunsNumber * numberOfIterations))
  tmp2 = allRuntimes2[currentInstanceRun, currentSuiteRun, currentRuntimes2]
  tmp2 = np.stack(tmp2, axis=3).reshape((numberOfSamples, suiteRunsNumber * instanceRunsNumber * numberOfIterations))

  # Get median for both lists
  med1 = np.median(tmp1, axis=1)
  med2 = np.median(tmp2, axis=1)
  R = med2/med1
  R.sort()

  CIsmall = 1 # 99% confidence interval
  small = int((numberOfSamples * CIsmall) / 100 / 2)
  if small == 0: small = 1
  minSmall = R[small-1]
  minSmall = (minSmall - 1) * 100
  maxSmall = R[numberOfSamples-small-1]
  maxSmall = (maxSmall - 1) * 100
  instability = maxSmall - minSmall
  return minSmall, maxSmall, instability

In [2]:
def compareVersionsForBenchmark(fnName: str):
    foundBenchmark = df[df["package.BenchmarkFunction"] == fnName]
    runtimes1 = foundBenchmark[foundBenchmark["Version"] == 1]
    runtimes2 = foundBenchmark[foundBenchmark["Version"] == 2]
    assert runtimes1.shape[0] == 45
    assert runtimes2.shape[0] == 45
    median1 = runtimes1['sec/op'].median()
    median2 = runtimes2['sec/op'].median()
    change = ((median2/median1) - 1) * 100
    minci, maxci, instability = bootstrap(runtimes1, runtimes2)
    assert maxci > change
    assert change > minci
    print(f"[{fnName}] performance change: {change:.2f}% [{minci:.2f} - {maxci:.2f}] ({instability:.2f}%)")

In [3]:
df = pd.read_csv("../results.csv", names=["R-S-I", "package.BenchmarkFunction", "Version", "Directory", "Iterations", "sec/op", "B/op", "allocs/op"])

df

Unnamed: 0,R-S-I,package.BenchmarkFunction,Version,Directory,Iterations,sec/op,B/op,allocs/op
0,1-1-1,service.BenchmarkSeats,1,pkg/service/service_test.go,996,0.001143,143873,2546
1,1-1-2,service.BenchmarkSeats,1,pkg/service/service_test.go,1113,0.001178,142185,2548
2,1-1-3,service.BenchmarkSeats,1,pkg/service/service_test.go,1159,0.001039,159852,2550
3,1-1-4,service.BenchmarkSeats,1,pkg/service/service_test.go,1185,0.001015,159470,2551
4,1-1-5,service.BenchmarkSeats,1,pkg/service/service_test.go,1243,0.001057,158374,2553
...,...,...,...,...,...,...,...,...
1525,3-3-1,database.BenchmarkGetGenerics,2,pkg/database/database_test.go,206389,0.000006,1058,21
1526,3-3-2,database.BenchmarkGetGenerics,2,pkg/database/database_test.go,225082,0.000006,1061,21
1527,3-3-3,database.BenchmarkGetGenerics,2,pkg/database/database_test.go,200872,0.000006,1064,21
1528,3-3-4,database.BenchmarkGetGenerics,2,pkg/database/database_test.go,222974,0.000006,1070,21


In [4]:
for fnName in df["package.BenchmarkFunction"].unique():
    compareVersionsForBenchmark(fnName)

[service.BenchmarkSeats] performance change: -1.27% [-5.65 - 4.53] (10.18%)
[service.BenchmarkHandlerGetBookings] performance change: -0.10% [-4.02 - 4.68] (8.70%)
[service.BenchmarkHandlerGetFlightsQuery] performance change: -0.69% [-3.40 - 6.92] (10.32%)
[database.BenchmarkPut] performance change: 0.49% [-6.16 - 5.66] (11.83%)
[database.BenchmarkGetGenerics] performance change: 0.36% [-8.05 - 4.96] (13.02%)
[service.BenchmarkHandlerGetFlight] performance change: -0.73% [-4.87 - 2.49] (7.36%)
[service.BenchmarkHandlerGetFlightSeats] performance change: 0.01% [-3.49 - 3.95] (7.44%)
[service.BenchmarkHandlerCreateBooking] performance change: 1.17% [-5.14 - 4.84] (9.98%)
[service.BenchmarkFlights] performance change: -1.26% [-10.93 - 1.38] (12.31%)
[service.BenchmarkHandlerGetDestinations] performance change: -0.72% [-4.21 - 3.50] (7.71%)
[database.BenchmarkValues] performance change: 0.31% [-4.70 - 7.63] (12.33%)
[service.BenchmarkHandlerGetFlights] performance change: -0.73% [-4.48 - 4

In [5]:
compareVersionsForBenchmark("main.BenchmarkValidateOrderNumber")

[main.BenchmarkValidateOrderNumber] performance change: -90.36% [-90.46 - -90.15] (0.30%)
