In [1]:
import numpy as np
import pandas as pd
from numpy.random import default_rng
rng = default_rng()

def bootstrap(perfRuntimes1: pd.DataFrame, perfRuntimes2: pd.DataFrame):
  numberOfIterations = 5
  instanceRunsNumber = 3
  instanceRuns = range(1, instanceRunsNumber)
  suiteRunsNumber = 3
  suiteRuns = range(1,suiteRunsNumber)
  numberOfSamples = 10000
  allRuntimes1 = np.ndarray((instanceRunsNumber, suiteRunsNumber, numberOfIterations))
  allRuntimes2 = np.ndarray((instanceRunsNumber, suiteRunsNumber, numberOfIterations))

  for instanceRun in instanceRuns:
    for suiteRun in suiteRuns:
        prefix = f"{instanceRun}-{suiteRun}-"
        allRuntimes1[instanceRun][suiteRun] = perfRuntimes1.loc[(perfRuntimes1['R-S-I'].str.startswith(prefix)),'sec/op'].to_numpy()
        allRuntimes2[instanceRun][suiteRun] = perfRuntimes2.loc[(perfRuntimes2['R-S-I'].str.startswith(prefix)),'sec/op'].to_numpy()

  #Generate random arrays
  currentInstanceRun = rng.choice(instanceRuns, size=(instanceRunsNumber, numberOfSamples))
  currentSuiteRun = rng.choice(suiteRuns, size=(suiteRunsNumber, instanceRunsNumber, numberOfSamples))
  currentRuntimes1 = rng.integers(numberOfIterations, size=(numberOfIterations, suiteRunsNumber, instanceRunsNumber, numberOfSamples))
  currentRuntimes2 = rng.integers(numberOfIterations, size=(numberOfIterations, suiteRunsNumber, instanceRunsNumber, numberOfSamples))

  #Bulk selection
  tmp1 = allRuntimes1[currentInstanceRun, currentSuiteRun, currentRuntimes1]
  tmp1 = np.stack(tmp1, axis=3).reshape((numberOfSamples, suiteRunsNumber * instanceRunsNumber * numberOfIterations))
  tmp2 = allRuntimes2[currentInstanceRun, currentSuiteRun, currentRuntimes2]
  tmp2 = np.stack(tmp2, axis=3).reshape((numberOfSamples, suiteRunsNumber * instanceRunsNumber * numberOfIterations))

  # Get median for both lists
  med1 = np.median(tmp1, axis=1)
  med2 = np.median(tmp2, axis=1)
  R = med2/med1
  R.sort()

  CIsmall = 1 # 99% confidence interval
  small = int((numberOfSamples * CIsmall) / 100 / 2)
  if small == 0: small = 1
  minSmall = R[small-1]
  minSmall = (minSmall - 1) * 100
  maxSmall = R[numberOfSamples-small-1]
  maxSmall = (maxSmall - 1) * 100
  instability = maxSmall - minSmall
  return minSmall, maxSmall, instability

In [2]:
def compareVersionsForBenchmark(fnName: str):
    foundBenchmark = df[df["package.BenchmarkFunction"] == fnName]
    runtimes1 = foundBenchmark[foundBenchmark["Version"] == 1]
    runtimes2 = foundBenchmark[foundBenchmark["Version"] == 2]
    assert runtimes1.shape[0] == 45
    assert runtimes2.shape[0] == 45
    median1 = runtimes1['sec/op'].median()
    median2 = runtimes2['sec/op'].median()
    change = ((median2/median1) - 1) * 100
    minci, maxci, instability = bootstrap(runtimes1, runtimes2)
    assert maxci > change
    assert change > minci
    print(f"[{fnName}] performance change: {change:.2f}% [{minci:.2f} - {maxci:.2f}] ({instability:.2f}%)")

In [3]:
df = pd.read_csv("../results.csv", names=["R-S-I", "package.BenchmarkFunction", "Version", "Directory", "Iterations", "sec/op", "B/op", "allocs/op"])

df

Unnamed: 0,R-S-I,package.BenchmarkFunction,Version,Directory,Iterations,sec/op,B/op,allocs/op
0,1-1-1,service.BenchmarkHandlerCreateBooking,2,pkg/service/service_bookings_test.go,76980,1.511800e-05,5936,124
1,1-1-2,service.BenchmarkHandlerCreateBooking,2,pkg/service/service_bookings_test.go,79185,1.542100e-05,5937,124
2,1-1-3,service.BenchmarkHandlerCreateBooking,2,pkg/service/service_bookings_test.go,81578,1.542900e-05,6245,124
3,1-1-4,service.BenchmarkHandlerCreateBooking,2,pkg/service/service_bookings_test.go,82442,1.516100e-05,6245,124
4,1-1-5,service.BenchmarkHandlerCreateBooking,2,pkg/service/service_bookings_test.go,81948,1.546200e-05,6259,124
...,...,...,...,...,...,...,...,...
895,3-3-1,service.BenchmarkHandlerGetFlight,1,pkg/service/service_flights_test.go,1000000,1.036000e-06,1587,17
896,3-3-2,service.BenchmarkHandlerGetFlight,1,pkg/service/service_flights_test.go,1221454,9.818000e-07,1844,17
897,3-3-3,service.BenchmarkHandlerGetFlight,1,pkg/service/service_flights_test.go,1222510,9.872000e-07,1844,17
898,3-3-4,service.BenchmarkHandlerGetFlight,1,pkg/service/service_flights_test.go,1220502,1.074000e-06,1846,17


In [4]:
for fnName in df["package.BenchmarkFunction"].unique():
    compareVersionsForBenchmark(fnName)

[service.BenchmarkHandlerCreateBooking] performance change: 0.26% [-1.94 - 1.28] (3.22%)
[service.BenchmarkHandlerGetDestinations] performance change: 0.03% [-2.38 - 0.32] (2.70%)
[service.BenchmarkHandlerGetFlight] performance change: 0.30% [-1.17 - 6.17] (7.34%)
[service.BenchmarkHandlerGetFlightsQuery] performance change: 0.14% [-2.37 - 0.58] (2.94%)
[service.BenchmarkFlights] performance change: 1629.59% [1584.41 - 1647.99] (63.59%)
[service.BenchmarkHandlerGetFlights] performance change: -0.89% [-4.59 - 1.37] (5.96%)
[service.BenchmarkFlight] performance change: 56317.82% [54848.88 - 57014.92] (2166.05%)
[service.BenchmarkSeats] performance change: 406.96% [393.32 - 411.45] (18.12%)
[service.BenchmarkHandlerGetFlightSeats] performance change: 0.32% [-0.45 - 2.08] (2.52%)
[service.BenchmarkHandlerGetBookings] performance change: 0.36% [-1.71 - 1.80] (3.51%)


In [5]:
compareVersionsForBenchmark("main.BenchmarkValidateOrderNumber")

[main.BenchmarkValidateOrderNumber] performance change: -90.36% [-90.46 - -90.15] (0.30%)
