In [1]:
import pandas as pd
import glob
import numpy as np
from numpy.random import default_rng
import seaborn as sns
sns.set()
rng = default_rng()



In [2]:
# Analyzes regression in app benchmark and prints relative differences in appRegression.csv

start = 0.05
end = 0.2

numberOfSamples = 10000
CIsmall = 1
CImed = 5
CIlarge = 10

folder = "../../results_all/app_history_influxdb/"
filename = "../../results_aggr/app_history_influxdb.csv"
print(folder)

../../results_all/app_history_influxdb/


In [3]:
def resample(perfRuntimes1: pd.DataFrame,
            perfRuntimes2: pd.DataFrame,
            instanceRuns: np.ndarray,
            samples: int,
            numberOfSamples: int) -> np.ndarray:
    """
        Resamples performances using hierarchical bootstrapping for building confindence intervals

        Builds a tensor of random indices of a form numberOfSamples * instanceRunsNumber * suiteRunsNumber * numberOfIterations.
        Then uses these indices to choose from a performance runtimes tensor with a form instanceRunsNumber * suiteRunsNumber * numberOfIterations.
        Finally, reshapes resulting tensor to a matrix of form numberOfSamples * ( instanceRunsNumber * suiteRunsNumber * numberOfIterations)
        and calculates performance differences between medians.

        Parameters
        ----------
        perfRuntimes1 : performance runtimes of the first version.
        perfRuntimes2 : performance runtimes of the second version.
        instanceRuns : array of instanceRun numbers
        samples : number of drawn alues

        Returns
        -------
        np.ndarray
            array of performance differences with a shape numberOfSample * 1.

        """
    instanceRunsNumber = instanceRuns.shape[0]

    allRuntimes1 = np.ndarray((instanceRunsNumber, samples))
    allRuntimes2 = np.ndarray((instanceRunsNumber, samples))


    for instanceRun in instanceRuns:
        allRuntimes1[instanceRun - 1]= perfRuntimes1.loc[(perfRuntimes1['run'] == instanceRun),
                                                'latency (ms)'].to_numpy()
        allRuntimes2[instanceRun - 1]= perfRuntimes2.loc[(perfRuntimes2['run'] == instanceRun),
                                                'latency (ms)'].to_numpy()

    medians = []
    for i in range(numberOfSamples):
        if (i % 1000 == 0):
            print(f"i is {i}")
        #Generate Random Arrays
        currentInstanceRun = rng.choice(instanceRuns, size=(instanceRunsNumber)) - 1
        currentRuntimes1 = rng.integers(samples, size=(samples, instanceRunsNumber))
        currentRuntimes2 = rng.integers(samples, size=(samples, instanceRunsNumber))
        #Bulk selection
        tmp1 = allRuntimes1[currentInstanceRun, currentRuntimes1]
        tmp1 = tmp1.reshape((instanceRunsNumber * samples))

        tmp2 = allRuntimes1[currentInstanceRun, currentRuntimes2]
        tmp2 = tmp2.reshape((instanceRunsNumber * samples))

        # Get median for both lists
        med1 = np.median(tmp1, axis=0)
        med2 = np.median(tmp2, axis=0)
        medians.append(med2/med1)

    return medians

def bootstrap(perfRuntimes1: pd.DataFrame,
            perfRuntimes2: pd.DataFrame):

    instanceRuns = perfRuntimes1.run.unique()
    numberOfInstanceRuns = len(instanceRuns)

    R = resample(perfRuntimes1=perfRuntimes1,
                    perfRuntimes2=perfRuntimes2,
                    instanceRuns=instanceRuns,
                    samples=int(len(perfRuntimes1) / numberOfInstanceRuns),
                    numberOfSamples=numberOfSamples)

    print(f"    Bootstrapping done ({len(R)} elements in R).")

    # Find conf. intervals
    R.sort()

    small = int((numberOfSamples * CIsmall) / 100 / 2)
    if small == 0:
        small  = 1
    medium = int((numberOfSamples * CImed) / 100 / 2)
    large = int((numberOfSamples * CIlarge) / 100 / 2)

    minSmall = R[small-1]
    minSmall = (minSmall - 1) * 100
    maxSmall = R[numberOfSamples-small-1]
    maxSmall = (maxSmall - 1) * 100

    minMedium = R[medium-1]
    minMedium = (minMedium - 1) * 100
    maxMedium = R[numberOfSamples-medium-1]
    maxMedium = (maxMedium - 1) * 100

    minLarge = R[large-1]
    minLarge = (minLarge - 1) * 100
    maxLarge = R[numberOfSamples-large-1]
    maxLarge = (maxLarge - 1) * 100

    return minSmall, maxSmall


rows = []
all_runs = glob.glob(folder + "*")
for fullRunFolder in all_runs:
    runFolder = fullRunFolder + "/"
    all_numbers = glob.glob(runFolder + "*")
    for fullNumberFolder in all_numbers:

        all_files = glob.glob(fullNumberFolder + "/*")
        size = len(all_files)

        if size != 17:
            print(f"{size} elements in folder {fullNumberFolder}, skip.")
        else:
            tmp = fullNumberFolder.split("\\")

            #Parse run and commit number from path structure
            run = tmp[len(tmp)-2]
            number = tmp[len(tmp)-1]
            print(f"Run {run} Number {number}")

            #Read latency file for both types (old and new)
            insertsOld = pd.read_csv(fullNumberFolder + "/latenciesInsertsOld.csv",
                             header=None,
                             names=["worker","batch","latency (ms)"])
            insertsOld.sort_values(by=["batch"], inplace=True)
            n = len(insertsOld.index)
            removeFirst = int(start * n)
            removeLast = int(end * n)
            insertsOld = insertsOld.iloc[removeFirst:-removeLast , :]

            insertsNew = pd.read_csv(fullNumberFolder + "/latenciesInsertsNew.csv",
                             header=None,
                             names=["worker","batch","latency (ms)"])
            insertsNew.sort_values(by=["batch"], inplace=True)
            n = len(insertsNew.index)
            removeFirst = int(start * n)
            removeLast = int(end * n)
            insertsNew = insertsNew.iloc[removeFirst:-removeLast , :]

            queries1Old = pd.read_csv(fullNumberFolder + "/latenciesQueries1Old.csv",
                             header=None,
                             names=["query","latency (ms)"])
            queries1Old.sort_values(by=["query"], inplace=True)
            n = len(queries1Old.index)
            removeFirst = int(start * n)
            removeLast = int(end * n)
            queries1Old = queries1Old.iloc[removeFirst:-removeLast , :]

            queries2Old = pd.read_csv(fullNumberFolder + "/latenciesQueries2Old.csv",
                             header=None,
                             names=["query","latency (ms)"])
            queries2Old.sort_values(by=["query"], inplace=True)
            n = len(queries2Old.index)
            removeFirst = int(start * n)
            removeLast = int(end * n)
            queries2Old = queries2Old.iloc[removeFirst:-removeLast , :]

            queries1New = pd.read_csv(fullNumberFolder + "/latenciesQueries1New.csv",
                             header=None,
                             names=["query","latency (ms)"])
            queries1New.sort_values(by=["query"], inplace=True)
            n = len(queries1New.index)
            removeFirst = int(start * n)
            removeLast = int(end * n)
            queries1New = queries1New.iloc[removeFirst:-removeLast , :]

            queries2New = pd.read_csv(fullNumberFolder + "/latenciesQueries2New.csv",
                             header=None,
                             names=["query","latency (ms)"])
            queries2New.sort_values(by=["query"], inplace=True)
            n = len(queries2New.index)
            removeFirst = int(start * n)
            removeLast = int(end * n)
            queries2New = queries2New.iloc[removeFirst:-removeLast , :]


            # Run Bootstrapping
            insertsCI = bootstrap(insertsOld, insertsNew)
            queries1CI = bootstrap(queries1Old, queries1New)
            queries2CI = bootstrap(queries2Old, queries2New)




            #Find median latency for both types
            medianInsertsOld = insertsOld["latency (ms)"].median()
            medianInsertsNew = insertsNew["latency (ms)"].median()

            medianQueries1Old = queries1Old["latency (ms)"].median()
            medianQueries1New = queries1New["latency (ms)"].median()

            medianQueries2Old = queries2Old["latency (ms)"].median()
            medianQueries2New = queries2New["latency (ms)"].median()

            #Calculate diff
            diffInserts = ((medianInsertsNew / medianInsertsOld) - 1) * 100
            diffQueries1 = ((medianQueries1New / medianQueries1Old) - 1) * 100
            diffQueries2 = ((medianQueries2New / medianQueries2Old) - 1) * 100

            #Append to rows
            rows.append({
                "run" : run,
                "number" : int(number),
                "diffInserts": diffInserts,
                "diffQueries1": diffQueries1,
                "diffQueries2": diffQueries2
            })

Run run1 Number 1
Run run1 Number 10
Run run1 Number 100
Run run1 Number 101
Run run1 Number 102
Run run1 Number 103
Run run1 Number 104
Run run1 Number 105
Run run1 Number 106
Run run1 Number 107
Run run1 Number 108
Run run1 Number 109
Run run1 Number 11
Run run1 Number 110
Run run1 Number 111
Run run1 Number 12
Run run1 Number 13
Run run1 Number 14
Run run1 Number 15
Run run1 Number 16
Run run1 Number 17
Run run1 Number 18
Run run1 Number 19
Run run1 Number 2
Run run1 Number 20
Run run1 Number 21
Run run1 Number 22
Run run1 Number 23
Run run1 Number 24
Run run1 Number 25
Run run1 Number 26
Run run1 Number 27
Run run1 Number 28
Run run1 Number 29
Run run1 Number 3
Run run1 Number 30
Run run1 Number 31
Run run1 Number 32
Run run1 Number 33
Run run1 Number 34
Run run1 Number 35
Run run1 Number 36
Run run1 Number 37
Run run1 Number 38
Run run1 Number 39
Run run1 Number 4
Run run1 Number 40
Run run1 Number 41
Run run1 Number 42
Run run1 Number 43
Run run1 Number 44
Run run1 Number 45
Run 

In [4]:
# Convert to data frame
df_result = pd.DataFrame(rows)
df_result.sort_values(by=["number"], inplace=True)
df_result.describe()


Unnamed: 0,number,diffInserts,diffQueries1,diffQueries2
count,314.0,314.0,314.0,314.0
mean,54.127389,8.371055,-6.948724,-1.018325
std,31.745841,7.407,7.702315,4.919227
min,1.0,-1.179349,-29.828682,-18.593207
25%,27.0,0.226043,-8.124201,-0.492457
50%,53.0,13.981282,-6.555832,0.349746
75%,79.0,14.980691,-1.29237,1.186581
max,111.0,21.653403,2.834186,3.037608


In [5]:
df_result.to_csv(filename)