In [19]:
import pandas as pd
import glob
import os
import seaborn as sns
sns.set()

In [20]:
# When True, aggregate_data prints one line per benchmark group it processes.
debug = False

# Root directory that is scanned for run folders matching "run[0-9]".
folder = os.path.join(
    "..", "..", "results_all", "micro_history_full_influxdb", "commit15"
)

# Destination of the combined CSV written at the end of the notebook.
outputFile = "../../results_all/micro_history_full_influxdb.csv"

In [21]:
def load_microbench_csv(path: str) -> pd.DataFrame:
    """Load one microbenchmark result CSV and normalise its columns.

    The file is read as a header-less, ``;``-separated table whose columns
    are, in order: ``suiteRun, baseline, path, name, version, invocations,
    runtime``.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The rows ordered by the name as read from the file, restricted to the
        columns ``name, suiteRun, path, version, runtime, invocations``
        (``baseline`` is dropped), where

        * ``suiteRun`` is the first number of the first ``<n>-<n>-<n>``
          pattern found in the raw field, converted to a number (NaN when
          the pattern is absent),
        * ``name`` has leading/trailing double quotes removed (some
          benchmark names arrive with stray quotes),
        * ``version`` is converted to a number.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when ``version`` holds a value
        that cannot be parsed as a number.
    """
    raw = pd.read_csv(
        path,
        header=None,
        names=[
            "suiteRun",
            "baseline",
            "path",
            "name",
            "version",
            "invocations",
            "runtime",
        ],
        sep=";",
    )

    # Select/reorder the columns and sort without mutating a selection in place.
    df = raw[['name', 'suiteRun', 'path', 'version', 'runtime', 'invocations']].sort_values(by=["name"])

    # Use \d+ so that multi-digit ids are captured whole; a single \d would
    # turn "12-0-1" into 2 and miss "1-23-4" entirely.
    suite_run = df['suiteRun'].str.extract(r"(?P<suiteRun>\d+)-\d+-\d+", expand=False)

    # Vectorised conversions instead of calling pd.to_numeric once per cell.
    df = df.assign(
        suiteRun=pd.to_numeric(suite_run),
        name=df['name'].str.strip('\"'),
        version=pd.to_numeric(df['version']),
    )

    return df


def aggregate_data(df: pd.DataFrame) -> list:
    """Collect the complete benchmark groups of one result frame.

    Rows are grouped by ``(suiteRun, path, name)``. A group is kept only when
    it holds exactly five rows for version 1 and exactly five rows for
    version 2 (five iterations each); every incomplete group is reported on
    stdout and counted per version and suite run.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``suiteRun``, ``path``, ``name``,
        ``version`` and ``instanceRun``. If a ``number`` column is present,
        its first value is shown as the commit number in the diagnostics.

    Returns
    -------
    list
        One DataFrame per complete group, rows unmodified. Empty when no
        group is complete.

    Notes
    -----
    Reads the notebook-level flag ``debug`` to decide whether to print one
    line per processed group.
    """
    expected_iterations = 5

    # First value instead of a random sample: deterministic, and an empty
    # frame yields an empty array rather than raising.
    instanceRun = df['instanceRun'].iloc[:1].values

    # Take the commit number from the frame itself instead of depending on a
    # global that only exists while the notebook's loading loop is running.
    commitNumber = df['number'].iloc[0] if 'number' in df.columns and len(df) else None

    # One counter row per version, one slot per suite run. At least three
    # slots are kept, but the row grows when a larger suiteRun occurs so the
    # increment below cannot run out of range.
    max_run = pd.to_numeric(df['suiteRun'], errors='coerce').max()
    slots = 3 if pd.isna(max_run) else max(3, int(max_run) + 1)
    errorCounter = [[0] * slots for _ in range(2)]

    aggr = []
    groupBy = df.groupby(["suiteRun", "path", "name"])
    for names, group in groupBy:
        if debug:
            print(f" Aggregating values for benchmark benchmark{names[2]} with path{names[1]} in run {names[0]}")

        complete = True
        for row, version in enumerate((1, 2)):
            found = int((group['version'] == version).sum())
            if found != expected_iterations:
                print(f"  There are {found} elements for version {version}, "
                      f"experimentRun: {instanceRun}, suiteRun: {names[0]}, commitNumber: {commitNumber}, "
                      f"path: {names[1]}, name: {names[2]}")
                errorCounter[row][int(names[0])] += 1
                complete = False

        # Only groups with all iterations for both versions are kept.
        if complete:
            aggr.append(group)

    print(f"  Found {len(groupBy)} benchmarks")
    print(f"  Errorcounter ([version1], [version2]): {errorCounter}")

    return aggr

In [22]:
# Collects one DataFrame per complete benchmark group over all scanned folders.
aggr = []

# Layout walked here: <folder>/run<digit>/<commit number>/<result files>
all_runs = glob.glob(os.path.join(folder, "run[0-9]"))
for fullRunFolder in all_runs:
    all_numbers = glob.glob(os.path.join(fullRunFolder, "*"))
    for fullNumberFolder in all_numbers:
        all_files = glob.glob(os.path.join(fullNumberFolder, "*"))
        size = len(all_files)

        # Folders holding fewer than six entries are reported and skipped.
        if size < 6:
            print(f"only {size} elements in folder {fullNumberFolder}, skip.")
            continue

        # The last path component is the commit number; the final character
        # of the component before it (the run folder) is the instance run.
        tmp = fullNumberFolder.split(os.sep)
        instanceRun, commitNumber = tmp[-2][-1], tmp[-1]
        print(f"Instance Run {instanceRun} Number {commitNumber} ...")

        # Load this folder's microbenchmark results and tag every row with
        # the run and commit number it belongs to.
        microbenchs = load_microbench_csv(os.path.join(fullNumberFolder, "microbenchResults.csv"))
        microbenchs = microbenchs.assign(instanceRun=instanceRun, number=commitNumber)

        aggr += aggregate_data(microbenchs)

# Merge all kept groups into a single frame.
aggr = pd.concat(aggr)


only 4 elements in folder ..\..\results_all\micro_history_full_influxdb\commit15\run1\50, skip.
Instance Run 1 Number 55 ...
  There are 0 elements for version 2, experimentRun: ['1'], suiteRun: 0, commitNumber: 55, path: /tsdb/guard_test.go/BenchmarkGuard, name: BenchmarkGuard/Measurement_Filtered/1-2
  There are 0 elements for version 2, experimentRun: ['1'], suiteRun: 0, commitNumber: 55, path: /tsdb/guard_test.go/BenchmarkGuard, name: BenchmarkGuard/Measurement_Filtered/100-2
  There are 0 elements for version 2, experimentRun: ['1'], suiteRun: 0, commitNumber: 55, path: /tsdb/guard_test.go/BenchmarkGuard, name: BenchmarkGuard/Measurement_Filtered/10000-2
  There are 0 elements for version 2, experimentRun: ['1'], suiteRun: 0, commitNumber: 55, path: /tsdb/guard_test.go/BenchmarkGuard, name: BenchmarkGuard/Tag_Filtered/1-2
  There are 0 elements for version 2, experimentRun: ['1'], suiteRun: 0, commitNumber: 55, path: /tsdb/guard_test.go/BenchmarkGuard, name: BenchmarkGuard/Tag_Fil

In [23]:
# Detail table: all kept measurements ordered by benchmark name, commit
# number, instance run and suite run. sort_values returns a new frame, so
# `aggr` is not reordered behind the reader's back through an alias.
df_detail = aggr.sort_values(by=["name", "number", "instanceRun", "suiteRun"])
df_detail.describe()

Unnamed: 0,suiteRun,version,runtime,invocations
count,74970.0,74970.0,74970.0,74970.0
mean,1.0012,1.5,106988000.0,12162470.0
std,0.814784,0.500003,710178400.0,34693580.0
min,0.0,1.0,3.742,1.0
25%,0.0,1.0,256.95,18138.25
50%,1.0,1.5,1569.0,775147.0
75%,2.0,2.0,66146.0,4678601.0
max,2.0,2.0,17800380000.0,320464700.0


In [24]:
# Persist the detail table as a ';'-separated CSV at the configured location.
df_detail.to_csv(path_or_buf=outputFile, sep=";")
