In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

pd.__version__

# Define helper functions

In [None]:
def nano_to_ms(x):
    """Convert a duration from nanoseconds to milliseconds."""
    micros = x / 1000
    return micros / 1000

def ms_to_nano(x):
    """Convert a duration from milliseconds to nanoseconds."""
    micros = x * 1000
    return micros * 1000

def nano_to_micro(x):
    """Convert a duration from nanoseconds to microseconds."""
    thousand = 1000
    return x / thousand


def label_string(s: str):
    """Abbreviate ``s`` to its first and last five characters joined by "...".

    Strings of 13 characters or fewer are returned unchanged: the abbreviated
    form is always 13 characters long, so shortening them would gain nothing
    and, for very short inputs, would repeat characters ("abc" -> "abc...abc").
    """
    keep = 5
    ellipsis = "..."
    if len(s) <= 2 * keep + len(ellipsis):
        return s
    return f"{s[:keep]}{ellipsis}{s[-keep:]}"

def get_suffix(s: str):
    """Return the part of ``s`` after its last underscore (all of ``s`` if there is none)."""
    *_, suffix = s.rsplit("_", 1)
    return suffix

In [None]:
from scipy.signal import find_peaks

def boxplot_group(df, name):
    """Draw one horizontal box plot of "time" per distinct value of "name".

    Parameters
    ----------
    df : DataFrame with at least the columns "name" and "time".
    name : str
        Title shown above the whole figure. It used to be accepted but
        ignored, so the rendered figures had no caption.
    """
    g = sns.FacetGrid(df, col="name", col_wrap=6, sharex=False, sharey=False)
    g.map(sns.boxplot, "time")
    g.set_titles("{col_name}")
    g.set_xlabels("Time in µs")
    g.set_ylabels("Name")
    # The FacetGrid figure is the current pyplot figure here; y > 1 places the
    # caption above the facet titles instead of on top of them.
    plt.suptitle(name, y=1.02)
    plt.show()

def boxplot(df, name):
    """Draw a single chart with one box of "time" per "name" value, titled ``name``."""
    # Note: sns.set changes the global seaborn/matplotlib style for later plots too.
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x="name", y="time")

    # Caption and axis descriptions.
    plt.title(name)
    plt.xlabel("Complexity")
    plt.ylabel("Time in µs")

    plt.show()

def kdeplot(df, bw_adjust: float):
    """Draw one kernel density estimate of "time" per "name" value.

    Each facet also gets two dashed vertical lines, red for the mean and
    orange for the median of that facet's measurements, plus its own legend.

    Parameters
    ----------
    df : DataFrame with at least the columns "name" and "time".
    bw_adjust : float
        Bandwidth scaling factor forwarded to ``sns.kdeplot``.
    """
    g = sns.FacetGrid(df, col="name", col_wrap=6, sharex=False, sharey=False)
    g.map(sns.kdeplot, "time", color="b", fill=True, bw_adjust=bw_adjust)
    g.set_titles("{col_name}")
    g.set_xlabels("Time in µs")
    g.set_ylabels("Density")

    for ax in g.axes.flat:
        # The facet title is exactly the "name" value (see set_titles above),
        # so it doubles as the key for selecting this facet's rows.
        facet_data = df[df["name"] == ax.get_title()].time

        ax.axvline(facet_data.mean(), color="red", linestyle="--", label="Mittelwert")
        ax.axvline(facet_data.median(), color="orange", linestyle="--", label="Median")

        # A legend per facet: a single plt.legend() after the loop would only
        # annotate the last axes.
        ax.legend()

    # NOTE: an experimental peak detection on the KDE curve (scipy's find_peaks)
    # used to live here inside a no-op string literal; it was never executed.
    plt.show()


def lineplot_group(df, col_wrap, height):
    """Plot "time" against "index" as a line, one facet per "name" value.

    ``col_wrap`` is the number of facets per row and ``height`` the height of
    each facet; both are passed straight to ``sns.FacetGrid``.
    """
    grid = sns.FacetGrid(
        df,
        col="name",
        col_wrap=col_wrap,
        height=height,
        sharex=False,
        sharey=False,
    )
    grid.map(sns.lineplot, "index", "time")

    grid.set_axis_labels("Index of Time Entry", "Time in µs")
    grid.set_titles("Line Plot for {col_name}")
    grid.tight_layout()

    plt.show()

# Load CSV logging data

In [None]:
# Read every selected experiment log. File names are resolved relative to
# logs/csv/; the iOS logs are reached from there via "../../../ios/logs/csv/".
# Uncomment the entries that should be part of the analysis.
experiments = [
    pd.read_csv("logs/csv/" + name)
    for name in [
        #"2023-08-30_measuring_with_baseline_2_rename.csv", # tests from clarke

        #"2023-09-06_baseline.csv", # baseline with measureRepeated
        #"2023-09-06_baselineLoop.csv", # baseline with loop
        #"2023-09-06_baselineLoopMock.csv", # baseline with loop and measuring in System.nanotime()
        #"2023-09-06_methodChannelFullLoop.csv", # method channel benchmark from android side with loop (10000)
        #"2023-09-06_methodChannelFullLoop100.csv", # method channel benchmark from android side with loop (100000)
        #"2023-09-11_microbenchmarkOutput.csv", # baseline tests, but the measurements are taken from microbenchmark
        #"../../../ios/logs/csv/2023-09-18_MethodChannelFull.csv" # XCTest MethodChannelFull

        #"../../../ios/logs/csv/2023-09-18_MethodChannel_cleared.csv" # XCTest MethodChannelFull & Send without init and baseline

        #"2023-09-29_platformChannelFullWithGc.csv"

        #"2023-09-06_methodChannelFull.csv", # method channel benchmark from android side with measureRepeated
        #"2023-10-18_methodChannelFullHeavyLoad.csv", # method channel benchmark with String of 10000 As from android side with measureRepeated
        #"2023-10-18_methodChannelFullHeavyLoad2.csv", # method channel benchmark with String of 10000 As from android side with measureRepeated
        #"2023-10-18_methodChannelHeavyLoad.csv", # method channel benchmark with String and Hashmap from android side with measureRepeated, on emulator

        #"2023-10-27_heavyLoad.csv", # first implementation of actual use cases with different string lengths, with measure Block
        #"2023-10-27_heavyLoadLoop.csv", # first implementation of actual use cases with different string lengths, with loop (the improvements from measure are clearly visible here)
        #"2023-10-27_heavyLoadLoopx10.csv", # same as HeavyLoadLoop, but with factor 10

        #"../../../ios/logs/csv/2023-10-27_heavyLoad.csv", # XCTest heavyLoad
        "../../../ios/logs/csv/2023-10-29_heavyLoadDict.csv", # XCTest heavyLoad with dictionary (hashmap)
        #"../../../ios/logs/csv/2023-10-30_heavyLoadDictFromMeasure.csv", # XCTest heavyLoad with dictionary (hashmap),only T3_2700, but the measurements are from the measure block
        #"../../../ios/logs/csv/2023-10-30_heavyLoadDict_onlyMeasure.csv", # XCTest heavyLoad with dictionary (hashmap),only T3_2700, but the measurements are from the measure block and without the FFI measurement
        #"2023-10-12-manual-displayTimeMs.csv"
    ]
]

df = pd.concat(experiments, ignore_index=True)

# The logs are treated as nanosecond readings (see nano_to_micro); every plot
# below labels its time axis in µs.
df["time"] = df["time"].map(nano_to_micro)
# Keep only the part of the measurement name after its last underscore.
df["name"] = df["name"].map(get_suffix)

# sort_values returns a new frame; the result was previously thrown away, so
# the frame was never actually sorted. A stable sort keeps the recorded order
# of the measurements inside each name group.
df = df.sort_values("name", kind="stable")
df

In [None]:
window_size = 10  # number of consecutive measurements averaged into one value

# Average every `window_size` consecutive rows within each name group.
# numeric_only=True keeps the string column "name" out of the mean, which
# would otherwise raise a TypeError on pandas >= 2.0.
result = df.groupby('name').apply(
    lambda x: x.groupby(np.arange(len(x)) // window_size).mean(numeric_only=True)
)
# Drop the innermost index level (the chunk counter) so that only the group
# name remains as index.
result.index = result.index.droplevel(-1)
# To continue the analysis with the averaged values instead of the raw ones:
#result.reset_index( inplace=True)
#df = result

# Output basic information (times were already converted from ns to µs while loading)

In [None]:
# One box plot per measurement name, followed by summary statistics
# (count, mean, std, quartiles, ...) for every name group.
boxplot_group(
    df.sort_values(by="name"),
    "Box Plots of measurements by name",
)
df.groupby(by=["name"]).describe()


# Apply tests for normality

In [None]:
#https://stackoverflow.com/a/51928888
from scipy.stats import anderson
from scipy.stats import normaltest
from scipy.stats import shapiro

# Shapiro-Wilk test per measurement series. The group name is printed next to
# each result; without it the output cannot be attributed to a series.
# anderson() and normaltest() (imported above) can be used in the same way.
for name, times in df.groupby("name").time:
    print(name, shapiro(times.values))

# Remove outliers

In [None]:
# Outlier filters; each one receives the rows of a single measurement series
# and returns the rows to keep.
# based on https://stackoverflow.com/questions/46245035/pandas-dataframe-remove-outliers
# and https://stackoverflow.com/q/59806689

# Tail quantile used as lower/upper reference point by the quantile-based
# filters (0.01 -> 1st and 99th percentile). This constant used to be called
# `range`, which shadowed the builtin range() for the rest of the notebook.
OUTLIER_TAIL_QUANTILE = 0.01


def remove_outliers_2z(group):
    """Keep the rows whose "time" is less than two standard deviations from the group mean."""
    mean = group["time"].mean()
    std_dev = group["time"].std()
    return group[(group["time"] - mean).abs() < (2 * std_dev)]


def remove_outliers_iqr(group, tail=OUTLIER_TAIL_QUANTILE):
    """Keep the rows whose "time" lies within a widened quantile band.

    The band reaches from the ``tail`` quantile to the ``1 - tail`` quantile
    and is extended on both sides by three times its own width. With the
    default tail these are the 1st/99th percentiles, not the quartiles the
    function name suggests.
    """
    q1 = group["time"].quantile(tail)
    q3 = group["time"].quantile(1 - tail)
    iqr = q3 - q1
    lower_bound = q1 - 3 * iqr
    upper_bound = q3 + 3 * iqr
    return group[(group["time"] >= lower_bound) & (group["time"] <= upper_bound)]


def remove_outliers_iqr_rolling(group, tail=OUTLIER_TAIL_QUANTILE, window=200):
    """Like ``remove_outliers_iqr``, but with bounds from a trailing rolling window.

    ``window`` is the number of rows (the current one included) used to
    compute the quantiles for each row.
    """
    # min_periods=1: without it the first window-1 rows have NaN bounds, every
    # comparison against NaN is False, and those rows were silently dropped
    # as if they were outliers.
    rolling = group["time"].rolling(window=window, min_periods=1)
    q1 = rolling.quantile(tail)
    q3 = rolling.quantile(1 - tail)
    iqr = q3 - q1
    lower_bound = q1 - 3 * iqr
    upper_bound = q3 + 3 * iqr
    return group[(group["time"] >= lower_bound) & (group["time"] <= upper_bound)]


# Filter every name group separately with the quantile-based outlier filter
# and renumber the surviving rows from 0.
cleaned_df = (
    df.groupby("name")
    .apply(remove_outliers_iqr)
    .reset_index(drop=True)
)

# Show the cleaned data: one facet per name, then all names on one chart,
# then the summary statistics per name.
boxplot_group(cleaned_df, "Box Plots of cleaned measurements by name")
boxplot(cleaned_df, "Box Plots of cleaned measurements by complexity")
cleaned_df.groupby(by="name").describe()

# Apply tests for normality again

In [None]:
# Shapiro-Wilk test per cleaned measurement series. The group name is printed
# next to each result so it can be attributed to its series.
for name, measurement in cleaned_df.groupby("name").time:
    print(name, shapiro(measurement.values))

# Visual comparison of raw and cleaned distributions

In [None]:
# Density plots of the raw data first, then of the outlier-filtered data.
kdeplot(df.sort_values(by="name"), bw_adjust=1)
kdeplot(cleaned_df.sort_values(by="name"), bw_adjust=1)


def displot(df):
    """Draw one histogram of "time" with a KDE overlay per "name" value.

    ``df`` needs at least the columns "name" and "time".
    """
    g = sns.FacetGrid(df, col="name", col_wrap=6, sharex=False, sharey=False)
    g.map(sns.histplot, "time", kde=True)
    g.set_titles("{col_name}")
    # Same unit label as the other plots in this notebook.
    g.set_xlabels("Time in µs")
    # histplot's default statistic is the number of observations per bin, so
    # the axis shows counts; it was previously mislabelled "Density".
    g.set_ylabels("Count")
    plt.show()

#displot(cleaned_df.sort_values("name")) 

## Timeline of the cleaned data

In [None]:
# reset_index() exposes the row number as the "index" column that the line
# plots use for their x axis.
lineplot_group(cleaned_df.reset_index(), col_wrap=2, height=10)
