# Throughput

In [None]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display
from ipywidgets import Layout, Button, Box
from nested_dict import nested_dict

import json
import os, glob
import pandas as pd
import pandas.io.json as pdjson
import seaborn as sns
import ipywidgets as widgets

sns.set(style="whitegrid")

In [None]:
# IPython.display is the public home of both names; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Let the notebook's `.container` element use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Result files found below artifacts_dir, and the subset picked for comparison.
bench_files = []
selected_files = []

artifacts_dir = "/home/guest/artifacts"
# NOTE: artifacts_dir has no trailing slash, so "/artifacts/" does not occur in
# it and this evaluates to artifacts_dir itself.  get_filename() joins
# host/repo/... directly onto artifacts_path, so it relies on that value.
artifacts_path = artifacts_dir.split("/artifacts/")[0]

# The next cell unpacks every path into exactly six components
#   <host>/<repo>/<commit>/<variant>/<timestamp>/<file>.orun.bench
# so a result file has to live exactly five directories below artifacts_dir.
BENCH_DIR_DEPTH = 5

for root, _dirs, names in os.walk(artifacts_dir):
    # relpath copes with root == artifacts_dir (".") instead of failing on a
    # missing "artifacts/" separator.
    depth = len(os.path.relpath(root, artifacts_dir).split(os.sep))
    if depth != BENCH_DIR_DEPTH:
        continue
    for name in names:
        if name.endswith(".orun.bench"):
            bench_files.append(os.path.join(root, name))

In [None]:
len(bench_files)

In [None]:
# Group every discovered result file by host and repository:
#   benches[host][repo] -> ["<commit> <variant> <timestamp> <file name>", ...]
nd = nested_dict(2, list)
for bench_path in bench_files:
    # Path components below the artifacts directory.
    parts = bench_path.split("/artifacts/")[1].split("/")
    host, repo = parts[0], parts[1]
    # commit, variant, timestamp and file name, separated by single spaces.
    entry = " ".join((parts[2], parts[3], parts[4], parts[5]))
    nd[host][repo].append(entry)
benches = nd.to_dict()

In [None]:
def f(x):
    """Return ``x`` unchanged.

    Used as the callback of the ``comparisons`` widget.
    """
    return x

def disp(benches):
    """Display one row of Host / Repository / Commit dropdowns.

    ``benches`` is indexed as ``benches[host][repo]`` and yields the list of
    entries offered in the Commit dropdown.  Each dropdown is wrapped in
    ``widgets.interactive`` so that changing it runs the matching callback
    below.  The three wrappers are shown side by side in a flex ``Box`` and
    returned as the tuple ``(hostD, repoD, commitD)``.
    """
    def select_repo(host):
        """Offer the repositories of ``host`` in the Repository dropdown."""
        # NOTE(review): benches[host] is the whole repo -> entries mapping, not
        # just its keys; presumably the dropdown then uses repo names as labels
        # and the entry lists as values, which is what select_commit receives.
        # Confirm against the ipywidgets Dropdown documentation.
        repoW.options = benches[host]
    
    def select_commit(repo):
        """Offer ``repo`` (the Repository dropdown's value) as commit options."""
        commitW.options = repo

    def select_variant(commit):
        """Placeholder callback for the Commit dropdown; does nothing."""
        return None

    hostW = widgets.Dropdown(options=benches.keys(), description='Host', disabled=False)
    # Initial selection, used to seed the Repository dropdown below.
    hostS = hostW.value
    hostD = widgets.interactive(select_repo, host=hostW)

    repoW = widgets.Dropdown(options=benches[hostS].keys(), description='Repository', disabled=False)
    # Initial selection, used to seed the Commit dropdown below.
    repoS = repoW.value
    repoD = widgets.interactive(select_commit, repo=repoW)

    commitW = widgets.Dropdown(options=benches[hostS][repoS], description='Commit', disabled=False)
    # Not read again in this function.
    commitS = commitW.value
    commitD = widgets.interactive(select_variant, commit=commitW)

    # Not used below; only box_layout is passed to the Box.
    items_layout = Layout( width='auto' )
    
    box_layout = Layout(display='flex',
                       flex_flow='row wrap',
                       align_items='flex-start',
                       #border='solid',
                       width='100%')
    
    items = [hostD, repoD, commitD]
    box = Box(children=items, layout=box_layout)    
    
    # display(hostD, repoD, commitD)
    display(box)
    return (hostD, repoD, commitD)

def get_filename(h, r, c):
    """Return the path of the result file chosen in one row of dropdowns.

    ``h``, ``r`` and ``c`` are the host, repository and commit widgets of one
    row, as returned by ``disp``; the dropdown of each is its first child.
    The host is taken from the dropdown's value, whereas the repository and
    the commit entry are looked up in ``benches`` by the selected index.
    """
    host = h.children[0].value

    repo_names = list(benches[host])
    repo = repo_names[r.children[0].index]

    entry = list(benches[host][repo])[c.children[0].index]

    # An entry is "<commit> <variant> <timestamp> <file name>"; turning the
    # spaces into slashes yields its path below <host>/<repo>.
    return os.path.join(artifacts_path, host, repo, entry.replace(" ", "/"))

In [None]:
# Integer input asking how many result files to compare.  The callback `f`
# returns its argument, so `comparisons.result` (read by the following cells)
# presumably holds the number typed into the box.
comparisons = interactive(f, x=widgets.IntText(value=0, 
                                               description='Comparisons', 
                                               disabled=False))

display(comparisons)

In [None]:
print(comparisons.result)

In [None]:
# One row per requested comparison:
#   [host widget, repository widget, commit widget, 0]
# The trailing 0 is a placeholder that a later cell replaces with the path of
# the selected result file.
matrix = []
for _ in range(comparisons.result):
    host_widget, repo_widget, commit_widget = disp(benches)
    matrix.append([host_widget, repo_widget, commit_widget, 0])

In [None]:
# Resolve the current dropdown selection of every row into a file path and
# store it in the row's fourth slot.
for row_index in range(comparisons.result):
    row = matrix[row_index]
    row[3] = get_filename(row[0], row[1], row[2])

In [None]:
# Print the chosen paths and queue them for loading.
# NOTE: selected_files is only reset by the discovery cell, so running this
# cell again appends the same paths a second time.
for row_index in range(comparisons.result):
    bench_path = matrix[row_index][3]
    print(bench_path)
    selected_files.append(bench_path)

In [None]:
# pandas >= 1.0 exposes json_normalize at the top level and deprecates the
# pandas.io.json location; fall back to the latter only on older releases.
_json_normalize = getattr(pd, "json_normalize", None) or pdjson.json_normalize

data_frames = []

# The loop deliberately avoids the names `f` and `file`: `f` is the callback
# of the `comparisons` widget and must stay a function.
for bench_path in selected_files:
    # The file is parsed as one JSON document per line; blank lines are
    # skipped.
    with open(bench_path) as bench_file:
        records = [json.loads(line) for line in bench_file if line.strip()]
    frame = _json_normalize(records)
    # The variant label is the file name without its ".orun.bench" suffix.
    frame['variant'] = os.path.basename(bench_path).replace(".orun.bench", "")
    data_frames.append(frame)

df = pd.concat(data_frames, sort=False)
df = df.sort_values(['name'])
# Show every row when a frame of this size is displayed.
pd.set_option('display.max_rows', df.shape[0]+1)
df

### Drop some benchmarks


In [None]:
# Benchmarks that are excluded because no multicore version of them exists.
no_multicore_version = [
    'alt-ergo.fill.why',
    'alt-ergo.yyll.why',
    'frama-c.slevel',
    'js_of_ocaml.frama-c_byte',
]
df = df[~df['name'].isin(no_multicore_version)]
# Keep a second handle on the filtered frame for the export section.
throughput_df = df
df

### Selection example

```
select * from df where variant = '4.06.1+stock' and time_secs > 10
```

In [None]:
df.loc[(df['variant'] == '4.06.1+stock') & (df['time_secs'] > 10)]

In [None]:
df.loc[df['name'] == 'LU_decomposition.1024']

### Projection example

```
select name, variant, time_secs from df
```

In [None]:
df.filter(['name','variant','time_secs'])

## Time

In [None]:
# Bar chart of time_secs for every benchmark, one bar per variant.
g = sns.catplot (x='name', y='time_secs', hue='variant', data = df, kind ='bar', aspect=4)
# Draw the benchmark names vertically.
g.set_xticklabels(rotation=90)

### Normalised

In [None]:
def normalise(df, variant, topic, additionalTopics=None):
    """Express ``topic`` of every other variant relative to ``variant``.

    The rows of ``df`` are sorted by name and variant, reduced to the columns
    ``name``, ``topic``, ``variant`` and ``additionalTopics``, and grouped by
    variant.  Every group other than the baseline ``variant`` gains

    * ``'b' + topic`` -- the baseline variant's value of ``topic``, and
    * ``'n' + topic`` -- the group's own value divided by the baseline value,

    and each column named in ``additionalTopics`` is overwritten with the
    baseline variant's values.  Baseline values are matched to rows by
    position, so every variant must list the same benchmarks as the baseline.

    Returns the concatenation of the augmented groups; the baseline variant's
    own rows are not part of the result and ``df`` itself is not modified.

    Raises ``KeyError`` when ``variant`` does not occur in ``df`` and
    ``ValueError`` when a group's length differs from the baseline's or when
    there is no variant other than the baseline.
    """
    # None rather than [] as the default: a list default is shared by all calls.
    extra_topics = [] if additionalTopics is None else list(additionalTopics)

    ordered = df.sort_values(["name", "variant"])
    columns = ['name', topic, 'variant'] + extra_topics
    grouped = ordered.filter(items=columns).groupby('variant')

    # The baseline group is the same for every iteration; look it up once.
    baseline = grouped.get_group(variant)
    baseline_values = baseline[topic].values

    ndata_frames = []
    for v, group in grouped:
        if v == variant:
            continue
        # Add the columns to a private copy rather than to the group object
        # handed out by groupby.
        data = group.copy()
        data['b' + topic] = baseline_values
        data['n' + topic] = data[topic] / baseline_values
        for t in extra_topics:
            print(variant, t)
            data[t] = baseline[t].values
        ndata_frames.append(data)
    return pd.concat(ndata_frames)

def plot_normalised(df,variant,topic):
    """Bar-plot the normalised column ``topic`` around a baseline of 1.

    The rows are ordered by ``topic``.  Each value has 1 subtracted and is
    drawn with ``bottom=1``, so a bar starts at 1 and ends at the normalised
    value.  ``variant`` is accepted but not used.  ``df`` is left unmodified;
    the seaborn grid is returned.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    shifted = df.sort_values(by=[topic])
    shifted[topic] = shifted[topic] - 1

    g = sns.catplot(x='name', y=topic, hue='variant', data=shifted,
                    kind='bar', aspect=4, bottom=1)
    g.set_xticklabels(rotation=90)
    # Put a legend inside the axes, then remove the one the grid holds in
    # g._legend.
    g.ax.legend(loc=8)
    g._legend.remove()
    g.ax.set_xlabel("Benchmarks")
    return g

In [None]:
# time_secs of every variant relative to '4.06.1+stock', then plotted.
ndf = normalise(df, '4.06.1+stock', 'time_secs')
plot_normalised(ndf, '4.06.1+stock','ntime_secs')

## Top heap words

In [None]:
# Bar chart of gc.top_heap_words for every benchmark, one bar per variant.
g = sns.catplot (x='name', y='gc.top_heap_words', hue='variant', data = df, kind ='bar', aspect=4)
# Draw the benchmark names vertically.
g.set_xticklabels(rotation=90)

### Normalised

In [None]:
# gc.top_heap_words of every variant relative to '4.06.1+stock', then plotted.
ndf = normalise(df, '4.06.1+stock', 'gc.top_heap_words')
plot_normalised(ndf, '4.06.1+stock','ngc.top_heap_words')

## Max RSS (KB)

In [None]:
# Bar chart of maxrss_kB for every benchmark, one bar per variant.
g = sns.catplot (x='name', y='maxrss_kB', hue='variant', data = df, kind ='bar', aspect=4)
# Draw the benchmark names vertically.
g.set_xticklabels(rotation=90)

### Normalised

In [None]:
# maxrss_kB of every variant relative to '4.06.1+stock', then plotted.
ndf = normalise(df,'4.06.1+stock','maxrss_kB')
plot_normalised(ndf,'4.06.1+stock','nmaxrss_kB')

## Major Collections

In [None]:
# Bar chart of gc.major_collections for every benchmark, one bar per variant.
g = sns.catplot (x='name', y='gc.major_collections', hue='variant', data = df, kind ='bar', aspect=4)
# Draw the benchmark names vertically.
g.set_xticklabels(rotation=90)

### Normalised

In [None]:
# gc.major_collections of every variant relative to '4.06.1+stock', plotted
# and then shown as a table.
ndf = normalise(df,'4.06.1+stock','gc.major_collections')
plot_normalised(ndf,'4.06.1+stock','ngc.major_collections')
ndf

## Major words

In [None]:
# Bar chart of gc.major_words for every benchmark, one bar per variant.
g = sns.catplot (x='name', y='gc.major_words', hue='variant', data = df, kind ='bar', aspect=4)
# Draw the benchmark names vertically.
g.set_xticklabels(rotation=90)

### Normalised

In [None]:
# gc.major_words of every variant relative to '4.06.1+stock', then plotted.
ndf = normalise(df,'4.06.1+stock','gc.major_words')
plot_normalised(ndf,'4.06.1+stock','ngc.major_words')

## Minor Collections

In [None]:
# Bar chart of gc.minor_collections for every benchmark, one bar per variant.
g = sns.catplot (x='name', y='gc.minor_collections', hue='variant', data = df, kind ='bar', aspect=4)
# Draw the benchmark names vertically.
g.set_xticklabels(rotation=90)

In [None]:
# gc.minor_collections of every variant relative to '4.06.1+stock', then
# plotted.
ndf = normalise(df,'4.06.1+stock', 'gc.minor_collections')
plot_normalised(ndf,'4.06.1+stock', 'ngc.minor_collections')

# Latency

In [None]:
# pandas >= 1.0 exposes json_normalize at the top level and deprecates the
# pandas.io.json location; fall back to the latter only on older releases.
_json_normalize = getattr(pd, "json_normalize", None) or pdjson.json_normalize

data_frames = []

# Pause-time files are read from the current working directory.  The loop
# deliberately avoids the names `f` and `file`: `f` is the callback of the
# `comparisons` widget and must stay a function.
for bench_path in glob.glob("*.pausetimes_*.bench"):
    # The file is parsed as one JSON document per line; blank lines are
    # skipped.
    with open(bench_path) as bench_file:
        records = [json.loads(line) for line in bench_file if line.strip()]
    ldf = _json_normalize(records)
    # The variant label is the file name without its pausetimes suffix.
    ldf['variant'] = bench_path.replace(".pausetimes_multicore.bench","").replace(".pausetimes_trunk.bench","")
    data_frames.append(ldf)

df2 = pd.concat(data_frames, sort=False)
df2 = df2.sort_values(['name'])

## Drop some benchmarks
excluded_benchmarks = [
    'alt-ergo.fill.why',          # multicore version does not exist
    'alt-ergo.yyll.why',          # multicore version does not exist
    'frama-c.slevel',             # multicore version does not exist
    'js_of_ocaml.frama-c_byte',   # multicore version does not exist
    'cpdf.merge',                 # Not a macro benchmark. Will be removed from subsequent runs.
]
df2 = df2[~df2['name'].isin(excluded_benchmarks)]
# Keep a handle on the filtered frame for the export section; df2 itself is
# rebound by the percentile cells below.
latency_df = df2
df2

Each `distr_latency` value is a list of latencies at the `[10,20,30,40,50,60,70,80,90,95,99,99.9]`th percentiles, in that order.

## Max latency

In [None]:
df2.filter(["name","variant","max_latency"])

In [None]:
def plotLatencyAt(df,at,aspect):
    """Bar-plot the ``<at>_latency`` column of ``df`` on a log scale.

    Only ``name``, ``variant`` and ``<at>_latency`` are used.  The rows are
    ordered by latency and the values divided by 1000.0 before plotting; the
    y axis is labelled in microseconds (the stored values are therefore
    presumably nanoseconds -- confirm against the data producer).
    ``aspect`` is passed on to ``sns.catplot``.  Returns the seaborn grid.
    """
    column = at + "_latency"
    fdf = df.filter(["name", "variant", column]).sort_values(by=[column])
    fdf[column] = fdf[column] / 1000.0

    g = sns.catplot(x='name', y=column, hue='variant', data=fdf,
                    kind='bar', aspect=aspect)
    g.set_xticklabels(rotation=90)
    g.ax.set_ylabel(at + " latency (microseconds)")
    g.ax.set_xlabel("Benchmarks")
    g.ax.set_yscale('log')
    return g

plotLatencyAt(df2,"max",4)

## 99.9th percentile latency

In [None]:
def getLatencyAt(df,percentile,idx):
    """Add a ``<percentile>_latency`` column taken from ``distr_latency``.

    For every row the new column holds element ``idx`` of that row's
    ``distr_latency`` sequence.  The result is a new frame whose rows are
    grouped by ``variant`` (groups in sorted order, original row order within
    a group); ``df`` itself is not modified.

    The column is filled positionally, one whole group at a time, so repeated
    index labels inside a variant are handled correctly.
    """
    column = percentile + "_latency"
    frames = []
    for _, group in df.groupby('variant'):
        # Work on a copy so the caller's frame is never written to.
        group = group.copy()
        group[column] = [list(distr)[idx] for distr in group["distr_latency"]]
        frames.append(group)
    return pd.concat(frames)

# The last element of distr_latency is the 99.9th percentile.
df2 = getLatencyAt(df2,"99.9",-1)
plotLatencyAt(df2,"99.9",4)

## 99th percentile latency

In [None]:
# The second-to-last element of distr_latency is the 99th percentile.
df2 = getLatencyAt(df2,"99",-2)
plotLatencyAt(df2,"99",4)

# Export Graphs

In [None]:
import numpy as np

# Maps the benchmark names found in the result files to the labels used in
# the exported graphs.
nameMap = {
    'bdd.26': 'bdd',
    'binarytrees5.21': 'binarytrees',
    'chameneos_redux_lwt.600000': 'chameneos_redux_lwt',
    'cpdf.blacktext': 'cpdf.blacktext',
    'cpdf.scale': 'cpdf.scale',
    'cpdf.squeeze': 'cpdf.squeeze',
    'durand-kerner-aberth.': 'durand-kerner-aberth',
    'fannkuchredux.12': 'fannkuchredux',
    'fannkuchredux2.12': 'fannkuchredux2',
    'fasta3.25_000_000': 'fasta3',
    'fasta6.25_000_000': 'fasta6',
    'fft.': 'fft',
    'game_of_life.256': 'game_of_life',
    'kb.': 'kb',
    'kb_no_exc.': 'kb_no_exc',
    'knucleotide.': 'knucleotide',
    'knucleotide3.': 'knucleotide3',
    'levinson-durbin.': 'levinson-durbin',
    'lexifi-g2pp.': 'lexifi-g2pp',
    'lu-decomposition.': 'lu-decomposition',
    'mandelbrot6.16_000': 'mandelbrot6',
    'matrix_multiplication.1024': 'matrix_mult',
    'menhir.ocamly': 'menhir.ocamly',
    'menhir.sql-parser': 'menhir.sql-parser',
    'menhir.sysver': 'menhir.sysver',
    'minilight.roomfront': 'minilight.roomfront',
    'naive-multilayer.': 'naive-multilayer',
    'nbody.50_000_000': 'nbody',
    'pidigits5.10_000': 'pidigits5',
    'qr-decomposition.': 'qr-decomposition',
    'quicksort.4000000': 'quicksort',
    'regexredux2.': 'regexredux2',
    'revcomp2.': 'revcomp2',
    'sequence_cps.10000': 'sequence_cps',
    'setrip.-enc_-rseed_1067894368': 'setrip',
    'spectralnorm2.5_500': 'spectralnorm2',
    'test_decompress.64_524_288': 'decompress',
    'test_lwt.200': 'test_lwt',
    'thread_ring_lwt_mvar.20_000': 'thread_ring_lwt_mvar',
    'thread_ring_lwt_stream.20_000': 'thread_ring_lwt_stream',
    'yojson_ydump.sample.json': 'yojson_ydump',
    'zarith_pi.5000': 'zarith_pi',
    'LU_decomposition.1024': 'lu_decomposition',
    'floyd_warshall.512': 'floyd_warshall',
}

def remapNames(n):
    """Return the graph label registered for benchmark name ``n``.

    Raises ``KeyError`` when ``n`` has no entry in ``nameMap``.
    """
    return nameMap[n]

def remapVariant(v):
    """Return the graph label for variant string ``v``.

    Variants starting with ``4.06.1+multicore+stw`` are labelled
    ``ParMinor``, other variants starting with ``4.06.1+multicore`` are
    labelled ``ConcMinor`` and everything else is labelled ``Stock``.
    """
    # Ordered from most to least specific: the '+stw' prefix also matches the
    # plain multicore prefix, so it has to be tried first.
    prefix_labels = (
        ('4.06.1+multicore+stw', 'ParMinor'),
        ('4.06.1+multicore', 'ConcMinor'),
    )
    for prefix, label in prefix_labels:
        if v.startswith(prefix):
            return label
    return 'Stock'
    
def sanitizeLabels(df):
    """Replace the ``name`` and ``variant`` columns of ``df`` with graph labels.

    ``df`` is modified in place: names go through ``remapNames`` and variants
    through ``remapVariant``.  Nothing is returned.
    """
    for column, remap in (('name', remapNames), ('variant', remapVariant)):
        df[column] = df[column].apply(remap)

def addBaselines(n,df,topic):
    """Return benchmark label ``n`` with its baseline figure(s) appended.

    The figures are read from the first row of ``df`` whose ``name`` equals
    ``n``:

    * ``"time_secs"`` -- ``btime_secs`` rounded to two decimals:
      ``"<n> (<seconds>)"``
    * ``"gc.top_heap_words"`` -- ``bgc.top_heap_words`` times 8, divided by
      1024 * 1024 and truncated: ``"<n> (<value>)"``
    * ``"gc.major_collections"`` -- ``bgc.major_collections`` and
      ``gc.major_words`` (the latter converted like the heap words):
      ``"<n> (<collections>,<value>)"``

    Raises ``ValueError`` for any other ``topic`` and ``IndexError`` when
    ``df`` has no row named ``n``.
    """
    def first_value(column):
        # Value of `column` in the first row whose name is n.
        return df[column].loc[df['name'] == n].values[0]

    def words_to_mib(words):
        # Assumes 8-byte words, i.e. words * 8 bytes expressed in MiB.
        return int(int(words) * 8 / (1024 * 1024))

    if topic == "time_secs":
        baseline = round(float(first_value('b' + topic)), 2)
        return n + " (" + str(baseline) + ")"
    if topic == "gc.top_heap_words":
        baseline = words_to_mib(first_value('b' + topic))
        return n + " (" + str(baseline) + ")"
    if topic == "gc.major_collections":
        bmajgcs = int(first_value('b' + topic))
        bmajallocsmb = words_to_mib(first_value('gc.major_words'))
        return n + " (" + str(bmajgcs) + "," + str(bmajallocsmb) + ")"
    # An explicit exception instead of assert(False): asserts are stripped
    # under `python -O`, after which the function would silently return None.
    raise ValueError("unsupported topic: " + repr(topic))


def removeDups(a):
    """Return the distinct elements of ``a.values`` as a list.

    Elements keep the order of their first occurrence.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(a.values))

def sortBasedOn(df1,groupby_topic,df2,index_topic):
    """Reorder each ``groupby_topic`` group of ``df1`` to follow ``df2``.

    The target order is the sequence of distinct ``index_topic`` values of
    ``df2`` in order of first appearance.  Every group of ``df1`` is reindexed
    on ``index_topic`` to that order and the groups are concatenated.
    """
    target_order = removeDups(df2[index_topic])
    reordered = [
        group.set_index(index_topic).reindex(index=target_order).reset_index()
        for _, group in df1.groupby(groupby_topic)
    ]
    return pd.concat(reordered)

# Variant that every other variant is normalised against; the same baseline
# the normalised plots earlier in the notebook use.
baseline = '4.06.1+stock'

# Normalised frames for the three exported throughput graphs, with benchmark
# names and variants replaced by their graph labels.
ndf_time = normalise(throughput_df, baseline,'time_secs')
sanitizeLabels(ndf_time)
ndf_size = normalise(throughput_df, baseline,'gc.top_heap_words')
sanitizeLabels(ndf_size)
ndf_majgc = normalise(throughput_df, baseline,'gc.major_collections',additionalTopics=['gc.major_words'])
sanitizeLabels(ndf_majgc)

# Order the benchmarks by normalised time and impose that same order on the
# other two frames.
ndf_time.sort_values(['ntime_secs'],inplace=True)
ndf_size = sortBasedOn(ndf_size,'variant',ndf_time,'name')
ndf_majgc = sortBasedOn(ndf_majgc,'variant',ndf_time,'name')

# Append the baseline figures to the benchmark labels.
ndf_time['name'] = ndf_time['name'].apply(addBaselines,args=(ndf_time,'time_secs'))
ndf_size['name'] = ndf_size['name'].apply(addBaselines,args=(ndf_size,'gc.top_heap_words'))
ndf_majgc['name'] = ndf_majgc['name'].apply(addBaselines,args=(ndf_majgc,'gc.major_collections'))
ndf_majgc

In [None]:
def geo_mean(iterable):
    """Return the geometric mean of the numbers in ``iterable``.

    The values are converted to floats before they are multiplied, so integer
    input cannot silently wrap around in a fixed-width integer product.
    Raises ``ZeroDivisionError`` for an empty sequence.
    """
    a = np.array(iterable, dtype=float)
    return a.prod()**(1.0/len(a))

# Print, for each exported metric, every variant followed by the geometric
# mean of its normalised values.  The loop variables avoid the name `df`,
# which holds the throughput frame used by earlier cells.
for v, group in ndf_time.groupby('variant'):
    print(v)
    print(geo_mean(group['ntime_secs'].values))

for v, group in ndf_size.groupby('variant'):
    print(v)
    print(geo_mean(group['ngc.top_heap_words'].values))

for v, group in ndf_majgc.groupby('variant'):
    print(v)
    print(geo_mean(group['ngc.major_collections'].values))

In [None]:
# Export the normalised-time chart to seq_time.pdf.
_df = pd.DataFrame.copy(ndf_time)
# Bars are drawn with bottom=1, so subtract 1 to make each bar end at the
# normalised value.
_df['ntime_secs'] = _df['ntime_secs'] - 1 # cf [bottom=1]
g = sns.catplot (x='name', y='ntime_secs', hue='variant', data = _df, 
                 kind ='bar', aspect=4, height=3, bottom=1)
g.set_xticklabels(rotation=90)
g.ax.set_ylim(0.75,1.35)
# Put a legend inside the axes (loc=2 is matplotlib's 'upper left') and remove
# the one the grid holds in g._legend.
g.ax.legend(loc=2)
g._legend.remove()
g.ax.set_xlabel("Benchmarks")
g.ax.set_ylabel("Normalized Time")
g.savefig('seq_time.pdf')

In [None]:
# Export the normalised max-heap-size chart to seq_max_heap_size.pdf.
_df = pd.DataFrame.copy(ndf_size)
# Bars are drawn with bottom=1, so subtract 1 to make each bar end at the
# normalised value.
_df['ngc.top_heap_words'] = _df['ngc.top_heap_words'] - 1 # cf [bottom=1]
g = sns.catplot (x='name', y='ngc.top_heap_words', hue='variant', 
                 data = _df, kind ='bar', aspect=4, bottom=1,height=3)
g.set_xticklabels(rotation=90)
# Put a legend inside the axes (loc=3 is matplotlib's 'lower left') and remove
# the one the grid holds in g._legend.
g.ax.legend(loc=3)
g._legend.remove()
g.ax.set_xlabel("Benchmarks")
g.ax.set_ylabel("Normalized Max Heap Size")
g.savefig('seq_max_heap_size.pdf')

In [None]:
# Export the normalised major-GC-count chart to seq_majgc_count.pdf.
_df = pd.DataFrame.copy(ndf_majgc)
# Bars are drawn with bottom=1, so subtract 1 to make each bar end at the
# normalised value.
_df['ngc.major_collections'] = _df['ngc.major_collections'] - 1 # cf [bottom=1]
g = sns.catplot (x='name', y='ngc.major_collections', hue='variant', 
                 data = _df, kind ='bar', aspect=4, bottom=1,height=3)
g.set_xticklabels(rotation=90)
# Put a legend inside the axes (loc=1 is matplotlib's 'upper right') and
# remove the one the grid holds in g._legend.
g.ax.legend(loc=1)
g._legend.remove()
g.ax.set_xlabel("Benchmarks")
g.ax.set_ylabel("Normalized Major GC Count")
g.savefig('seq_majgc_count.pdf')

In [None]:
# Relabel a private copy of the latency data for the exported graphs;
# latency_df itself keeps the raw names and variants.
ldf = latency_df.copy()
sanitizeLabels(ldf)

In [None]:
def plotLatencyAt2(df,at):
    """Bar-plot the ``<at>_latency`` column of ``df`` in the export format.

    Same chart as ``plotLatencyAt`` but with the fixed size ``height=3`` and
    ``aspect=4``: rows ordered by latency, values divided by 1000.0, a
    logarithmic y axis labelled in microseconds.  Returns the seaborn grid.
    """
    column = at + "_latency"
    fdf = df.filter(["name", "variant", column]).sort_values(by=[column])
    fdf[column] = fdf[column] / 1000.0

    g = sns.catplot(x='name', y=column, hue='variant', data=fdf,
                    kind='bar', height=3, aspect=4)
    g.set_xticklabels(rotation=90)
    g.ax.set_ylabel(at + " latency (microseconds)")
    g.ax.set_xlabel("Benchmarks")
    g.ax.set_yscale('log')
    return g

# Export the maximum-latency chart to seq_max_latency.pdf.
g = plotLatencyAt2(ldf,"max")
# Put a legend inside the axes (loc=2 is matplotlib's 'upper left') and remove
# the one the grid holds in g._legend.
g.ax.legend(loc=2)
g._legend.remove()
g.savefig('seq_max_latency.pdf')

In [None]:
# Export the 99.9th-percentile latency chart to seq_999_latency.pdf; the
# percentile is the last element of distr_latency.
ldf = getLatencyAt(ldf,"99.9",-1)
g = plotLatencyAt2(ldf,"99.9")
# Put a legend inside the axes (loc=2 is matplotlib's 'upper left') and remove
# the one the grid holds in g._legend.
g.ax.legend(loc=2)
g._legend.remove()
g.savefig('seq_999_latency.pdf')

In [None]:
ldf.loc[ldf.name == 'menhir.ocamly']