# RQ4 - Can we measure input sensitivity?

To help researchers detect this problem, we propose a metric based on our experiments. This metric is designed to quantify the level of input sensitivity of a performance property of a software system. 
We first need to choose a threshold $\alpha$, representing the maximal proportion of variability due to inputs we can tolerate. 
For instance, if we consider that a 5% loss of performance is acceptable, $\alpha$ is set to $0.05$. 
Then, we define this score of Input Sensitivity as follows:
$IS = \frac{1}{4}*|C_{max} - C_{min}| + \frac{1}{2\alpha}*\min(\frac{Q_{3}}{Q_{1}}-1,\alpha)$
where
$C_{min}$ and $C_{max}$ are the minimal and maximal Spearman correlations, and
$Q_{1}$ and $Q_{3}$ are respectively the first and third quartiles of the performance distribution.

The first part of the expression quantifies how input sensitivity changes the rankings of configurations ($RQ_{1}$ and $RQ_{2}$), and the second part quantifies the actual impact of input sensitivity on performance ($RQ_{3}$). 
Both parts vary from $0$ to $0.5$, so $IS$ ranges between $0$ (no input sensitivity) and $1$ (high input sensitivity). 

In the evaluation, we compute $IS$ for each pair of software system and performance property in our dataset, with $\alpha$ arbitrarily fixed at 10% (i.e., $\alpha = 0.1$).


#### First, we import some libraries

In [1]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# Decision Tree
from sklearn.tree import DecisionTreeRegressor, plot_tree
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# get interactions of features
from sklearn.preprocessing import PolynomialFeatures


# we use it to interact with the file system
import os
# compute time
from time import time

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# no warning
import warnings
warnings.filterwarnings("ignore")

### Import data

In [None]:
# Software systems considered in this notebook.
name_systems = [
    "nodejs",
    "poppler",
    "xz",
    "x264",
    "gcc",
    "lingeling",
    "imagemagick",
    "sqlite",
]

# For each system, the names of its measured performance properties.
inputs_perf = {
    "gcc": ["size", "ctime", "exec"],
    "imagemagick": ["time"],
    "lingeling": ["conflicts", "cps", "reductions"],
    "nodejs": ["ops"],
    "poppler": ["size", "time"],
    # sqlite has fifteen properties named q1 ... q15
    "sqlite": [f"q{query}" for query in range(1, 16)],
    "x264": ["size", "kbs", "fps", "etime", "cpu"],
    "xz": ["size", "time"],
}


# Hard-coded value for each (system, performance property) pair.
# NOTE(review): the name suggests a median (Q2) of a ratio distribution -
# confirm where these numbers come from.
q2 = {
    ("gcc", "ctime"): 1.09,
    ("gcc", "exec"): 1.07,
    ("gcc", "size"): 1.01,
    ("imagemagick", "time"): 1.02,
    ("imagemagick", "size"): 1.00,
    ("lingeling", "conflicts"): 1.05,
    ("lingeling", "cps"): 1.06,
    ("lingeling", "reductions"): 1.04,
    ("nodejs", "ops"): 1.08,
    ("poppler", "size"): 1.0,
    ("poppler", "time"): 1.15,
    ("sqlite", "q1"): 1.02,
    ("sqlite", "q10"): 1.02,
    ("sqlite", "q11"): 1.02,
    ("sqlite", "q12"): 1.04,
    ("sqlite", "q13"): 1.02,
    ("sqlite", "q14"): 1.03,
    ("sqlite", "q15"): 1.03,
    ("sqlite", "q2"): 1.03,
    ("sqlite", "q3"): 1.01,
    ("sqlite", "q4"): 1.03,
    ("sqlite", "q5"): 1.02,
    ("sqlite", "q6"): 1.03,
    ("sqlite", "q7"): 1.01,
    ("sqlite", "q8"): 1.03,
    ("sqlite", "q9"): 1.02,
    ("x264", "cpu"): 1.05,
    ("x264", "etime"): 1.03,
    ("x264", "fps"): 1.04,
    ("x264", "kbs"): 1.12,
    ("x264", "size"): 1.12,
    ("xz", "size"): 1.0,
    ("xz", "time"): 1.03,
}

# (system, performance property) pairs whose correlation bounds have not been
# filled in yet. They are stored as NaN so that the cell parses and so that a
# missing value stays visible downstream instead of being silently invented.
# TODO: replace the NaN placeholders with the measured correlations.
_PENDING_CORRELATION_KEYS = (
    [("sqlite", "q" + str(query)) for query in range(9, 16)]
    + [("x264", perf) for perf in ("cpu", "etime", "fps", "kbs", "size")]
    + [("xz", perf) for perf in ("size", "time")]
)

# Lower correlation bound per (system, performance property); presumably the
# minimal Spearman correlation C_min of the IS formula above - confirm.
cmin = {
    ("gcc", "ctime"): 0.72,
    ("gcc", "exec"): -0.69,
    ("gcc", "size"): 0.48,
    ("imagemagick", "time"): -0.24,
    ("imagemagick", "size"): 0.01,
    ("lingeling", "conflicts"): -0.90,
    ("lingeling", "cps"): -0.89,
    ("lingeling", "reductions"): -0.99,
    ("nodejs", "ops"): -0.87,
    ("poppler", "size"): -1.00,
    ("poppler", "time"): -0.94,
    ("sqlite", "q1"): -0.78,
    ("sqlite", "q2"): -0.58,
    ("sqlite", "q3"): -0.78,
    ("sqlite", "q4"): -0.77,
    ("sqlite", "q5"): -0.80,
    ("sqlite", "q6"): -0.80,
    ("sqlite", "q7"): -0.71,
    ("sqlite", "q8"): 0.03,
}
cmin.update(dict.fromkeys(_PENDING_CORRELATION_KEYS, np.nan))

# Upper correlation bound per (system, performance property); presumably the
# maximal Spearman correlation C_max of the IS formula above - confirm.
cmax = {
    ("gcc", "ctime"): 0.97,
    ("gcc", "exec"): 1.00,
    ("gcc", "size"): 1.00,
    ("imagemagick", "time"): 1.00,
    ("imagemagick", "size"): 1.00,
    ("lingeling", "conflicts"): 0.92,
    ("lingeling", "cps"): 0.93,
    ("lingeling", "reductions"): 1.00,
    ("nodejs", "ops"): 0.95,
    ("poppler", "size"): 1.00,
    ("poppler", "time"): 1.00,
    ("sqlite", "q1"): 0.87,
    ("sqlite", "q2"): 0.94,
    ("sqlite", "q3"): 0.84,
    ("sqlite", "q4"): 0.84,
    ("sqlite", "q5"): 0.81,
    ("sqlite", "q6"): 0.86,
    ("sqlite", "q7"): 0.92,
    ("sqlite", "q8"): 0.96,
}
cmax.update(dict.fromkeys(_PENDING_CORRELATION_KEYS, np.nan))

# RQ3 code and results

# Partial results (i.e. the table of the article)

## Compute the performance ratios.

In [3]:
def get_ratios(ns, perf, nb_samples=10, max_ratio=50):
    """Summarise the performance ratios of system `ns` for property `perf`.

    For every target input, `nb_samples` source inputs are drawn at random
    (with replacement, the target itself may be drawn). Each draw yields the
    ratio s1 / s2 where:

    * s1 is the maximum of the target's `perf` column;
    * s2 is the target's `perf` value at the row where the *source's*
      `perf` column is maximal.

    The ratios of one target are averaged, and the distribution of these
    per-target averages is summarised.

    Reads the module-level `data` and `inputs_count` dictionaries and uses
    NumPy's global random state, so results vary between runs unless the
    caller seeds it (e.g. with `np.random.seed`).

    Parameters
    ----------
    ns : key of the system in `inputs_count` / `data`.
    perf : name of the performance column to read.
    nb_samples : number of random source inputs drawn per target (default 10).
    max_ratio : ratios whose integer part exceeds this value are dropped
        (default 50).

    Returns
    -------
    tuple of 7 floats: mean, standard deviation, 5th percentile, first
    quartile, median, third quartile and 95th percentile of the per-target
    average ratios (NaN entries ignored).
    """
    nb_inputs = inputs_count[ns]

    ratios = []
    for index_target in range(nb_inputs):
        target_perf = data[ns, index_target][perf]
        s1 = np.max(target_perf)

        list_ratios = []
        for _ in range(nb_samples):
            index_source = np.random.randint(nb_inputs)
            s2 = target_perf[np.argmax(data[ns, index_source][perf])]
            # drop undefined ratios: NaN operands or division by zero
            if np.isnan(s1) or np.isnan(s2) or s2 == 0:
                continue
            current_ratio = s1 / s2
            # drop very high ratios: they only reflect a tiny s2 and inflate
            # the standard deviation. int() truncates, so ratios strictly
            # below max_ratio + 1 are kept.
            if int(current_ratio) <= max_ratio:
                list_ratios.append(current_ratio)

        # a target with no valid ratio contributes NaN, which the nan-aware
        # statistics below skip
        ratios.append(np.nanmean(list_ratios) if list_ratios else np.nan)

    return (np.nanmean(ratios),
            np.nanstd(ratios),
            np.nanpercentile(ratios, 5),
            np.nanpercentile(ratios, 25),
            np.nanmedian(ratios),
            np.nanpercentile(ratios, 75),
            np.nanpercentile(ratios, 95))

## Compute the table of ratios

In [4]:
# Print on stdout the LaTeX source of the performance-ratio table
# (label tab:ratios): one column per (system, performance property) pair and
# one row per statistic returned by get_ratios.
fontsize = "\\footnotesize "
fontsize_number = ""

# Single source of truth for the column order: systems sorted by name, then
# performance properties sorted by name. The column labels AND every row of
# values iterate over this list, so a value can never end up under the label
# of another property.
columns = [(ns, perf)
           for ns in sorted(name_systems)
           for perf in sorted(inputs_perf[ns])]

# column labels: property names truncated to five characters
perfs = [perf[0:5] for _, perf in columns]

print("\\begin{table*}")
print("""\\caption{Performance ratio distributions across inputs, 
      for different software systems and different performance properties. 
      In lines, \\textit{Avg} the average performance ratio. 
      \\textit{Std} the standard deviation. 
      \\textit{$5^{th}$} the $5^{th}$ percentile.
      \\textit{Q1} the first quartile.
      \\textit{Q2} the median.
      \\textit{Q3} the third quartile.
      \\textit{$95^{th}$} the $95^{th}$ percentile.
      Due to space constraints, we arbitrarily select few performance properties.}""")
print("\\label{tab:ratios}")
print("\\vspace*{-0.4cm}")
print("\\begin{tabular}{|"+"c|"*(len(perfs)+1)+"}")
print("\\hline")

# first header line: one multi-column cell per system
print(fontsize_number+"\\textbf{\\textit{System}}")
for ns in sorted(name_systems):
    print(" & \\multicolumn{"+str(len(inputs_perf[ns]))+"}{|c|}{"+fontsize_number+
          "\\cellcolor[HTML]{e8e8e8}{\\textbf{\\textit{"+ns+"}}}}")
print(" \\tabularnewline \\hline")

# second header line: one cell per performance property
print(fontsize_number+"Perf. P")
for p in perfs:
    print(" & "+fontsize+p)
print(" \\tabularnewline \\hline")

# ratio[ns, perf, i] is the i-th statistic of get_ratios, rounded to 2 digits
ratio = dict()
for ns, perf in columns:
    for i, value in enumerate(get_ratios(ns, perf)):
        ratio[ns, perf, i] = np.round(value, 2)

header = ["Avg", "Std", "$5^{th}$", "Q1", "Q2", "Q3", "$95^{th}$"]

# body: one table line per statistic, cells in the same order as the labels
for i, row_name in enumerate(header):
    print(fontsize_number+row_name)
    for ns, perf in columns:
        print(" & "+fontsize_number+str(ratio[ns, perf, i]))
    print(" \\tabularnewline \\hline")

print("\\end{tabular}")
print("\\vspace*{-0.3cm}")
print("\\end{table*}")

\begin{table*}
\caption{Performance ratio distributions across inputs, 
      for different software systems and different performance properties. 
      In lines, \textit{Avg} the avegrae performance ratio. 
      \textit{Std} the standard deviation. 
      \textit{$5^{th}$} the $5^{th}$ percentile.
      \textit{Q1} the first quartile.
      \textit{Q2} the median.
      \textit{Q3} the third quartile.
      \textit{$95^{th}$} the $95^{th}$ percentile.
      Due to space constraints, we arbitrarly select few performance properties.}
\label{tab:ratios}
\vspace*{-0.4cm}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|}
\hline
\textbf{\textit{System}}
 & \multicolumn{3}{|c|}{\cellcolor[HTML]{e8e8e8}{\textbf{\textit{gcc}}}}
 & \multicolumn{2}{|c|}{\cellcolor[HTML]{e8e8e8}{\textbf{\textit{lingeling}}}}
 & \multicolumn{1}{|c|}{\cellcolor[HTML]{e8e8e8}{\textbf{\textit{nodejs}}}}
 & \multicolumn{2}{|c|}{\cellcolor[HTML]{e8e8e8}{\textbf{\textit{poppler}}}}
 & \multicolumn{3}{|c|}{\cell

In [18]:
# Mean of eighteen hard-coded values.
# NOTE(review): presumably the "Avg" row of the table above - confirm.
np.mean([
    1.08, 1.14, 1.24, 2.09, 1.37, 1.76,
    1.6, 2.69, 1.04, 1.08, 1.07, 1.43,
    1.44, 1.1, 1.11, 1.11, 1.0, 1.08,
])

1.357222222222222

# Complete results 

In [15]:
# Root directory of the measurements; one sub-directory per system.
data_dir = "../../../data/"
# Systems whose measurements are loaded below. sqlite is not in this list, so
# no sqlite data is loaded even though inputs_perf has a "sqlite" entry.
name_systems = ["nodejs", "poppler", "xz", "x264", "gcc", "lingeling", "imagemagick"]

# data[ns, i]      -> DataFrame read from the i-th input file of system ns
# inputs_name[ns]  -> sorted list of the input file names of system ns
# inputs_count[ns] -> number of input files of system ns
data = dict()
inputs_name = dict()
inputs_count = dict()

# For each system, the names of its measured performance properties.
inputs_perf = dict()

inputs_perf["gcc"] = ["size", "ctime", "exec"]
inputs_perf["imagemagick"] = ["time"]
inputs_perf["lingeling"] = ["conflicts", "cps", "reductions"]
inputs_perf["nodejs"] = ["ops"]
inputs_perf["poppler"] = ["size", "time"]
inputs_perf["sqlite"] = ["q"+str(i+1) for i in range(15)]
inputs_perf["x264"] = ["size", "kbs", "fps", "etime", "cpu"]
inputs_perf["xz"] = ["size", "time"]


# Presumably the names of the configuration options of each system - confirm.
inputs_feat = dict()

inputs_feat["gcc"] = ["optim","-floop-interchange","-fprefetch-loop-arrays","-ffloat-store","-fno-asm"]
inputs_feat["imagemagick"] = ["memory_r", "posterize_r", "gaussian-blur", "thread", "quality"]
inputs_feat["lingeling"] = ["--boost", "--carduse", "--decompose", "--gluescale", "--lkhd", "--memlim", 
"--minimize", "--prbsimple", "--sweepirr", "--sweepred"]
inputs_feat["nodejs"] = ["--jitless", "--experimental-wasm-modules", "--experimental-vm-modules",
                         "--preserve-symlinks-main","--no-warnings","--node-memory-debug"]
inputs_feat["poppler"] = ["format","j","jp2","jbig2","ccitt"]
inputs_feat["sqlite"] = ["-deserialize", "-memtrace", "-maxsize", "-append", "-output"]
inputs_feat["x264"] = ["cabac", "ref", "deblock", "analyse", "me", "subme", "mixed_ref", "me_range", "trellis", 
                "8x8dct", "fast_pskip", "chroma_qp_offset", "bframes", "b_pyramid", "b_adapt", "direct", 
                "weightb", "open_gop", "weightp", "scenecut", "rc_lookahead", "mbtree", "qpmax", "aq-mode"]
inputs_feat["xz"] = ["memory","format","level","depth"]


# Presumably the subset of inputs_feat that is categorical - confirm.
inputs_categ = dict()

inputs_categ["gcc"] = ["optim"]
inputs_categ["imagemagick"] = []
inputs_categ["lingeling"] = []
inputs_categ["nodejs"] = []
inputs_categ["poppler"] = ["format"]
inputs_categ["sqlite"] = []
inputs_categ["x264"] = ['analyse', 'me', 'direct', 'deblock']
inputs_categ["xz"] = ['memory', 'format']

for ns in name_systems:

    data_path = os.path.join(data_dir, ns)

    # Every entry of the system directory is read as one input, except the
    # entry named 'others'. Filtering (instead of list.remove) does not raise
    # when a system directory has no such entry.
    inputs = sorted(entry for entry in os.listdir(data_path) if entry != 'others')

    inputs_name[ns] = inputs
    inputs_count[ns] = len(inputs)

    # inputs are indexed by their position in the sorted list of names
    for i, input_file in enumerate(inputs):
        data[ns, i] = pd.read_csv(os.path.join(data_path, input_file))

In [16]:
# Print the seven summary statistics of every (system, performance property)
# pair and collect one number per pair in complete_res.
complete_res = []
header = ["Avg", "Std", "$5^{th}$", "Q1", "Q2", "Q3", "$95^{th}$"]
ratio = dict()  # reset here; not filled by this cell
for ns in sorted(name_systems):
    print("Results for",ns)
    for perf in sorted(inputs_perf[ns]):
        print("Results", perf)
        numbers = [np.round(stat, 2) for stat in get_ratios(ns, perf)]
        # NOTE(review): this averages all seven statistics (Avg, Std and the
        # percentiles) into a single number, not only the Avg entry -
        # confirm this is the intended summary.
        complete_res.append(np.mean(numbers))
        # one line per property: " <label> <value>" for each statistic
        disp = "".join(" "+label+" "+str(value)
                       for label, value in zip(header, numbers))
        print(disp)

Results for gcc
Results ctime
 Avg 1.12 Std 0.05 $5^{th}$ 1.04 Q1 1.1 Q2 1.11 Q3 1.14 $95^{th}$ 1.21
Results exec
 Avg 1.46 Std 0.76 $5^{th}$ 1.01 Q1 1.17 Q2 1.27 Q3 1.42 $95^{th}$ 2.11
Results size
 Avg 1.08 Std 0.07 $5^{th}$ 1.0 Q1 1.02 Q2 1.04 Q3 1.11 $95^{th}$ 1.21
Results for imagemagick
Results time
 Avg 1.05 Std 0.04 $5^{th}$ 1.01 Q1 1.02 Q2 1.03 Q3 1.05 $95^{th}$ 1.12
Results for lingeling
Results conflicts
 Avg 2.04 Std 2.55 $5^{th}$ 1.02 Q1 1.06 Q2 1.15 Q3 1.54 $95^{th}$ 6.7
Results cps
 Avg 1.75 Std 1.72 $5^{th}$ 1.02 Q1 1.06 Q2 1.15 Q3 1.48 $95^{th}$ 4.34
Results reductions
 Avg 1.38 Std 0.77 $5^{th}$ 1.0 Q1 1.04 Q2 1.11 Q3 1.29 $95^{th}$ 2.87
Results for nodejs
Results ops
 Avg 1.7 Std 1.64 $5^{th}$ 1.01 Q1 1.09 Q2 1.17 Q3 1.55 $95^{th}$ 4.09
Results for poppler
Results size
 Avg 1.58 Std 1.27 $5^{th}$ 1.0 Q1 1.0 Q2 1.08 Q3 1.54 $95^{th}$ 3.91
Results time
 Avg 2.64 Std 3.47 $5^{th}$ 1.02 Q1 1.14 Q2 1.37 Q3 2.13 $95^{th}$ 9.71
Results for x264
Results cpu
 Avg 1.1 Std 0.14

In [17]:
# Grand mean of the per-property values collected in complete_res.
overall_mean = np.mean(complete_res)
print(overall_mean)

1.3836974789915966
