# Results Analysis

## Prepare dataset

In [100]:
# Load modules
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [101]:
# Load datasets
# Reads the experiment timing table from a CSV located relative to the working directory.
# NOTE(review): this comment used to name "experiment_times.txt", while the file actually
# read is "experiment_times_temp.csv" — confirm which file is the canonical input.
experiment_times = pd.read_csv("experiment_times_temp.csv")

In [102]:
# Extract variables from datapath
# The dataset path is "_"-separated; positions 1, 3 and 5 are stored as the number of
# points, the dimensionality and the distribution name respectively.
# The cast to str happens first so that the string accessor below always sees text.
experiment_times["datapath"] = experiment_times["datapath"].astype(str)

# Split every path once and pick the fields by position (values stay strings here;
# a path with fewer fields yields NaN for the missing position).
datapath_fields = experiment_times["datapath"].str.split("_")
experiment_times["total_points"] = datapath_fields.str[1]
experiment_times["dim"] = datapath_fields.str[3]
experiment_times["distribution"] = datapath_fields.str[5]

In [103]:
# Check dataset
# A bare DataFrame as the last expression renders the (truncated) table below the cell.
experiment_times

Unnamed: 0,datapath,cores,k,create_rdd_time,skyline_time,topk_time,skyline_topk_time,total_points,dim,distribution
0,./datasets/dataset_10000_points_4_dimension_an...,1,1,0.428020,0.887395,1.342913,0.084593,10000,4,anticorrelated
1,./datasets/dataset_10000_points_4_dimension_an...,1,10,0.024810,0.335729,1.482544,0.049712,10000,4,anticorrelated
2,./datasets/dataset_10000_points_4_dimension_an...,1,50,0.020715,0.281533,1.452980,0.058108,10000,4,anticorrelated
3,./datasets/dataset_10000_points_4_dimension_an...,1,100,0.018949,0.271884,1.409456,0.052319,10000,4,anticorrelated
4,./datasets/dataset_10000_points_4_dimension_an...,2,1,0.014521,0.219586,0.377639,0.044542,10000,4,anticorrelated
...,...,...,...,...,...,...,...,...,...,...
66,./datasets/dataset_1000000_points_10_dimension...,1,50,0.011504,165.874767,1004.872146,0.238456,1000000,10,correlated
67,./datasets/dataset_1000000_points_10_dimension...,1,100,0.013670,170.203160,1570.843498,628.440660,1000000,10,correlated
68,./datasets/dataset_1000000_points_10_dimension...,2,1,0.027075,117.844848,1832.319058,0.271713,1000000,10,correlated
69,./datasets/dataset_1000000_points_10_dimension...,2,10,0.011701,132.338863,645.907175,0.204258,1000000,10,correlated


In [104]:
# Inspect column dtypes: the fields parsed from datapath are still strings (object) here
experiment_times.dtypes

datapath              object
cores                  int64
k                      int64
create_rdd_time      float64
skyline_time         float64
topk_time            float64
skyline_topk_time    float64
total_points          object
dim                   object
distribution          object
dtype: object

In [105]:
# Fix data types
# total_points and dim were parsed out of the path as text; convert both to integers
experiment_times = experiment_times.astype({"total_points": int, "dim": int})

## Experiments

**Degrees of Freedom in Experimental design**

1. Number of cores (cores: {1, 2, 4, 8})
2. k - number of top-k elements to return (k: {1, 10, 50, 100})
3. Dataset points - number of rows (total_points: {10000, 100000, 1000000})
4. Dataset dimensions - number of columns (dim: {1, 4, 10, 50})
5. Distribution (distribution: {uniform, normal, correlated, anticorrelated})

In [106]:
# Total experiments
# The row count of experiment_times is reported as the number of experiments
print(f"Total experiments: {experiment_times.shape[0]}")

Total experiments: 71


In [107]:
# cores
# Number of experiment rows recorded for each core count
experiment_times["cores"].value_counts()

1    20
2    19
4    16
8    16
Name: cores, dtype: int64

In [108]:
# k (topk)
# Number of experiment rows recorded for each value of k
experiment_times["k"].value_counts()

1      18
10     18
50     18
100    17
Name: k, dtype: int64

In [109]:
# total points
# Number of experiment rows recorded for each dataset size
experiment_times["total_points"].value_counts()

10000      64
1000000     7
Name: total_points, dtype: int64

In [110]:
# dataset dimensions
# Number of experiment rows recorded for each dataset dimensionality
experiment_times["dim"].value_counts()

4     48
10    23
Name: dim, dtype: int64

In [111]:
# distribution
# Number of experiment rows recorded for each data distribution
experiment_times["distribution"].value_counts()

anticorrelated    32
uniform           16
normal            16
correlated         7
Name: distribution, dtype: int64

## Times

**Times**

1. skyline_time (Task 1: skyline calculations)
2. topk_time (Task 2: topk calculations)
3. skyline_topk_time (Task 3: skyline + topk calculations)

In [112]:
# Preview the first rows, including the timing columns studied in this section
experiment_times.head()

Unnamed: 0,datapath,cores,k,create_rdd_time,skyline_time,topk_time,skyline_topk_time,total_points,dim,distribution
0,./datasets/dataset_10000_points_4_dimension_an...,1,1,0.42802,0.887395,1.342913,0.084593,10000,4,anticorrelated
1,./datasets/dataset_10000_points_4_dimension_an...,1,10,0.02481,0.335729,1.482544,0.049712,10000,4,anticorrelated
2,./datasets/dataset_10000_points_4_dimension_an...,1,50,0.020715,0.281533,1.45298,0.058108,10000,4,anticorrelated
3,./datasets/dataset_10000_points_4_dimension_an...,1,100,0.018949,0.271884,1.409456,0.052319,10000,4,anticorrelated
4,./datasets/dataset_10000_points_4_dimension_an...,2,1,0.014521,0.219586,0.377639,0.044542,10000,4,anticorrelated


In [113]:
# Select time feature to study
# Name of the timing column used as the y-axis (and in the title) of the plots below
feature_time = "skyline_time"

In [114]:
# Keep only the experiments run on the 10000-point, 4-dimensional datasets
uses_10k_points = experiment_times["total_points"].eq(10000)
uses_4_dims = experiment_times["dim"].eq(4)
experiment_times_subset = experiment_times.loc[uses_10k_points & uses_4_dims]

print(f"Select {len(experiment_times_subset)} experiments")

Select 48 experiments


In [115]:
# Subset experiments
# Look up one configuration: 10000 points, 4 dimensions, anticorrelated data, k=1, 4 cores
experiment_times.query(
    'total_points == 10000 and dim == 4 and distribution == "anticorrelated" and k == 1 and cores == 4'
)


Unnamed: 0,datapath,cores,k,create_rdd_time,skyline_time,topk_time,skyline_topk_time,total_points,dim,distribution
8,./datasets/dataset_10000_points_4_dimension_an...,4,1,0.013627,0.189743,0.380768,0.04075,10000,4,anticorrelated


In [116]:
# Scatter plots
# One panel per core count; points are coloured by data distribution.

scatter_title = f"{feature_time} vs k per core | total_points=10000 and dim=4"

fig = px.scatter(
    experiment_times_subset,
    x="k",
    y=feature_time,
    color="distribution",
    facet_col="cores",
)
fig.update_layout(title=scatter_title)

fig.show()

In [117]:
# Line plots
# Same layout as the scatter figure (one panel per core count, one colour per
# distribution), with the points of each distribution joined by a line.

line_title = f"{feature_time} vs k per core | total_points=10000 and dim=4"

fig = px.line(
    experiment_times_subset,
    x="k",
    y=feature_time,
    color="distribution",
    facet_col="cores",
)
fig.update_layout(title=line_title)

fig.show()

## Parallel Coordinates plot

In [118]:
# Fix datatypes for plots
# Each categorical / bucketed column is replaced by a small ordinal code so it can be
# drawn on a numeric axis; distribution_map and total_points_map are reused later as
# tick values / tick labels.
# NOTE: this overwrites the columns of experiment_times in place. Run it exactly once,
# after the cells above that filter on the raw values. `.map(...).astype(int)` raises on
# any value missing from a map (including already-encoded values on a second run)
# instead of silently leaving or re-mapping it.

distribution_map = {"uniform":1, "normal":2, "correlated":3, "anticorrelated":4}
experiment_times["distribution"] = experiment_times["distribution"].map(distribution_map).astype(int)

total_points_map = {10000:1, 100000:2, 1000000:3}
experiment_times["total_points"] = experiment_times["total_points"].map(total_points_map).astype(int)

# Every dimensionality gets a distinct code so that 10 and 50 stay distinguishable after encoding
dim_map = {2:1, 4:2, 10:3, 50:4}
experiment_times["dim"] = experiment_times["dim"].map(dim_map).astype(int)

In [119]:
# Check Data types
# After the encoding step, total_points, dim and distribution should all be integer columns
experiment_times.dtypes

datapath              object
cores                  int64
k                      int64
create_rdd_time      float64
skyline_time         float64
topk_time            float64
skyline_topk_time    float64
total_points           int64
dim                    int64
distribution           int64
dtype: object

In [120]:
# Select time feature to study
# Timing column used for the line colour, the "Time" axis and the title of the
# parallel coordinates plot below
feature_time = "skyline_time"

In [121]:
## Advanced Parallel Coordinates plot

# Timing column that drives both the line colour and the last axis
time_values = experiment_times[feature_time]
time_min, time_max = time_values.min(), time_values.max()

# Categorical features: ordinal codes on the axis, original labels as tick text
categorical_axes = [
    dict(label='Data distribution',
         values=experiment_times['distribution'],
         tickvals=list(distribution_map.values()),
         ticktext=list(distribution_map.keys())),
    dict(label='Total data points',
         values=experiment_times['total_points'],
         tickvals=list(total_points_map.values()),
         ticktext=list(total_points_map.keys())),
]

# Numerical features: cores and k on fixed ranges, time on its observed range
numerical_axes = [
    dict(label="Number of Cores", values=experiment_times['cores'], range=[0, 10]),
    dict(label="k (Number of points to return)", values=experiment_times['k'], range=[0, 100]),
    dict(label="Time", values=time_values, range=[time_min, time_max]),
]

# Axis order: distribution, total_points, cores, k, time
fig = go.Figure(
    data=go.Parcoords(
        # No explicit colorscale is set; the colour bar spans the observed time range
        line=dict(color=time_values, showscale=True, cmin=time_min, cmax=time_max),
        dimensions=categorical_axes + numerical_axes,
    )
)

fig.update_layout(title=f"Experiments: {feature_time} Calculations")

fig.show()