In [103]:
import pandas as pd   
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os
import shutil
from datetime import datetime
from pathlib import Path
from tabulate import tabulate
from statistics import mean


# Notebook for preprocessing data from experiments

Defined functions:

In [104]:
def get_subdirectories(directory=""):
    """List the immediate subdirectories of ``./../experiments_data/<directory>``.

    Args:
        directory: Path fragment appended to the experiments data root
            (e.g. ``""`` or ``"/node_11"``).

    Returns:
        A sorted list of strings of the form ``directory + "/" + <name>``,
        one per real subdirectory. Regular files are never included,
        whatever their suffix. A missing base directory yields ``[]``.
    """
    base = Path("./../experiments_data/" + directory)
    # Sorting makes the order independent of the filesystem; later cells
    # select entries by position (e.g. "[:-1]" and "[-1]").
    return sorted(
        directory + "/" + item.name
        for item in base.glob('*')
        if item.is_dir()
    )

def get_duration(dataframe):
    """Return the time span covered by ``dataframe['timestamp']`` in microseconds.

    Args:
        dataframe: DataFrame with a ``timestamp`` column of strings in the
            ``'%Y-%m-%d %H:%M:%S.%f'`` format.

    Returns:
        int: Total number of microseconds between the earliest and the latest
        timestamp (0 when there is a single distinct timestamp).

    Raises:
        ValueError: If a selected timestamp does not match the expected format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S.%f'

    # min/max compare the raw strings; for this zero-padded format the
    # lexicographic order equals the chronological order.
    start_time = dataframe['timestamp'].min()
    stop_time = dataframe['timestamp'].max()

    start_datetime = datetime.strptime(start_time, timestamp_format)
    stop_datetime = datetime.strptime(stop_time, timestamp_format)
    duration = stop_datetime - start_datetime

    # timedelta.microseconds is only the sub-second component, so the full
    # span has to be assembled from days, seconds and microseconds.
    return (duration.days * 86400 + duration.seconds) * 1000000 + duration.microseconds

def preprocess_file(file_path, save_path, filename, iteration):
    """Aggregate one experiment CSV per timestamp and plot its CPU/RAM series.

    Args:
        file_path: Directory containing the CSV file.
        save_path: Directory holding the ``CPU`` and ``RAM`` plot folders.
        filename: Name of the CSV file inside ``file_path``.
        iteration: Value used as the prefix of the generated plot file names.

    Returns:
        Tuple ``(aggregated_frame, duration)`` where ``aggregated_frame`` has one
        row per distinct timestamp with summed ``CPU`` and ``RAM``, and
        ``duration`` is the value returned by ``get_duration`` for that frame.
    """
    raw = pd.read_csv("{0}/{1}".format(file_path, filename))
    per_timestamp = raw.groupby('timestamp', as_index=False).agg({"CPU": "sum", "RAM": "sum"})

    # The positional index (0..n-1) is used as the x axis, not the timestamp.
    sample_index = per_timestamp.index
    for metric in ("CPU", "RAM"):
        target = "{0}/{1}/{2}_{1}.png".format(save_path, metric, iteration)
        generate_plot(sample_index, per_timestamp[metric], target)

    return per_timestamp, get_duration(per_timestamp)

def generate_plot(data_x, data_y, plot_path, title=""):
    """Draw ``data_y`` against ``data_x`` and save the figure to ``plot_path``.

    Args:
        data_x: Values for the x axis.
        data_y: Values for the y axis.
        plot_path: Destination path of the image file.
        title: Optional plot title.
    """
    figure, axes = plt.subplots(figsize=(20, 10))
    figure.patch.set_facecolor('white')
    axes.plot(data_x, data_y, 'r.-')
    axes.set_title(title)
    figure.savefig(plot_path)
    # Close explicitly: this is called in loops and open figures accumulate.
    plt.close(figure)

def create_directory(path):
    """Create ``path`` (including missing parents) if it does not exist yet.

    An already existing directory is not an error, so cells can be re-run
    without noise. Any other ``OSError`` (e.g. ``path`` exists as a regular
    file, or permission is denied) is reported on stdout and not re-raised.

    Args:
        path: Directory to create.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

Check for subdirectories. The number of subdirectories should match the number of nodes used for the experiments.

In [105]:
# Everything directly under the experiments data root; only the entries whose
# name contains "node" are kept as node directories.
all_directories = get_subdirectories()
nodes_directories = list(filter(lambda name: "node" in name, all_directories))
nodes_directories

['/node_11',
 '/node_12',
 '/node_13',
 '/node_14',
 '/node_15',
 '/node_16',
 '/node_17',
 '/node_18',
 '/node_19',
 '/node_20']

In [106]:
# One list of experiment directories per node, in the order of nodes_directories.
data_directories_groups = [get_subdirectories(node_dir) for node_dir in nodes_directories]
# The same paths flattened into a single list.
data_directories = [path for group in data_directories_groups for path in group]
data_directories_groups

[['/node_11/filterCatalogSalesWhereProfitNegative',
  '/node_11/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
  '/node_11/filterCatalogSalesWhereYearAfter2000',
  '/node_11/minWholeSaleCostGroupedBySoldDate'],
 ['/node_12/filterCatalogSalesWhereProfitNegative',
  '/node_12/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
  '/node_12/filterCatalogSalesWhereYearAfter2000',
  '/node_12/minWholeSaleCostGroupedBySoldDate'],
 ['/node_13/filterCatalogSalesWhereProfitNegative',
  '/node_13/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
  '/node_13/filterCatalogSalesWhereYearAfter2000',
  '/node_13/minWholeSaleCostGroupedBySoldDate'],
 ['/node_14/filterCatalogSalesWhereProfitNegative',
  '/node_14/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
  '/node_14/filterCatalogSalesWhereYearAfter2000',
  '/node_14/minWholeSaleCostGroupedBySoldDate'],
 ['/node_15/filterCatalogSalesWhereProfitNegative',
  '/node_15/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',


Check for function names in each node directory.
These names should match the names in `experiments-plan.csv`.


In [107]:
# Keep only the trailing "/<function>" component of each path. Cutting at the
# last "/" works for any node name length, unlike a fixed-width slice.
function_names = [path[path.rfind("/"):] for path in data_directories_groups[1]]
function_names

['/filterCatalogSalesWhereProfitNegative',
 '/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
 '/filterCatalogSalesWhereYearAfter2000',
 '/minWholeSaleCostGroupedBySoldDate']

### Aggregating data from all worker nodes (11-19)

In [None]:
# Root output directory; the aggregation and Spark-metrics cells below write into it.
create_directory("./../experiments_data/preprocessed-data")

In [None]:
mean_dir = "./../experiments_data/preprocessed-data/workers-mean-data"
create_directory(mean_dir)

# NOTE(review): only the first function is processed here ([0:1]); iterate over
# all of function_names if every experiment should be aggregated — confirm.
for function_name in function_names[0:1]:
    experiment_mean_dir = "{0}{1}".format(mean_dir, function_name)
    create_directory(experiment_mean_dir)

    # Collect the per-experiment frames and concatenate once after the loops.
    experiment_frames = []

    # The last entry of nodes_directories is skipped here — presumably the
    # master node, which has its own aggregation cell.
    for node_dir in nodes_directories[:-1]:
        file_path = "./../experiments_data{0}{1}".format(node_dir, function_name)
        plots_path = "{0}/plots".format(file_path)
        create_directory(plots_path)
        create_directory(plots_path + "/RAM")
        create_directory(plots_path + "/CPU")

        # Sorted so that the experiment numbers in the plot names are reproducible.
        csv_files = sorted(Path(file_path).glob('*.csv'))
        for experiment_number, file in enumerate(csv_files, start=1):
            new_data, _duration = preprocess_file(file_path, plots_path, file.name, experiment_number)
            experiment_frames.append(new_data)

    if not experiment_frames:
        # Nothing to average; pd.concat would raise on an empty list.
        print("No CSV data found for {0}; skipping.".format(function_name[1:]))
        continue

    base_data = pd.concat(experiment_frames)
    # Average row-by-row position across experiments. numeric_only drops the
    # string 'timestamp' column, which would otherwise make mean() fail.
    base_data = base_data.groupby(base_data.index).mean(numeric_only=True)
    base_data.to_csv("{0}/mean_data.csv".format(experiment_mean_dir), index=False)

    generate_plot(base_data.index, base_data.CPU, "{0}/avg_CPU.png".format(experiment_mean_dir), function_name[1:] + " CPU")
    generate_plot(base_data.index, base_data.RAM, "{0}/avg_RAM.png".format(experiment_mean_dir), function_name[1:] + " RAM")

### Aggregating data from master node #20

In [None]:
master_dir = "./../experiments_data/preprocessed-data/master-mean-data"
create_directory(master_dir)

# The last group of data directories is treated as the master node's.
for directory in data_directories_groups[-1]:
    files_path = "./../experiments_data{0}".format(directory)

    # Trailing "/<function>" component; cutting at the last "/" does not depend
    # on the length of the node directory name.
    function_dir = directory[directory.rfind("/"):]
    master_experiment_directory = "{0}{1}".format(master_dir, function_dir)
    master_plot_directory = "{0}/plots".format(files_path)
    create_directory(master_experiment_directory)
    create_directory(master_plot_directory)
    create_directory("{0}/CPU".format(master_plot_directory))
    create_directory("{0}/RAM".format(master_plot_directory))

    # Collect the per-experiment frames and concatenate once after the loop.
    master_frames = []
    # Sorted so that the experiment numbers in the plot names are reproducible.
    csv_files = sorted(Path(files_path).glob('*.csv'))
    for experiment_number, file in enumerate(csv_files, start=1):
        master_new_data, _duration = preprocess_file(files_path, master_plot_directory, file.name, experiment_number)
        master_frames.append(master_new_data)

    if not master_frames:
        # Nothing to average; pd.concat would raise on an empty list.
        print("No CSV data found in {0}; skipping.".format(files_path))
        continue

    master_data = pd.concat(master_frames)
    # Average row-by-row position across experiments. numeric_only drops the
    # string 'timestamp' column, which would otherwise make mean() fail.
    master_data = master_data.groupby(master_data.index).mean(numeric_only=True)
    master_data.to_csv("{0}/mean_data.csv".format(master_experiment_directory), index=False)

    generate_plot(master_data.index, master_data.CPU, "{0}/avg_CPU.png".format(master_experiment_directory))
    generate_plot(master_data.index, master_data.RAM, "{0}/avg_RAM.png".format(master_experiment_directory))


### Preprocessing spark internal metrics

Stage and task metrics

In [None]:
# Raw stage-level and task-level metric exports.
stage_metrics_all = pd.read_csv("./../experiments_data/stage_metrics.csv")
task_metrics_all = pd.read_csv("./../experiments_data/task_metrics.csv")

group_keys = ["function_name", "stage_id"]

# Per-stage statistics: stage_time is completion_time minus submission_time,
# then everything is reduced per (function_name, stage_id).
stage_columns = group_keys + ["num_tasks", "executor_run_time", "result_size"]
stage_metrics = stage_metrics_all[stage_columns].copy()
stage_metrics["stage_time"] = stage_metrics_all["completion_time"] - stage_metrics_all["submission_time"]
stage_aggregations = {
    "stage_time": "mean",
    "num_tasks": "max",
    "executor_run_time": "mean",
    "result_size": "mean",
}
stage_metrics = stage_metrics.groupby(group_keys).agg(stage_aggregations)

# Distinct task types observed per (function_name, stage_id).
task_metrics = task_metrics_all[group_keys + ["task_type"]]
task_metrics = task_metrics.groupby(group_keys).agg({"task_type": "unique"})

output_column_names = {
    "stage_time": "mean_stage_time",
    "task_type": "task_types",
    "executor_run_time": "mean_executor_run_time",
    "result_size": "mean_result_size",
}
spark_metrics = task_metrics.join(stage_metrics)
spark_metrics = spark_metrics.reset_index()
spark_metrics = spark_metrics.rename(columns=output_column_names)

spark_metrics.to_csv("./../experiments_data/preprocessed-data/spark_metrics.csv", index=False)
spark_metrics

### Clear preprocessed data

In [None]:
# WARNING: this cell deletes the plot folders and the whole preprocessed-data
# directory produced by the cells above; run it only to reset the workspace.
plot_directories = [
    './../experiments_data{}{}/plots'.format(node_dir, function_name)
    for function_name in function_names
    for node_dir in nodes_directories
]

for target in plot_directories + ['./../experiments_data/preprocessed-data']:
    try:
        shutil.rmtree(target)
    except OSError as err:
        # Missing directories are reported, not fatal.
        print("Error: %s - %s." % (err.filename, err.strerror))

