In [8]:
import pandas as pd   
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os
import shutil
from datetime import datetime
from pathlib import Path
from tsmoothie.smoother import *
from statistics import mean

# Notebook for preprocessing data from experiments

Defined functions:

In [9]:
def get_subdirectories(directory=""):
    subdirectories = []
    p = Path("./../experiments_data/" + directory)
    for item in p.glob('*/'):
        if item.suffix not in (['.csv', '.zip']):
            subdirectories.append(directory + "/" + item.name)
    return subdirectories

def get_timestamp_info(data):
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    start_time = data['timestamp'].min()
    stop_time = data['timestamp'].max()

    data['timestamp'] = pd.to_numeric(data['timestamp'])
    mean_interval = mean(data.diff(axis=0)['timestamp'][1:].tolist())
    mean_interval = round(mean_interval / 1000000000, 3)

    return (stop_time - start_time).total_seconds(), mean_interval

def preprocess_file(file_path, save_path, filename, iteration):
    read_data = pd.read_csv(file_path + "/" + filename)
    agg_read_data = read_data.\
        groupby('timestamp', as_index=False).\
        agg({"CPU": "sum", "RAM": "sum"})

    generate_plot(agg_read_data.index, agg_read_data.CPU, "{0}/CPU/{1}_CPU.png".format(save_path, iteration))
    generate_plot(agg_read_data.index, agg_read_data.RAM, "{0}/RAM/{1}_RAM.png".format(save_path, iteration))

    return agg_read_data, get_timestamp_info(agg_read_data)

def generate_plot(data_x="", data_y="", plot_path="", title=""):
    fig = plt.figure(figsize=(20,10))
    fig.patch.set_facecolor('white')
    plt.plot(data_x, data_y, 'r.-')
    plt.title(title)
    plt.savefig(plot_path)
    plt.close(fig)

def create_directory(path):
    try:
        os.mkdir(path)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

def smooth_data(mean_data, path, function_name):
        smoother = ConvolutionSmoother(window_len=7, window_type="ones")
        cpu_data = mean_data['CPU']
        ram_data = mean_data['RAM']

        smoother.smooth(cpu_data)
        smooth_cpu_data = smoother.smooth_data[0]
        smoother.smooth(ram_data)
        smooth_ram_data = smoother.smooth_data[0]

        generate_plot(data_x=mean_data.index,
                      data_y=smooth_cpu_data,
                      plot_path="{0}/smooth_mean_CPU.png".format(path),
                      title="Smoothed {0}".format(function_name))
        generate_plot(data_x=mean_data.index,
                      data_y=smooth_ram_data,
                      plot_path="{0}/smooth_mean_RAM.png".format(path),
                      title="Smoothed {0}".format(function_name))

        smooth_data = pd.DataFrame(list(zip(smooth_cpu_data, smooth_ram_data)), columns=['CPU', 'RAM'])
        smooth_data.to_csv("{0}/smooth_mean_data.csv".format(path), index=False)

Check for subdirectories. Number of subdirectories should much the number of nodes used for experiments.

In [10]:
all_directories = get_subdirectories()
nodes_directories = [x for x in all_directories if "node" in x]
nodes_directories

['/node_17',
 '/node_20',
 '/node_13',
 '/node_18',
 '/node_19',
 '/node_16',
 '/node_11',
 '/node_15',
 '/node_14',
 '/node_12']

In [11]:
data_directories = []
data_directories_groups = []
for directory in nodes_directories:
    cur_node_subdirectories = get_subdirectories(directory)
    data_directories.append(cur_node_subdirectories)

data_directories_groups = data_directories
data_directories = [item for sublist in data_directories for item in sublist]
data_directories_groups

[['/node_17/minNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_17/countNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_17/sumNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_17/maxNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_17/avgNetProfitGroupedBySoldDateWhereProfitNegative'],
 ['/node_20/minNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_20/countNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_20/sumNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_20/maxNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_20/avgNetProfitGroupedBySoldDateWhereProfitNegative'],
 ['/node_13/minNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_13/countNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_13/sumNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_13/maxNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_13/avgNetProfitGroupedBySoldDateWhereProfitNegative'],
 ['/node_18/minNetProfitGroupedBySoldDateWhere

Check for function names in each node directory.
These names should match the names in `experiments-plan.csv`.


In [12]:
function_names = data_directories_groups[1]
function_names = list(map(lambda x: x[8:], data_directories_groups[1]))
function_names

['/minNetProfitGroupedBySoldDateWhereProfitNegative',
 '/countNetProfitGroupedBySoldDateWhereProfitNegative',
 '/sumNetProfitGroupedBySoldDateWhereProfitNegative',
 '/maxNetProfitGroupedBySoldDateWhereProfitNegative',
 '/avgNetProfitGroupedBySoldDateWhereProfitNegative']

### Aggregating data from all nodes (11-19)

In [20]:
create_directory("./../experiments_data/preprocessed-data")

mean_dir = "./../experiments_data/preprocessed-data/workers-mean-data"
create_directory(mean_dir)
workers_summary = pd.DataFrame(columns=["function_name",
                                             "mean_duration",
                                             "mean_interval",
                                             "nodes_durations",
                                             "nodes_intervals"])

for function_name in function_names:
    base_data = pd.DataFrame()
    experiment_duration = []
    experiment_interval = []
    nodes_durations = []
    nodes_intervals = []

    experiment_mean_dir = "{0}{1}".format(mean_dir, function_name)
    create_directory(experiment_mean_dir)

    for node_dir in nodes_directories:
        file_path = "./../experiments_data{0}{1}".format(node_dir, function_name)
        p = Path(file_path)
        plots_path = "{0}/plots".format(file_path)

        create_directory(plots_path)
        create_directory(plots_path + "/RAM")
        create_directory(plots_path + "/CPU")

        experiment_number = 1
        node_intervals = []
        node_durations = []

        for file in p.glob('*.csv'):
            new_data, (duration, interval) = preprocess_file(file_path, plots_path, file.name, experiment_number)
            base_data = pd.concat((base_data, new_data))

            experiment_duration.append(duration)
            experiment_interval.append(interval)
            node_intervals.append(interval)
            node_durations.append(duration)

            experiment_number += 1

        print("{0} | {1} intervals: {2}".format(function_name[1:], node_dir[1:], node_durations))

        nodes_intervals.append((node_dir[1:], round(mean(node_intervals), 3)))
        nodes_durations.append((node_dir[1:], round(mean(node_durations), 3)))

    base_data = base_data.groupby(base_data.index).mean()
    base_data.to_csv("{0}/mean_data.csv".format(experiment_mean_dir), index=False)

    generate_plot(base_data.index, base_data.CPU, "{0}/mean_CPU.png".format(experiment_mean_dir), function_name[1:] + " CPU")
    generate_plot(base_data.index, base_data.RAM, "{0}/mean_RAM.png".format(experiment_mean_dir), function_name[1:] + " RAM")

    smooth_data(base_data, experiment_mean_dir, function_name[:])
    workers_summary = workers_summary.append({"function_name": function_name[1:],
                                                        "mean_duration": round(mean(experiment_duration), 3),
                                                        "mean_interval": round(mean(experiment_interval), 3),
                                                        "nodes_durations": nodes_durations,
                                                        "nodes_intervals": nodes_intervals}, ignore_index=True)

workers_summary.to_csv("{0}/experiments_mean_duration.csv".format(mean_dir), index=False)

### Aggregating data from master node #20

In [13]:
create_directory("./../experiments_data/preprocessed-data")

master_dir = "./../experiments_data/preprocessed-data/master-mean-data"
create_directory(master_dir)
master_mean_summary = pd.DataFrame(columns=["function_name",
                                            "mean_duration",
                                            "mean_interval"])

print("Progress:")
for directory in data_directories_groups[-1]:
    files_path = "./../experiments_data{0}".format(directory)
    p = Path(files_path)
    master_data = pd.DataFrame()

    experiment_durations = []
    experiment_intervals = []

    master_experiment_directory = "{0}{1}".format(master_dir, directory[8:])
    master_plot_directory = "{0}/plots".format(files_path)

    create_directory(master_experiment_directory)
    create_directory(master_plot_directory)
    create_directory("{0}/CPU".format(master_plot_directory))
    create_directory("{0}/RAM".format(master_plot_directory))

    experiment_number = 1
    for file in p.glob('*.csv'):
        master_new_data, (duration, interval) = preprocess_file(files_path, master_plot_directory, file.name, experiment_number)
        master_data = pd.concat((master_data, master_new_data))
        experiment_durations.append(duration)
        experiment_intervals.append(interval)
        experiment_number += 1

    master_mean_summary = master_mean_summary.append({"function_name": directory[9:],
                                                      "mean_duration": round(mean(experiment_durations), 3),
                                                      "mean_interval": round(mean(experiment_intervals), 3)},
                                                     ignore_index=True)

    master_data = master_data.groupby(master_data.index).mean()
    master_data.to_csv("{0}/mean_data.csv".format(master_experiment_directory), index=False)

    generate_plot(master_data.index, master_data.CPU, "{0}/mean_CPU.png".format(master_experiment_directory))
    generate_plot(master_data.index, master_data.RAM, "{0}/mean_RAM.png".format(master_experiment_directory))

    smooth_data(master_data, master_experiment_directory, directory[9:])
    print("- {0} - done".format(directory[9:]))

master_mean_summary.to_csv("{0}/experiments_mean_duration.csv".format(master_dir), index=False)

Progress:
- minNetProfitGroupedBySoldDateWhereProfitNegative - done
- countNetProfitGroupedBySoldDateWhereProfitNegative - done
- sumNetProfitGroupedBySoldDateWhereProfitNegative - done
- maxNetProfitGroupedBySoldDateWhereProfitNegative - done
- avgNetProfitGroupedBySoldDateWhereProfitNegative - done


### Preprocessing spark internal metrics

Stage and task metrics

In [119]:
stage_metrics_all = pd.read_csv("./../experiments_data/stage_metrics.csv")
task_metrics_all = pd.read_csv("./../experiments_data/task_metrics.csv")

stage_metrics = stage_metrics_all[["function_name", "stage_id", "num_tasks", "executor_run_time", "result_size"]].copy()
stage_metrics["stage_time"]= stage_metrics_all["completion_time"] - stage_metrics_all["submission_time"]
stage_metrics = stage_metrics.groupby(["function_name", "stage_id"]).agg({
    "stage_time": "mean",
    "num_tasks": "max",
    "executor_run_time": "mean",
    "result_size": "mean"
})

task_metrics = task_metrics_all[["function_name", "stage_id", "task_type"]]
task_metrics = task_metrics.groupby(["function_name", "stage_id"]).agg({"task_type": "unique"})

spark_metrics = task_metrics.join(stage_metrics).reset_index().rename(columns={
    "stage_time": "mean_stage_time",
    "task_type": "task_types",
    "executor_run_time": "mean_executor_run_time",
    "result_size": "mean_result_size"
})

spark_metrics.to_csv("./../experiments_data/preprocessed-data/spark_metrics.csv", index=False)
spark_metrics

Unnamed: 0,function_name,stage_id,task_types,mean_stage_time,num_tasks,mean_executor_run_time,mean_result_size
0,filterCatalogSalesWhereProfitNegative,0,[ResultTask],8648.6,1,5320.68,1474.0
1,filterCatalogSalesWhereProfitNegativeAndYearAf...,0,[ShuffleMapTask],4451.92,1,1138.28,2046.32
2,filterCatalogSalesWhereProfitNegativeAndYearAf...,1,[ShuffleMapTask],9042.4,1,5856.4,2032.36
3,filterCatalogSalesWhereProfitNegativeAndYearAf...,2,[ResultTask],4889.16,200,2714.2,820732.44
4,filterCatalogSalesWhereYearAfter2000,0,[ShuffleMapTask],5665.64,1,3988.4,1995.8
5,filterCatalogSalesWhereYearAfter2000,1,[ShuffleMapTask],4416.72,1,1827.52,2019.08
6,filterCatalogSalesWhereYearAfter2000,2,[ResultTask],7334.56,200,3117.16,808346.2
7,minWholeSaleCostGroupedBySoldDate,0,[ShuffleMapTask],73356.904762,1,69276.612245,2117.789116
8,minWholeSaleCostGroupedBySoldDate,1,[ResultTask],622.564626,200,475.836735,0.0


### Clear preprocessed data

In [7]:
for function_name in function_names:
    for node_dir in nodes_directories:
        try:
            shutil.rmtree('./../experiments_data{}{}/plots'.format(node_dir, function_name))
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))

try:
    shutil.rmtree('./../experiments_data/preprocessed-data')
except OSError as e:
    print("Error: %s - %s." % (e.filename, e.strerror))



Error: ./../experiments_data/node_17/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_20/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_13/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_18/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_19/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_16/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_11/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/node_15/minNetProfitGroupedBySoldDateWhereProfitNegative/plots - No such file or directory.
Error: ./../experiments_data/nod