In [1]:
import pandas as pd   
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os
import shutil
import gc
from pathlib import Path
from tsmoothie.smoother import *
from statistics import mean


# Notebook for preprocessing data from experiments

Defined functions:

In [2]:
def get_subdirectories(directory=""):
    subdirectories = []
    p = Path("./../experiments_data/" + directory)
    for item in p.glob('*/'):
        if item.suffix not in (['.csv', '.zip']):
            subdirectories.append(directory + "/" + item.name)
    return subdirectories

def get_timestamp_info(data):
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    start_time = data['timestamp'].min()
    stop_time = data['timestamp'].max()

    data['timestamp'] = pd.to_numeric(data['timestamp'])
    mean_interval = mean(data.diff(axis=0)['timestamp'][1:].tolist())
    mean_interval = round(mean_interval / 1000000000, 3)

    return (stop_time - start_time).total_seconds(), mean_interval

def preprocess_file(file_path, save_path, filename, iteration):
    read_data = pd.read_csv(file_path + "/" + filename)
    agg_read_data = read_data.\
        groupby('timestamp', as_index=False).\
        agg({"CPU": "sum", "RAM": "sum"})

    generate_plot(agg_read_data.index, agg_read_data.CPU, "{0}/CPU/{1}_CPU.png".format(save_path, iteration))
    generate_plot(agg_read_data.index, agg_read_data.RAM, "{0}/RAM/{1}_RAM.png".format(save_path, iteration))

    return agg_read_data, get_timestamp_info(agg_read_data)

def generate_plot(data_x="", data_y="", plot_path="", title=""):
    fig = plt.figure(figsize=(20,10))
    fig.patch.set_facecolor('white')
    plt.plot(data_x, data_y, 'r.-')
    plt.title(title)
    plt.savefig(plot_path)
    plt.close(fig)

def create_directory(path):
    try:
        os.mkdir(path)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

def smooth_data(mean_data, path, function_name):
        smoother = ConvolutionSmoother(window_len=7, window_type="ones")
        cpu_data = mean_data['CPU']
        ram_data = mean_data['RAM']

        smoother.smooth(cpu_data)
        smooth_cpu_data = smoother.smooth_data[0]
        smoother.smooth(ram_data)
        smooth_ram_data = smoother.smooth_data[0]

        generate_plot(data_x=mean_data.index,
                      data_y=smooth_cpu_data,
                      plot_path="{0}/smooth_mean_CPU.png".format(path),
                      title="Smoothed {0}".format(function_name))
        generate_plot(data_x=mean_data.index,
                      data_y=smooth_ram_data,
                      plot_path="{0}/smooth_mean_RAM.png".format(path),
                      title="Smoothed {0}".format(function_name))

        smooth_data = pd.DataFrame(list(zip(smooth_cpu_data, smooth_ram_data)), columns=['CPU', 'RAM'])
        smooth_data.to_csv("{0}/smooth_mean_data.csv".format(path), index=False)

Check for subdirectories. Number of subdirectories should much the number of nodes used for experiments.

In [3]:
all_directories = get_subdirectories()
nodes_directories = [x for x in all_directories if "node" in x]
nodes_directories

['/node_11',
 '/node_12',
 '/node_13',
 '/node_14',
 '/node_15',
 '/node_16',
 '/node_17',
 '/node_18',
 '/node_19',
 '/node_20']

In [4]:
data_directories = []
data_directories_groups = []
for directory in nodes_directories:
    cur_node_subdirectories = get_subdirectories(directory)
    data_directories.append(cur_node_subdirectories)

data_directories_groups = data_directories
data_directories = [item for sublist in data_directories for item in sublist]
data_directories_groups

[['/node_11/avgNetProfitGroupedBySoldDate',
  '/node_11/avgNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_11/avgNetProfitGroupedBySoldDateWhereYearAfter2000',
  '/node_11/avgWholeSaleCostGroupedBySoldDate',
  '/node_11/countDistinctTicketNumber',
  '/node_11/countNetProfitGroupedBySoldDate',
  '/node_11/countNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_11/countNetProfitGroupedBySoldDateWhereYearAfter2000',
  '/node_11/countWholeSaleCostGroupedBySoldDate',
  '/node_11/filterCatalogSalesWhereProfitNegative',
  '/node_11/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
  '/node_11/filterCatalogSalesWhereYearAfter2000',
  '/node_11/filterStoreSalesWhereProfitNegative',
  '/node_11/filterStoreSalesWhereProfitNegativeAndYearAfter2000',
  '/node_11/filterStoreSalesWhereYearAfter2000',
  '/node_11/maxNetProfitGroupedBySoldDate',
  '/node_11/maxNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_11/maxNetProfitGroupedBySoldDateWhereYearAfter2000',
  '/node_11/m

Check for function names in each node directory.
These names should match the names in `experiments-plan.csv`.


In [5]:
function_names = data_directories_groups[1]
function_names = list(map(lambda x: x[8:], data_directories_groups[1]))
function_names

['/avgNetProfitGroupedBySoldDate',
 '/avgNetProfitGroupedBySoldDateWhereProfitNegative',
 '/avgNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/avgWholeSaleCostGroupedBySoldDate',
 '/countDistinctTicketNumber',
 '/countNetProfitGroupedBySoldDate',
 '/countNetProfitGroupedBySoldDateWhereProfitNegative',
 '/countNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/countWholeSaleCostGroupedBySoldDate',
 '/filterCatalogSalesWhereProfitNegative',
 '/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
 '/filterCatalogSalesWhereYearAfter2000',
 '/filterStoreSalesWhereProfitNegative',
 '/filterStoreSalesWhereProfitNegativeAndYearAfter2000',
 '/filterStoreSalesWhereYearAfter2000',
 '/maxNetProfitGroupedBySoldDate',
 '/maxNetProfitGroupedBySoldDateWhereProfitNegative',
 '/maxNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/maxWholeSaleCostGroupedBySoldDate',
 '/minNetProfitGroupedBySoldDate',
 '/minNetProfitGroupedBySoldDateWhereProfitNegative',
 '/minNetProfitGroupedBySoldDateWhereYearAfter2

### Aggregating data from all nodes (11-19)

In [7]:
master_dir_index = nodes_directories.index('/node_20')
workers_directories = nodes_directories.copy()
del workers_directories[master_dir_index]

workers_directories

['/node_11',
 '/node_12',
 '/node_13',
 '/node_14',
 '/node_15',
 '/node_16',
 '/node_17',
 '/node_18',
 '/node_19']

In [8]:
create_directory("./../experiments_data/preprocessed-data")

mean_dir = "./../experiments_data/preprocessed-data/workers-mean-data"
create_directory(mean_dir)
workers_summary = pd.DataFrame(columns=["function_name",
                                             "mean_duration",
                                             "mean_interval",
                                             "nodes_durations",
                                             "nodes_intervals"])

workers_summary.to_csv("{0}/experiments_mean_duration.csv".format(mean_dir), index=False)

In [9]:
def preprocess_selected_function(f_names, workers_summary):
    for function_name in f_names:
        base_data = pd.DataFrame()
        experiment_duration = []
        experiment_interval = []
        nodes_durations = []
        nodes_intervals = []

        experiment_mean_dir = "{0}{1}".format(mean_dir, function_name)
        create_directory(experiment_mean_dir)

        for node_dir in workers_directories:
            file_path = "./../experiments_data{0}{1}".format(node_dir, function_name)
            p = Path(file_path)
            plots_path = "{0}/plots".format(file_path)

            create_directory(plots_path)
            create_directory(plots_path + "/RAM")
            create_directory(plots_path + "/CPU")

            experiment_number = 1
            node_intervals = []
            node_durations = []

            for file in p.glob('*.csv'):
                new_data, (duration, interval) = preprocess_file(file_path, plots_path, file.name, experiment_number)
                base_data = pd.concat((base_data, new_data))

                experiment_duration.append(duration)
                experiment_interval.append(interval)
                node_intervals.append(interval)
                node_durations.append(duration)

                experiment_number += 1

            print("{0} | {1} durations: {2}".format(function_name[1:], node_dir[1:], node_durations))

            nodes_intervals.append((node_dir[1:], round(mean(node_intervals), 3)))
            nodes_durations.append((node_dir[1:], round(mean(node_durations), 3)))

        mean_interval = round(mean(experiment_interval), 3)

        base_data["timestamp"] = base_data.apply(lambda row: mean_interval * row.name, axis = 1)
        base_data = base_data.groupby(base_data.index).mean()
        base_data.to_csv("{0}/mean_data.csv".format(experiment_mean_dir), index=False)

        generate_plot(base_data.index, base_data.CPU, "{0}/mean_CPU.png".format(experiment_mean_dir), function_name[1:] + " CPU")
        generate_plot(base_data.index, base_data.RAM, "{0}/mean_RAM.png".format(experiment_mean_dir), function_name[1:] + " RAM")

        smooth_data(base_data, experiment_mean_dir, function_name)
        workers_summary = workers_summary.append({"function_name": function_name[1:],
                                                            "mean_duration": round(mean(experiment_duration), 3),
                                                            "mean_interval": mean_interval,
                                                            "nodes_durations": nodes_durations,
                                                            "nodes_intervals": nodes_intervals}, ignore_index=True)

    workers_summary.to_csv("{0}/experiments_mean_duration.csv".format(mean_dir), index=False, mode="a", header=False)

    del base_data
    del experiment_interval
    del experiment_duration
    del nodes_durations
    del nodes_intervals
    del new_data
    del node_intervals
    del node_durations
    del workers_summary
    gc.collect()

In [11]:
print(f"Functions: {len(function_names)}")

Functions: 29


In [12]:
preprocess_selected_function(function_names[0:4], workers_summary)

avgNetProfitGroupedBySoldDate | node_11 durations: [87.822398, 84.907424, 89.099357, 92.924271, 90.079271, 87.966551, 86.321541, 87.894025, 96.055341, 88.994081, 87.994335, 87.981834, 87.967983, 90.823126, 83.805095, 88.98613, 103.145599, 87.082761, 86.949464, 88.944837, 90.099927, 91.035731, 87.89128, 92.125014, 89.149373]
avgNetProfitGroupedBySoldDate | node_12 durations: [87.743202, 84.841107, 88.986009, 92.87524, 89.943532, 87.88085, 86.065347, 87.830646, 95.871999, 88.793743, 87.888862, 87.906723, 87.875244, 90.830752, 83.682032, 88.813933, 103.00797, 87.109781, 86.815343, 88.824717, 89.959431, 91.00031, 87.784691, 92.04767, 89.135653]
avgNetProfitGroupedBySoldDate | node_13 durations: [87.680449, 84.72492, 88.925269, 92.824557, 89.856521, 87.706383, 85.972686, 87.690253, 95.785617, 88.773838, 87.757352, 87.827618, 87.685697, 90.642284, 83.637552, 88.7594, 102.953407, 86.930823, 86.746311, 88.788489, 89.851092, 90.948315, 87.676418, 91.89312, 89.078901]
avgNetProfitGroupedBySoldDa

In [13]:
gc.collect()

4984822

In [14]:
preprocess_selected_function(function_names[4:8], workers_summary)

countDistinctTicketNumber | node_11 durations: [119.615432, 169.66512, 150.642707, 119.347481, 177.201195, 117.461817, 138.665544, 167.926047, 124.360457, 121.237346, 125.470905, 142.61534, 174.384321, 151.413672, 183.159475, 154.724639, 167.942303, 136.478331, 122.650754, 178.237092, 166.845164, 115.219133, 127.364988, 115.329739, 126.579539]
countDistinctTicketNumber | node_12 durations: [119.51689, 169.582475, 150.544528, 119.104442, 177.085709, 117.333894, 138.535104, 167.875872, 124.338815, 121.240257, 125.429545, 142.561693, 174.189775, 151.290455, 183.08601, 154.687037, 167.79898, 136.505282, 122.420959, 178.23657, 166.792416, 115.182259, 127.278564, 115.3066, 126.530138]
countDistinctTicketNumber | node_13 durations: [119.476118, 169.457289, 150.468719, 118.987009, 176.93664, 117.298403, 138.469839, 167.7426, 124.137524, 121.18548, 125.374507, 142.364779, 174.136277, 151.243156, 183.013847, 154.657093, 167.737064, 136.425387, 122.425213, 178.192392, 166.658688, 115.120208, 127.

In [15]:
gc.collect()

4980643

In [16]:
preprocess_selected_function(function_names[8:12], workers_summary)

countWholeSaleCostGroupedBySoldDate | node_11 durations: [87.062889, 91.973628, 87.093185, 92.078315, 86.93812, 85.890847, 89.209882, 91.142132, 86.927334, 86.943104, 89.173888, 99.057538, 88.977095, 87.876296, 88.966295, 89.923414, 90.021583, 88.187119, 83.903123, 90.076528, 88.339159, 83.792643, 85.944721, 89.844022, 90.085871]
countWholeSaleCostGroupedBySoldDate | node_12 durations: [86.940694, 91.794177, 86.977401, 91.966715, 86.797568, 85.730242, 89.113905, 91.015367, 86.812101, 86.816074, 88.982929, 98.9568, 88.794799, 87.773494, 88.825902, 89.914074, 89.915989, 88.136118, 83.712505, 89.91519, 88.205459, 83.697733, 85.781906, 89.769428, 89.990733]
countWholeSaleCostGroupedBySoldDate | node_13 durations: [86.752741, 91.746285, 86.896553, 91.901919, 86.658424, 85.653866, 89.092905, 90.820267, 86.755846, 86.744293, 88.930312, 98.881885, 88.754753, 87.678111, 88.772118, 89.698823, 89.86668, 88.009599, 83.63461, 89.875837, 88.14397, 83.63066, 85.677841, 89.726062, 90.016438]
countWhol

In [17]:
gc.collect()

5059297

In [18]:
preprocess_selected_function(function_names[12:16], workers_summary)

filterStoreSalesWhereProfitNegative | node_11 durations: [28.457978, 28.306069, 29.09073, 28.299063, 28.296141, 28.14698, 26.291294, 26.273867, 27.139326, 28.142481, 27.058048, 27.227028, 27.157048, 29.244512, 28.143678, 27.358965, 26.25543, 28.084672, 26.25813, 28.13543, 28.091728, 28.150212, 27.035579, 29.089944, 29.246602]
filterStoreSalesWhereProfitNegative | node_12 durations: [28.380107, 28.207359, 28.997939, 28.207988, 28.10552, 28.095105, 26.209703, 26.194642, 26.961309, 28.071607, 26.983207, 27.126799, 27.032572, 29.207615, 28.066614, 27.27205, 26.188263, 28.05079, 26.129633, 28.062815, 28.051049, 28.087707, 26.987303, 28.994067, 29.267047]
filterStoreSalesWhereProfitNegative | node_13 durations: [28.166756, 28.100584, 28.962781, 28.023774, 28.028338, 27.876882, 25.859817, 26.009055, 26.902393, 27.871448, 26.936741, 26.936771, 26.9339, 29.125517, 27.873727, 27.244041, 26.001364, 27.880803, 26.000607, 27.954151, 27.991244, 28.032314, 26.937564, 28.969574, 29.077164]
filterStore

In [19]:
gc.collect()

4984525

In [20]:
preprocess_selected_function(function_names[16:20], workers_summary)

maxNetProfitGroupedBySoldDateWhereProfitNegative | node_11 durations: [91.017589, 92.906581, 88.186944, 87.23987, 93.09427, 86.102387, 81.352671, 90.087258, 92.122352, 88.95045, 92.888663, 93.984237, 93.003548, 82.828082, 86.913372, 85.919045, 90.109815, 89.122077, 91.020374, 93.368989, 93.143097, 84.148469, 88.961259, 91.964322, 87.076159]
maxNetProfitGroupedBySoldDateWhereProfitNegative | node_12 durations: [90.859902, 92.891222, 88.079784, 87.098624, 92.96578, 86.021493, 81.172767, 89.948286, 92.0293, 88.803851, 92.778571, 93.94938, 92.882382, 82.751776, 86.857093, 85.865702, 90.066139, 88.97966, 90.858267, 93.191883, 93.033706, 84.002706, 88.836891, 91.939831, 86.953755]
maxNetProfitGroupedBySoldDateWhereProfitNegative | node_13 durations: [90.705653, 92.816703, 87.99171, 87.074365, 92.973658, 85.819745, 81.15618, 89.853373, 91.901565, 88.640758, 92.68175, 93.935113, 92.67743, 82.638257, 86.742056, 85.664592, 89.873516, 88.846729, 90.861278, 93.164281, 92.97136, 83.861212, 88.77721

In [21]:
gc.collect()

4960026

In [22]:
preprocess_selected_function(function_names[20:24], workers_summary)

minNetProfitGroupedBySoldDateWhereProfitNegative | node_11 durations: [93.166971, 93.207447, 96.992611, 82.77677, 89.905805, 94.952717, 85.991355, 79.934533, 89.285473, 89.120706, 87.816385, 87.874629, 87.248448, 87.083491, 89.126807, 94.000364, 93.998703, 90.090427, 80.881657, 84.830854, 90.901664, 83.941054, 87.239084, 88.983936, 93.212161]
minNetProfitGroupedBySoldDateWhereProfitNegative | node_12 durations: [93.05562, 93.022727, 96.931532, 82.753459, 89.757736, 94.899422, 85.864797, 79.766972, 89.145196, 89.123384, 87.750201, 87.786797, 87.069469, 86.954444, 89.048635, 93.944071, 93.960874, 89.988657, 80.725846, 84.71445, 90.892245, 83.833642, 87.204329, 88.880437, 92.974443]
minNetProfitGroupedBySoldDateWhereProfitNegative | node_13 durations: [92.974422, 92.979844, 96.862849, 82.711652, 89.70752, 94.856169, 85.80587, 79.756116, 89.091096, 88.875202, 87.689411, 87.693978, 86.965194, 86.765502, 88.944158, 93.929087, 93.828026, 89.862921, 80.674056, 84.729711, 90.796402, 83.634941, 

In [23]:
gc.collect()

4999538

In [24]:
preprocess_selected_function(function_names[24:28], workers_summary)

summaryWholeSaleCostGroupedBySoldDate | node_11 durations: [104.254034, 104.468931, 110.183837, 108.184488, 104.36198, 105.366118, 112.274545, 107.087601, 121.24367, 106.305051, 107.39919, 111.33435, 109.29825, 103.275957, 108.186001, 110.21912, 106.373728, 112.40959, 106.139356, 104.368241, 105.202911, 106.143396, 108.281347, 104.101395, 105.369098]
summaryWholeSaleCostGroupedBySoldDate | node_12 durations: [104.15796, 104.225392, 110.1985, 108.125228, 104.22207, 105.253133, 112.187407, 107.043745, 121.156377, 106.169955, 107.246472, 111.102801, 109.249989, 103.15347, 108.136078, 110.253066, 106.292577, 112.355972, 106.038496, 104.240051, 105.020843, 106.099882, 108.289198, 104.081509, 105.210156]
summaryWholeSaleCostGroupedBySoldDate | node_13 durations: [104.060943, 104.193571, 110.109059, 107.93795, 104.06189, 105.146416, 111.975292, 107.012023, 121.168886, 106.053607, 107.223422, 111.040964, 109.11236, 103.105663, 107.919734, 110.129432, 106.209253, 112.150549, 105.909091, 104.196

In [25]:
gc.collect()

5012726

In [28]:
preprocess_selected_function(function_names[28:], workers_summary)

sumWholeSaleCostGroupedBySoldDate | node_11 durations: [87.180087, 87.112903, 85.997868, 88.057087, 92.139779, 87.976308, 86.127558, 87.886241, 88.431763, 84.839246, 84.971022, 89.005767, 92.042799, 93.012998, 81.288571, 84.91375, 91.037745, 90.097453, 93.073796, 93.065261, 86.960233, 86.243926, 89.905548, 89.940939, 93.075523]
sumWholeSaleCostGroupedBySoldDate | node_12 durations: [87.111375, 86.971526, 86.025645, 88.029324, 92.074515, 87.74115, 85.965545, 87.78974, 88.344879, 84.775759, 84.900303, 88.853711, 91.954652, 92.867575, 81.197116, 84.774016, 91.000492, 89.912984, 92.980888, 92.897283, 86.841714, 86.013987, 89.812168, 89.908776, 93.045756]
sumWholeSaleCostGroupedBySoldDate | node_13 durations: [87.075858, 86.890652, 85.818319, 87.911697, 92.037165, 87.680111, 85.814683, 87.679759, 88.153255, 84.7339, 84.67759, 88.767762, 91.761309, 92.826748, 81.059927, 84.720843, 90.895682, 89.858355, 92.836007, 92.83134, 86.73731, 85.958956, 89.719904, 89.704522, 92.839288]
sumWholeSaleCos

In [None]:
gc.collect()

### Aggregating data from master node #20

In [56]:
create_directory("./../experiments_data/preprocessed-data")

master_dir = "./../experiments_data/preprocessed-data/master-mean-data"
create_directory(master_dir)
master_mean_summary = pd.DataFrame(columns=["function_name",
                                            "mean_duration",
                                            "mean_interval"])

print("Progress:")
for function in function_names:
    files_path = "./../experiments_data/node_20{0}".format(function)
    p = Path(files_path)
    master_data = pd.DataFrame()

    experiment_durations = []
    experiment_intervals = []

    master_experiment_directory = "{0}{1}".format(master_dir, function)
    master_plot_directory = "{0}/plots".format(files_path)

    create_directory(master_experiment_directory)
    create_directory(master_plot_directory)
    create_directory("{0}/CPU".format(master_plot_directory))
    create_directory("{0}/RAM".format(master_plot_directory))

    experiment_number = 1
    for file in p.glob('*.csv'):
        master_new_data, (duration, interval) = preprocess_file(files_path, master_plot_directory, file.name, experiment_number)
        master_data = pd.concat((master_data, master_new_data))
        experiment_durations.append(duration)
        experiment_intervals.append(interval)
        experiment_number += 1

    master_mean_summary = master_mean_summary.append({"function_name": function[1:],
                                                      "mean_duration": round(mean(experiment_durations), 3),
                                                      "mean_interval": round(mean(experiment_intervals), 3)},
                                                     ignore_index=True)

    master_data = master_data.groupby(master_data.index).mean()
    master_data.to_csv("{0}/mean_data.csv".format(master_experiment_directory), index=False)

    generate_plot(master_data.index, master_data.CPU, "{0}/mean_CPU.png".format(master_experiment_directory))
    generate_plot(master_data.index, master_data.RAM, "{0}/mean_RAM.png".format(master_experiment_directory))

    smooth_data(master_data, master_experiment_directory, function[1:])
    print("- {0} - done".format(function[1:]))

    # Deleting variables
    del master_data
    del experiment_intervals
    del experiment_durations
    del master_new_data

master_mean_summary.to_csv("{0}/experiments_mean_duration.csv".format(master_dir), index=False)

del master_mean_summary
gc.collect()

Error: ./../experiments_data/preprocessed-data - Nie można utworzyć pliku, który już istnieje.
Progress:
- avgNetProfitGroupedBySoldDate - done
- avgNetProfitGroupedBySoldDateWhereProfitNegative - done
- avgWholeSaleCostGroupedBySoldDate - done
- countDistinctTicketNumber - done
- countNetProfitGroupedBySoldDate - done
- countNetProfitGroupedBySoldDateWhereProfitNegative - done
- countWholeSaleCostGroupedBySoldDate - done
- filterCatalogSalesWhereProfitNegative - done
- filterCatalogSalesWhereProfitNegativeAndYearAfter2000 - done
- filterCatalogSalesWhereYearAfter2000 - done
- filterStoreSalesWhereProfitNegative - done
- filterStoreSalesWhereProfitNegativeAndYearAfter2000 - done
- filterStoreSalesWhereYearAfter2000 - done
- maxNetProfitGroupedBySoldDate - done
- maxNetProfitGroupedBySoldDateWhereProfitNegative - done
- maxWholeSaleCostGroupedBySoldDate - done
- minNetProfitGroupedBySoldDate - done
- minNetProfitGroupedBySoldDateWhereProfitNegative - done
- minWholeSaleCostGroupedBySold

0

### Preprocessing spark internal metrics

Stage and task metrics

In [119]:
stage_metrics_all = pd.read_csv("./../experiments_data/stage_metrics.csv")
task_metrics_all = pd.read_csv("./../experiments_data/task_metrics.csv")

stage_metrics = stage_metrics_all[["function_name", "stage_id", "num_tasks", "executor_run_time", "result_size"]].copy()
stage_metrics["stage_time"]= stage_metrics_all["completion_time"] - stage_metrics_all["submission_time"]
stage_metrics = stage_metrics.groupby(["function_name", "stage_id"]).agg({
    "stage_time": "mean",
    "num_tasks": "max",
    "executor_run_time": "mean",
    "result_size": "mean"
})

task_metrics = task_metrics_all[["function_name", "stage_id", "task_type"]]
task_metrics = task_metrics.groupby(["function_name", "stage_id"]).agg({"task_type": "unique"})

spark_metrics = task_metrics.join(stage_metrics).reset_index().rename(columns={
    "stage_time": "mean_stage_time",
    "task_type": "task_types",
    "executor_run_time": "mean_executor_run_time",
    "result_size": "mean_result_size"
})

spark_metrics.to_csv("./../experiments_data/preprocessed-data/spark_metrics.csv", index=False)
spark_metrics

Unnamed: 0,function_name,stage_id,task_types,mean_stage_time,num_tasks,mean_executor_run_time,mean_result_size
0,filterCatalogSalesWhereProfitNegative,0,[ResultTask],8648.6,1,5320.68,1474.0
1,filterCatalogSalesWhereProfitNegativeAndYearAf...,0,[ShuffleMapTask],4451.92,1,1138.28,2046.32
2,filterCatalogSalesWhereProfitNegativeAndYearAf...,1,[ShuffleMapTask],9042.4,1,5856.4,2032.36
3,filterCatalogSalesWhereProfitNegativeAndYearAf...,2,[ResultTask],4889.16,200,2714.2,820732.44
4,filterCatalogSalesWhereYearAfter2000,0,[ShuffleMapTask],5665.64,1,3988.4,1995.8
5,filterCatalogSalesWhereYearAfter2000,1,[ShuffleMapTask],4416.72,1,1827.52,2019.08
6,filterCatalogSalesWhereYearAfter2000,2,[ResultTask],7334.56,200,3117.16,808346.2
7,minWholeSaleCostGroupedBySoldDate,0,[ShuffleMapTask],73356.904762,1,69276.612245,2117.789116
8,minWholeSaleCostGroupedBySoldDate,1,[ResultTask],622.564626,200,475.836735,0.0


### Clear preprocessed data

In [42]:
for function_name in function_names:
    for node_dir in nodes_directories:
        try:
            shutil.rmtree('./../experiments_data{}{}/plots'.format(node_dir, function_name))
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))

try:
    shutil.rmtree('./../experiments_data/preprocessed-data')
except OSError as e:
    print("Error: %s - %s." % (e.filename, e.strerror))

Error: ./../experiments_data/node_20/avgNetProfitGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_20/avgNetProfitGroupedBySoldDateWhereProfitNegative/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_11/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_12/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_13/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_14/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_15/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_16/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżk