In [1]:
import pandas as pd   
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os
import shutil
import gc
from pathlib import Path
from tsmoothie.smoother import *
from statistics import mean

# Notebook for preprocessing data from experiments

Set working directory. Default is `experiments_data`.

In [35]:
working_directory = "1GB-9N"

In [36]:
def get_subdirectories(directory=""):
    subdirectories = []
    p = Path(f"./../{working_directory}/{directory}")
    for item in p.glob('*/'):
        if item.suffix not in (['.csv', '.zip']):
            subdirectories.append(directory + "/" + item.name)
    return subdirectories

def get_timestamp_info(data):
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    start_time = data['timestamp'].min()
    stop_time = data['timestamp'].max()

    data['timestamp'] = pd.to_numeric(data['timestamp'])
    mean_interval = mean(data.diff(axis=0)['timestamp'][1:].tolist())
    mean_interval = round(mean_interval / 1000000000, 3)

    return (stop_time - start_time).total_seconds(), mean_interval

def preprocess_file(file_path, save_path, filename, iteration):
    read_data = pd.read_csv(file_path + "/" + filename)
    agg_read_data = read_data.\
        groupby('timestamp', as_index=False).\
        agg({"CPU": "sum", "RAM": "sum"})

    generate_plot(agg_read_data.index, agg_read_data.CPU, "{0}/CPU/{1}_CPU.png".format(save_path, iteration))
    generate_plot(agg_read_data.index, agg_read_data.RAM, "{0}/RAM/{1}_RAM.png".format(save_path, iteration))

    return agg_read_data, get_timestamp_info(agg_read_data)

def generate_plot(data_x="", data_y="", plot_path="", title=""):
    fig = plt.figure(figsize=(20,10))
    fig.patch.set_facecolor('white')
    plt.plot(data_x, data_y, 'r.-')
    plt.title(title)
    plt.savefig(plot_path)
    plt.close(fig)

def create_directory(path):
    try:
        os.mkdir(path)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

def smooth_data(mean_data, path, function_name):
        smoother = ConvolutionSmoother(window_len=7, window_type="ones")
        cpu_data = mean_data['CPU']
        ram_data = mean_data['RAM']

        smoother.smooth(cpu_data)
        smooth_cpu_data = smoother.smooth_data[0]
        smoother.smooth(ram_data)
        smooth_ram_data = smoother.smooth_data[0]

        generate_plot(data_x=mean_data.index,
                      data_y=smooth_cpu_data,
                      plot_path="{0}/smooth_mean_CPU.png".format(path),
                      title="Smoothed {0}".format(function_name))
        generate_plot(data_x=mean_data.index,
                      data_y=smooth_ram_data,
                      plot_path="{0}/smooth_mean_RAM.png".format(path),
                      title="Smoothed {0}".format(function_name))

        smooth_data = pd.DataFrame(list(zip(smooth_cpu_data, smooth_ram_data)), columns=['CPU', 'RAM'])
        smooth_data.to_csv("{0}/smooth_mean_data.csv".format(path), index=False)

Check for subdirectories. Number of subdirectories should much the number of nodes used for experiments.

In [37]:
all_directories = get_subdirectories()
nodes_directories = [x for x in all_directories if "node" in x]
nodes_directories

['/node_11',
 '/node_12',
 '/node_13',
 '/node_14',
 '/node_15',
 '/node_16',
 '/node_17',
 '/node_18',
 '/node_19',
 '/node_20']

In [38]:
data_directories = []
data_directories_groups = []
for directory in nodes_directories:
    cur_node_subdirectories = get_subdirectories(directory)
    data_directories.append(cur_node_subdirectories)

data_directories_groups = data_directories
data_directories = [item for sublist in data_directories for item in sublist]
data_directories_groups

[['/node_11/avgNetProfitGroupedBySoldDate',
  '/node_11/avgNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_11/avgNetProfitGroupedBySoldDateWhereYearAfter2000',
  '/node_11/avgWholeSaleCostGroupedBySoldDate',
  '/node_11/countNetProfitGroupedBySoldDate',
  '/node_11/countNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_11/countNetProfitGroupedBySoldDateWhereYearAfter2000',
  '/node_11/countWholeSaleCostGroupedBySoldDate',
  '/node_11/filterCatalogSalesWhereProfitNegative',
  '/node_11/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
  '/node_11/filterCatalogSalesWhereYearAfter2000',
  '/node_11/filterStoreSalesWhereProfitNegative',
  '/node_11/filterStoreSalesWhereProfitNegativeAndYearAfter2000',
  '/node_11/filterStoreSalesWhereYearAfter2000',
  '/node_11/maxNetProfitGroupedBySoldDate',
  '/node_11/maxNetProfitGroupedBySoldDateWhereProfitNegative',
  '/node_11/maxNetProfitGroupedBySoldDateWhereYearAfter2000',
  '/node_11/maxWholeSaleCostGroupedBySoldDate',
  '/n

Check for function names in each node directory.
These names should match the names in `experiments-plan.csv`.


In [39]:
function_names = data_directories_groups[1]
function_names = list(map(lambda x: x[8:], data_directories_groups[1]))
function_names

['/avgNetProfitGroupedBySoldDate',
 '/avgNetProfitGroupedBySoldDateWhereProfitNegative',
 '/avgNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/avgWholeSaleCostGroupedBySoldDate',
 '/countNetProfitGroupedBySoldDate',
 '/countNetProfitGroupedBySoldDateWhereProfitNegative',
 '/countNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/countWholeSaleCostGroupedBySoldDate',
 '/filterCatalogSalesWhereProfitNegative',
 '/filterCatalogSalesWhereProfitNegativeAndYearAfter2000',
 '/filterCatalogSalesWhereYearAfter2000',
 '/filterStoreSalesWhereProfitNegative',
 '/filterStoreSalesWhereProfitNegativeAndYearAfter2000',
 '/filterStoreSalesWhereYearAfter2000',
 '/maxNetProfitGroupedBySoldDate',
 '/maxNetProfitGroupedBySoldDateWhereProfitNegative',
 '/maxNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/maxWholeSaleCostGroupedBySoldDate',
 '/minNetProfitGroupedBySoldDate',
 '/minNetProfitGroupedBySoldDateWhereProfitNegative',
 '/minNetProfitGroupedBySoldDateWhereYearAfter2000',
 '/minWholeSaleCostGroupe

### Aggregating data from all nodes (11-19)

In [40]:
master_dir_index = nodes_directories.index('/node_20')
workers_directories = nodes_directories.copy()
del workers_directories[master_dir_index]

workers_directories

['/node_11',
 '/node_12',
 '/node_13',
 '/node_14',
 '/node_15',
 '/node_16',
 '/node_17',
 '/node_18',
 '/node_19']

In [41]:
create_directory(f"./../{working_directory}/preprocessed-data")

mean_dir = f"./../{working_directory}/preprocessed-data/workers-mean-data"
create_directory(mean_dir)
workers_summary = pd.DataFrame(columns=["function_name",
                                             "mean_duration",
                                             "mean_interval",
                                             "nodes_durations",
                                             "nodes_intervals"])

workers_summary.to_csv("{0}/experiments_mean_duration.csv".format(mean_dir), index=False)

In [45]:
def preprocess_selected_function(f_names, workers_summary):
    for function_name in f_names:
        base_data = pd.DataFrame()
        experiment_duration = []
        experiment_interval = []
        nodes_durations = []
        nodes_intervals = []

        experiment_mean_dir = "{0}{1}".format(mean_dir, function_name)
        create_directory(experiment_mean_dir)

        for node_dir in workers_directories:
            file_path = f"./../{working_directory}{node_dir}{function_name}"
            p = Path(file_path)
            plots_path = "{0}/plots".format(file_path)

            create_directory(plots_path)
            create_directory(plots_path + "/RAM")
            create_directory(plots_path + "/CPU")

            experiment_number = 1
            node_intervals = []
            node_durations = []

            for file in p.glob('*.csv'):
                new_data, (duration, interval) = preprocess_file(file_path, plots_path, file.name, experiment_number)
                base_data = pd.concat((base_data, new_data))

                experiment_duration.append(duration)
                experiment_interval.append(interval)
                node_intervals.append(interval)
                node_durations.append(duration)

                experiment_number += 1

            print("{0} | {1} durations: {2}".format(function_name[1:], node_dir[1:], node_durations))

            nodes_intervals.append((node_dir[1:], round(mean(node_intervals), 3)))
            nodes_durations.append((node_dir[1:], round(mean(node_durations), 3)))

        mean_interval = round(mean(experiment_interval), 3)

        base_data["timestamp"] = base_data.apply(lambda row: mean_interval * row.name, axis = 1)
        base_data = base_data.groupby(base_data.index).mean()
        base_data.to_csv("{0}/mean_data.csv".format(experiment_mean_dir), index=False)

        generate_plot(base_data.index, base_data.CPU, "{0}/mean_CPU.png".format(experiment_mean_dir), function_name[1:] + " CPU")
        generate_plot(base_data.index, base_data.RAM, "{0}/mean_RAM.png".format(experiment_mean_dir), function_name[1:] + " RAM")

        # smooth_data(base_data, experiment_mean_dir, function_name)
        workers_summary = workers_summary.append({"function_name": function_name[1:],
                                                            "mean_duration": round(mean(experiment_duration), 3),
                                                            "mean_interval": mean_interval,
                                                            "nodes_durations": nodes_durations,
                                                            "nodes_intervals": nodes_intervals}, ignore_index=True)

    workers_summary.to_csv("{0}/experiments_mean_duration.csv".format(mean_dir), index=False, mode="a", header=False)

    del base_data
    del experiment_interval
    del experiment_duration
    del nodes_durations
    del nodes_intervals
    del new_data
    del node_intervals
    del node_durations
    del workers_summary
    gc.collect()

In [46]:
print(f"Functions: {len(function_names)}")

Functions: 28


In [47]:
preprocess_selected_function(function_names[0:4], workers_summary)

Error: ./../1GB-9N/preprocessed-data/workers-mean-data/avgNetProfitGroupedBySoldDate - Nie można utworzyć pliku, który już istnieje.
avgNetProfitGroupedBySoldDate | node_11 durations: [88.746793, 87.15337, 87.798018, 91.110898, 77.578098, 91.750292, 92.834669, 118.332791, 120.522102, 88.899015, 85.130526, 121.157775, 114.074639, 85.923986, 119.115364, 85.905887, 88.751661, 120.546455, 87.801847, 85.915134, 85.760761, 88.898936, 119.273165, 89.820416, 88.922587]
avgNetProfitGroupedBySoldDate | node_12 durations: [88.669688, 86.969832, 87.716031, 91.017339, 77.493691, 91.635009, 92.675926, 118.225313, 120.402587, 88.68349, 84.94376, 121.017818, 114.042143, 85.890112, 119.007348, 85.882583, 88.677782, 120.40188, 87.74126, 85.878489, 85.733567, 88.849275, 119.145433, 89.770635, 88.83798]
avgNetProfitGroupedBySoldDate | node_13 durations: [88.575387, 86.853412, 87.649073, 91.063528, 77.37734, 91.680124, 92.612891, 118.104082, 120.311037, 88.567847, 84.82644, 120.945591, 113.828122, 85.78722

In [48]:
gc.collect()

4920196

In [49]:
preprocess_selected_function(function_names[4:8], workers_summary)

countNetProfitGroupedBySoldDate | node_11 durations: [88.738044, 86.969684, 119.584273, 76.784317, 114.074945, 87.014826, 93.150079, 90.953185, 80.64924, 85.921963, 87.792424, 119.272997, 86.871016, 88.913084, 88.919853, 76.643492, 95.993718, 86.705752, 86.853002, 119.272186, 80.663067, 84.903742, 86.860521, 76.668114, 87.807851]
countNetProfitGroupedBySoldDate | node_12 durations: [88.697097, 86.828443, 119.462564, 76.709481, 114.045731, 86.817264, 93.036276, 90.863301, 80.585207, 85.736373, 87.748093, 119.142781, 86.649107, 88.831991, 88.75468, 76.554366, 96.005775, 86.658411, 86.647432, 119.155276, 80.62269, 84.793746, 86.812396, 76.536631, 87.758605]
countNetProfitGroupedBySoldDate | node_13 durations: [88.561652, 86.705133, 119.422304, 76.574038, 113.966008, 86.705314, 93.031098, 90.76121, 80.516586, 85.772517, 87.631529, 119.080566, 86.552554, 88.748862, 88.584643, 76.425512, 95.863903, 86.463269, 86.545012, 119.069816, 80.472312, 84.674609, 86.679846, 76.434462, 87.642321]
count

In [50]:
gc.collect()

4926520

In [51]:
preprocess_selected_function(function_names[8:12], workers_summary)

filterCatalogSalesWhereProfitNegative | node_11 durations: [22.046945, 22.338127, 22.025271, 22.030013, 22.187334, 22.111067, 22.0221, 22.179276, 25.184001, 22.025745, 22.179818, 22.025276, 20.910819, 22.182145, 25.013706, 21.072974, 24.383193, 24.087425, 23.14857, 22.971371, 22.1712, 22.975104, 25.023002, 21.092953, 22.345256]
filterCatalogSalesWhereProfitNegative | node_12 durations: [21.945781, 22.175775, 21.967119, 21.840377, 22.12024, 22.118965, 21.956419, 22.109856, 25.059756, 21.944843, 22.108499, 21.944972, 20.858602, 22.10473, 24.908317, 21.074387, 24.198351, 23.96808, 23.041345, 22.884539, 22.104998, 22.945387, 24.97601, 20.862401, 22.187948]
filterCatalogSalesWhereProfitNegative | node_13 durations: [21.85258, 22.116176, 21.850943, 21.810703, 21.975564, 21.951656, 21.812528, 21.958902, 24.99211, 21.802665, 22.013286, 21.802227, 20.868168, 21.973621, 24.914273, 20.876255, 24.146143, 23.825162, 23.058082, 22.899984, 21.959994, 22.754309, 24.764592, 20.880217, 22.163134]
filter

In [52]:
gc.collect()

5058279

In [53]:
preprocess_selected_function(function_names[12:16], workers_summary)

filterStoreSalesWhereProfitNegativeAndYearAfter2000 | node_11 durations: [31.324907, 38.233243, 32.260057, 32.100293, 32.108474, 32.103139, 32.271414, 32.098345, 33.208976, 38.243107, 33.197188, 34.156858, 32.086542, 34.297735, 32.068076, 32.215758, 36.34493, 32.12056, 32.254742, 37.604413, 32.094517, 33.221824, 38.225943, 31.152463, 29.120499]
filterStoreSalesWhereProfitNegativeAndYearAfter2000 | node_12 durations: [31.290682, 38.119947, 32.219913, 32.107101, 32.072382, 32.024024, 32.212334, 32.065737, 33.152646, 38.130508, 32.990822, 34.090887, 32.006122, 34.077633, 31.90792, 32.067222, 36.252, 32.008576, 32.059986, 37.446197, 31.950786, 33.115435, 38.089071, 31.07603, 28.953024]
filterStoreSalesWhereProfitNegativeAndYearAfter2000 | node_13 durations: [31.136743, 37.936879, 32.0841, 31.875794, 31.916186, 31.854854, 32.131903, 31.860662, 32.978407, 37.977688, 33.005743, 33.942226, 31.919223, 34.095982, 31.884109, 31.921892, 36.23034, 31.915565, 32.004905, 37.27731, 31.91486, 33.011052

In [63]:
gc.collect()

104

In [55]:
preprocess_selected_function(function_names[16:20], workers_summary)

maxNetProfitGroupedBySoldDateWhereYearAfter2000 | node_11 durations: [87.967959, 84.982727, 89.856406, 84.828804, 84.800149, 87.034703, 84.842478, 85.892316, 93.463474, 89.998728, 95.998959, 114.237541, 85.927445, 115.22138, 107.974284, 115.212557, 112.987367, 114.408837, 117.558652, 88.922311, 117.077262, 114.251791, 92.859967, 85.75609, 93.835355]
maxNetProfitGroupedBySoldDateWhereYearAfter2000 | node_12 durations: [87.963924, 84.875943, 89.680869, 84.711123, 84.726209, 86.887573, 84.796599, 85.77157, 93.417431, 89.837752, 95.917396, 114.130428, 85.803061, 115.071909, 107.818517, 115.231823, 113.029551, 114.252741, 117.40492, 88.936943, 117.06432, 113.95177, 92.863538, 85.732152, 93.862691]
maxNetProfitGroupedBySoldDateWhereYearAfter2000 | node_13 durations: [87.880157, 84.733467, 89.572005, 84.574493, 84.577233, 86.834857, 84.721907, 85.669592, 93.326251, 89.87972, 95.788409, 113.999118, 85.685733, 115.065247, 107.84379, 115.06759, 112.903474, 114.184419, 117.278419, 88.779141, 116.

In [56]:
gc.collect()

4928938

In [57]:
preprocess_selected_function(function_names[20:24], workers_summary)

minNetProfitGroupedBySoldDateWhereYearAfter2000 | node_11 durations: [94.09699, 116.312745, 86.879763, 96.092777, 92.869645, 90.15333, 81.987514, 118.64019, 115.034231, 116.13745, 94.894594, 113.151107, 87.835642, 93.966197, 119.138702, 96.95351, 127.456563, 86.860129, 110.298711, 92.825979, 85.977646, 81.68446, 82.781335, 112.023962, 84.812339]
minNetProfitGroupedBySoldDateWhereYearAfter2000 | node_12 durations: [93.894937, 116.137993, 86.878379, 95.910857, 92.803716, 90.136234, 81.758293, 118.475228, 114.901606, 115.997743, 94.83839, 113.050598, 87.662259, 93.755446, 119.08154, 96.806781, 127.35251, 86.72395, 110.150383, 92.813517, 85.855461, 81.625567, 82.694293, 111.882757, 84.636315]
minNetProfitGroupedBySoldDateWhereYearAfter2000 | node_13 durations: [93.925224, 116.066648, 86.74293, 95.796053, 92.686442, 90.026444, 81.738561, 118.49301, 114.812839, 115.885048, 94.659298, 112.763744, 87.534059, 93.63527, 118.866599, 96.741675, 127.353217, 86.626893, 110.038708, 92.67604, 85.84391

In [58]:
gc.collect()

4934527

In [59]:
preprocess_selected_function(function_names[24:28], workers_summary)

sumNetProfitGroupedBySoldDate | node_11 durations: [90.971476, 86.708836, 82.938553, 90.022286, 112.054318, 92.387879, 87.973353, 87.814172, 89.855791, 93.172355, 120.377918, 121.346454, 85.915915, 125.260064, 90.781606, 120.243832, 87.938611, 85.92753, 81.770384, 91.90376, 119.284323, 87.824239, 113.954311, 98.357588, 85.770704]
sumNetProfitGroupedBySoldDate | node_12 durations: [90.868947, 86.68725, 82.833273, 89.886375, 112.018613, 92.289112, 87.929761, 87.766406, 89.768332, 93.042027, 120.257782, 121.199856, 85.799199, 125.09044, 90.703193, 120.25012, 87.749615, 85.897456, 81.694052, 91.800602, 119.16723, 87.737281, 113.871952, 98.33263, 85.763992]
sumNetProfitGroupedBySoldDate | node_13 durations: [90.903426, 86.572165, 82.659532, 89.821139, 111.910653, 91.916749, 87.814029, 87.639001, 89.63837, 92.945582, 120.067905, 121.111858, 85.761196, 124.996932, 90.743778, 120.162591, 87.645318, 85.812894, 81.578361, 91.691025, 119.06858, 87.647164, 113.768506, 98.228545, 85.621661]
sumNetP

In [60]:
gc.collect()

4945014

In [61]:
preprocess_selected_function(function_names[28:], workers_summary)

UnboundLocalError: local variable 'base_data' referenced before assignment

In [None]:
gc.collect()

### Aggregating data from master node #20

In [62]:
create_directory(f"./../{working_directory}/preprocessed-data")

master_dir = f"./../{working_directory}/preprocessed-data/master-mean-data"
create_directory(master_dir)
master_mean_summary = pd.DataFrame(columns=["function_name",
                                            "mean_duration",
                                            "mean_interval"])

print("Progress:")
for function in function_names:
    files_path = f"./../{working_directory}/node_20{function}"
    p = Path(files_path)
    master_data = pd.DataFrame()

    experiment_durations = []
    experiment_intervals = []

    master_experiment_directory = "{0}{1}".format(master_dir, function)
    master_plot_directory = "{0}/plots".format(files_path)

    create_directory(master_experiment_directory)
    create_directory(master_plot_directory)
    create_directory("{0}/CPU".format(master_plot_directory))
    create_directory("{0}/RAM".format(master_plot_directory))

    experiment_number = 1
    for file in p.glob('*.csv'):
        master_new_data, (duration, interval) = preprocess_file(files_path, master_plot_directory, file.name, experiment_number)
        master_data = pd.concat((master_data, master_new_data))
        experiment_durations.append(duration)
        experiment_intervals.append(interval)
        experiment_number += 1

    master_mean_summary = master_mean_summary.append({"function_name": function[1:],
                                                      "mean_duration": round(mean(experiment_durations), 3),
                                                      "mean_interval": round(mean(experiment_intervals), 3)},
                                                     ignore_index=True)

    master_data = master_data.groupby(master_data.index).mean()
    master_data.to_csv("{0}/mean_data.csv".format(master_experiment_directory), index=False)

    generate_plot(master_data.index, master_data.CPU, "{0}/mean_CPU.png".format(master_experiment_directory))
    generate_plot(master_data.index, master_data.RAM, "{0}/mean_RAM.png".format(master_experiment_directory))

    smooth_data(master_data, master_experiment_directory, function[1:])
    print("- {0} - done".format(function[1:]))

    # Deleting variables
    del master_data
    del experiment_intervals
    del experiment_durations
    del master_new_data

master_mean_summary.to_csv("{0}/experiments_mean_duration.csv".format(master_dir), index=False)

del master_mean_summary
gc.collect()

Error: ./../1GB-9N/preprocessed-data - Nie można utworzyć pliku, który już istnieje.
Progress:
Error: ./../1GB-9N/node_20/avgNetProfitGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../1GB-9N/node_20/avgNetProfitGroupedBySoldDate/plots/CPU - System nie może odnaleźć określonej ścieżki.
Error: ./../1GB-9N/node_20/avgNetProfitGroupedBySoldDate/plots/RAM - System nie może odnaleźć określonej ścieżki.


StatisticsError: mean requires at least one data point

### Preprocessing spark internal metrics

Stage and task metrics

In [119]:
stage_metrics_all = pd.read_csv(f"./../{working_directory}/stage_metrics.csv")
task_metrics_all = pd.read_csv(f"./../{working_directory}/task_metrics.csv")

stage_metrics = stage_metrics_all[["function_name", "stage_id", "num_tasks", "executor_run_time", "result_size"]].copy()
stage_metrics["stage_time"]= stage_metrics_all["completion_time"] - stage_metrics_all["submission_time"]
stage_metrics = stage_metrics.groupby(["function_name", "stage_id"]).agg({
    "stage_time": "mean",
    "num_tasks": "max",
    "executor_run_time": "mean",
    "result_size": "mean"
})

task_metrics = task_metrics_all[["function_name", "stage_id", "task_type"]]
task_metrics = task_metrics.groupby(["function_name", "stage_id"]).agg({"task_type": "unique"})

spark_metrics = task_metrics.join(stage_metrics).reset_index().rename(columns={
    "stage_time": "mean_stage_time",
    "task_type": "task_types",
    "executor_run_time": "mean_executor_run_time",
    "result_size": "mean_result_size"
})

spark_metrics.to_csv(f"./../{working_directory}/preprocessed-data/spark_metrics.csv", index=False)
spark_metrics

Unnamed: 0,function_name,stage_id,task_types,mean_stage_time,num_tasks,mean_executor_run_time,mean_result_size
0,filterCatalogSalesWhereProfitNegative,0,[ResultTask],8648.6,1,5320.68,1474.0
1,filterCatalogSalesWhereProfitNegativeAndYearAf...,0,[ShuffleMapTask],4451.92,1,1138.28,2046.32
2,filterCatalogSalesWhereProfitNegativeAndYearAf...,1,[ShuffleMapTask],9042.4,1,5856.4,2032.36
3,filterCatalogSalesWhereProfitNegativeAndYearAf...,2,[ResultTask],4889.16,200,2714.2,820732.44
4,filterCatalogSalesWhereYearAfter2000,0,[ShuffleMapTask],5665.64,1,3988.4,1995.8
5,filterCatalogSalesWhereYearAfter2000,1,[ShuffleMapTask],4416.72,1,1827.52,2019.08
6,filterCatalogSalesWhereYearAfter2000,2,[ResultTask],7334.56,200,3117.16,808346.2
7,minWholeSaleCostGroupedBySoldDate,0,[ShuffleMapTask],73356.904762,1,69276.612245,2117.789116
8,minWholeSaleCostGroupedBySoldDate,1,[ResultTask],622.564626,200,475.836735,0.0


### Clear preprocessed data

In [42]:
for function_name in function_names:
    for node_dir in nodes_directories:
        try:
            shutil.rmtree(f"./../{working_directory}{node_dir}{function_name}/plots")
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))

try:
    shutil.rmtree(f"./../{working_directory}/preprocessed-data")
except OSError as e:
    print("Error: %s - %s." % (e.filename, e.strerror))


Error: ./../experiments_data/node_20/avgNetProfitGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_20/avgNetProfitGroupedBySoldDateWhereProfitNegative/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_11/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_12/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_13/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_14/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_15/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżki.
Error: ./../experiments_data/node_16/avgWholeSaleCostGroupedBySoldDate/plots - System nie może odnaleźć określonej ścieżk

Clear smoothed data

In [32]:
for function_name in function_names:
    try:
        os.remove(f'./../experiments_data/preprocessed-data/workers-mean-data{function_name}/smooth_mean_CPU.png')
        os.remove(f'./../experiments_data/preprocessed-data/workers-mean-data{function_name}/smooth_mean_data.csv')
        os.remove(f'./../experiments_data/preprocessed-data/workers-mean-data{function_name}/smooth_mean_RAM.png')
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))