In [1]:
import pandas as pd   
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path
from statistics import mean

In [2]:
def get_subdirectories(directory=""):
    """List the immediate subdirectories of ``./../experiments_data/<directory>``.

    Parameters
    ----------
    directory : str, optional
        Path relative to ``./../experiments_data/`` (e.g. ``"/node_dir"``).
        The default ``""`` lists the data root itself.

    Returns
    -------
    list of str
        One entry per subdirectory, formatted as ``directory + "/" + name``
        and sorted alphabetically so the result does not depend on the order
        in which the filesystem happens to enumerate entries. A missing
        parent directory yields an empty list.
    """
    p = Path("./../experiments_data/" + directory)
    # Test for a real directory instead of guessing from the file extension:
    # a suffix blacklist lets through every regular file that is not .csv/.zip
    # (on Python < 3.11 a trailing-slash glob pattern also matches files).
    # The suffix exclusion is kept so names ending in .csv/.zip stay excluded
    # exactly as before.
    return sorted(
        directory + "/" + item.name
        for item in p.glob('*')
        if item.is_dir() and item.suffix not in ('.csv', '.zip')
    )

def get_duration(dataframe):
    """Return the time spanned by ``dataframe['timestamp']`` in microseconds.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``timestamp`` column of strings formatted as
        ``'%Y-%m-%d %H:%M:%S.%f'``. Because that format is zero-padded and
        ordered from year down to fraction, the smallest/largest string is
        also the earliest/latest instant.

    Returns
    -------
    int
        Total number of microseconds between the earliest and the latest
        timestamp (``0`` when there is a single distinct timestamp).

    Raises
    ------
    ValueError
        If a boundary timestamp does not match the expected format.
    """
    # Earliest sample is the start, latest is the stop.
    start_time = dataframe['timestamp'].min()
    stop_time = dataframe['timestamp'].max()

    start_datetime = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S.%f')
    stop_datetime = datetime.strptime(stop_time, '%Y-%m-%d %H:%M:%S.%f')
    duration = stop_datetime - start_datetime

    # timedelta.microseconds is only the sub-second component (0..999999);
    # combine days, seconds and microseconds to get the full span, using
    # integer arithmetic so no float rounding is involved.
    return (duration.days * 86400 + duration.seconds) * 1_000_000 + duration.microseconds

def preprocess_file(file_path, filename, iteration):
    """Aggregate one run CSV per timestamp and save its CPU/RAM plots.

    Parameters
    ----------
    file_path : str
        Directory holding the CSV; the PNG plots are written there as well.
    filename : str
        Name of the CSV file inside ``file_path``.
    iteration : int
        Run number, used as the prefix of the saved plot file names
        (``<iteration>CPU.png`` and ``<iteration>RAM.png``).

    Returns
    -------
    tuple
        ``(agg_read_data, duration)``: the frame with ``CPU`` and ``RAM``
        summed over rows sharing a ``timestamp``, and the value returned by
        ``get_duration`` for that frame.
    """
    raw_samples = pd.read_csv(file_path + "/" + filename)
    per_timestamp = raw_samples.groupby('timestamp', as_index=False)
    agg_read_data = per_timestamp.agg({"CPU": "sum", "RAM": "sum"})

    # Same figure recipe for both metrics: positional index on the x axis,
    # summed metric on the y axis, red dotted line.
    plot_jobs = (
        (agg_read_data.CPU, "CPU.png"),
        (agg_read_data.RAM, "RAM.png"),
    )
    for series, suffix in plot_jobs:
        figure = plt.figure(figsize=(20,10))
        plt.plot(agg_read_data.index, series, 'r.-')
        plt.savefig(file_path + "/" + str(iteration) + suffix)
        # Close explicitly: this runs once per CSV, so figures would pile up.
        plt.close(figure)

    return agg_read_data, get_duration(agg_read_data)

In [3]:
# First-level directories directly under ./../experiments_data
# (an empty relative path selects the data root itself).
nodes_directories = get_subdirectories(directory="")
nodes_directories

['/11_experiments_data_2021_05_05_21_37',
 '/12_experiments_data_2021_05_05_21_37',
 '/13_experiments_data_2021_05_05_21_37',
 '/20_experiments_data_2021_05_05_21_37']

In [4]:
# Second-level directories: for every first-level directory collect its own
# subdirectories into one flat list of paths relative to the data root.
data_directories = []
for directory in nodes_directories:
    cur_node_subdirectories = get_subdirectories(directory)
    # extend() keeps the list flat, so no separate flattening pass is needed.
    data_directories.extend(cur_node_subdirectories)

data_directories

['/11_experiments_data_2021_05_05_21_37/countDistinctTicketNumber',
 '/12_experiments_data_2021_05_05_21_37/countDistinctTicketNumber',
 '/13_experiments_data_2021_05_05_21_37/countDistinctTicketNumber',
 '/20_experiments_data_2021_05_05_21_37/countDistinctTicketNumber']

In [6]:
# Name of the per-experiment summary written by this cell. It is stored next
# to the raw run CSVs, so it has to be excluded when collecting the inputs;
# otherwise re-running the cell would feed the previous summary (which has no
# 'timestamp' column) back into preprocess_file.
MEAN_DATA_FILENAME = "experiment_mean_data.csv"

for experiment_directory in data_directories:
    path = "./../experiments_data" + experiment_directory
    p = Path(path)

    # Sorted so the run numbering (and therefore the per-run PNG names) is
    # the same on every execution, independent of filesystem listing order.
    run_files = sorted(
        file for file in p.glob('*.csv')
        if file.name != MEAN_DATA_FILENAME
    )
    if not run_files:
        # Nothing to average here; statistics.mean() would raise on an empty
        # list and there would be no CPU/RAM columns to plot.
        print("No run CSV files found in " + path + ", skipping")
        continue

    # Collect every run first and concatenate once, instead of growing the
    # frame with pd.concat inside the loop.
    run_frames = []
    experiments_durations = []
    for iteration, file in enumerate(run_files, start=1):
        agg_run_data, duration = preprocess_file(path, file.name, iteration)
        run_frames.append(agg_run_data)
        experiments_durations.append(duration)

    experiment_duration_mean = mean(experiments_durations)

    # Each run frame carries a fresh positional index, so grouping the
    # concatenation by index averages the k-th sample of every run.
    # Only the numeric metrics are averaged: the string 'timestamp' column
    # cannot be averaged and makes mean() raise on pandas >= 2.0.
    agg_base_data = pd.concat(run_frames)
    agg_base_data = agg_base_data.groupby(agg_base_data.index)[["CPU", "RAM"]].mean()

    # Scalar assignment broadcasts the mean duration to every row.
    agg_base_data['duration'] = experiment_duration_mean

    agg_base_data.to_csv(path + "/" + MEAN_DATA_FILENAME, index=False)

    for metric in ("CPU", "RAM"):
        fig = plt.figure(figsize=(20,10))
        plt.plot(agg_base_data.index, agg_base_data[metric], 'r.-')
        plt.savefig(path + "/" + "avg_" + metric + ".png")
        plt.close(fig)
