In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import glob

### CPU and Memory Usage

In [None]:
MAX_MEASURES = 15  # keep at most this many (most recent by filename order) runs per method
DIRECTORY_PATH = "memory/"


def load_average_usage(paths: list, max_measures: int) -> dict:
    """Aggregate per-run average memory usage from measurement files.

    Each file name is split on ``"---"``: the first two tags form the test
    key ``(dataset, dataset_size)`` and the third tag is the flattening
    method. The file body is a whitespace-separated list of integers whose
    mean (rounded to 3 decimals) becomes one entry in that method's list.

    Args:
        paths: Paths of the measurement files to read.
        max_measures: Maximum number of averages kept per method; when more
            exist, only the last ``max_measures`` (in sorted path order) stay.

    Returns:
        Nested dict ``{test_key: {method: [avg_1, avg_2, ...]}}``, e.g.::

            {
                ('nasa', '100000'): {
                    'json_dummy': [80.9, 77.25, 56.42],
                    'json_first_list_flattener': [70.76, 76.575, 58.025],
                },
                ('nasa', '1000000'): {
                    'json_dummy': [78.46, 90.97, 80.784, 88.621],
                },
            }

    Raises:
        ValueError: If a file contains a token that is not an integer.
    """
    averages = {}

    # glob/listdir order is arbitrary, so sort to make "the last N runs"
    # reproducible. NOTE(review): this is chronological only if the file
    # names sort chronologically - confirm against the naming scheme.
    for path in sorted(paths):
        filename = os.path.basename(path)
        file_tags = filename.split("---")
        if len(file_tags) < 3:
            # Unrelated .txt file in the directory: report it and move on.
            print(f"Skipping {filename}: expected at least 3 '---'-separated tags")
            continue

        with open(path) as file:
            memory = [int(token) for token in file.read().split()]
        if not memory:
            # An empty file would otherwise cause a division by zero below.
            print(f"Skipping {filename}: no measurements")
            continue

        test_key = (file_tags[0], file_tags[1])  # dataset, dataset_size
        flatter_key = file_tags[2]  # flatter_method
        mem_avg = round(sum(memory) / len(memory), 3)
        averages.setdefault(test_key, {}).setdefault(flatter_key, []).append(mem_avg)

    # Keep only the last `max_measures` averages of every method.
    for per_method in averages.values():
        for method, values in per_method.items():
            per_method[method] = values[max(len(values) - max_measures, 0):]

    return averages


txt_files = sorted(glob.glob(os.path.join(DIRECTORY_PATH, "*.txt")))
files = [os.path.basename(path) for path in txt_files]

# {(dataset, dataset_size): {flatter_method: [average_memory_usage, ...]}}
main_average_dict = load_average_usage(txt_files, MAX_MEASURES)

print("Keys of main_average_dict:")
for key in main_average_dict:
    print(key)


def draw_usage_plot(key: tuple):
    """Draw a box plot of the averaged memory usage for one test key.

    Looks up ``main_average_dict[key]`` (a mapping of method name to a list
    of averaged measurements) and draws one box per method, leaving out the
    ``"json_dummy"`` baseline. Prints a message and returns without plotting
    when the key is unknown, when the baseline has no measurements, or when
    nothing besides the baseline is available.

    Args:
        key: Key of ``main_average_dict`` identifying the test to plot.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    DUMMY_KEY = "json_dummy"

    if key not in main_average_dict:
        print(f"Key {key} not found!")
        return

    # Shallow copy so removing the baseline does not alter the shared dict.
    measurements = dict(main_average_dict[key])

    # pop() with a default: a missing baseline must reach the guard below
    # instead of raising KeyError.
    dummy_measurement = measurements.pop(DUMMY_KEY, None)
    if not dummy_measurement:
        print("No dummy measures!")
        return

    if not measurements:
        # Only the baseline exists; an empty frame cannot be box-plotted.
        print(f"No measurements besides {DUMMY_KEY} for key {key}!")
        return

    # TODO: Subtract from dummy? The baseline is currently required but unused.

    # Series of different lengths are aligned on their index, so shorter
    # columns are padded with NaN automatically (boxplot ignores NaN).
    df_memory_usage = pd.DataFrame({
        method: pd.Series(usage, dtype=float).round(3)
        for method, usage in measurements.items()
    })

    fig, ax = plt.subplots(figsize=(10, 8))
    # Bind the plot to our axes explicitly rather than relying on the
    # implicit "current axes".
    df_memory_usage.boxplot(ax=ax, grid=False)
    ax.set_title("Memory usage")
    ax.set_xlabel("Flattening method")
    ax.set_ylabel("Average memory usage")

    fig.tight_layout()
    plt.show()

In [None]:
# Test keys to plot, in order. Disabled keys are kept as comments so they
# can be switched back on quickly.
for selected_key in (
    # ('airlines', '1000000'),
    # ('movies', '1000000'),
    # ('nasa', '1000000'),
    ('gists', '100000'),
    ('reddit', '10000'),
):
    draw_usage_plot(selected_key)