In [None]:
import duckdb
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sys

# Make the helper module in ../utils importable. The path is resolved against
# the current working directory and appended to sys.path only once, so that
# re-running this cell does not add duplicate entries.
directory_path = os.path.abspath('../utils/')
if directory_path not in sys.path:
    sys.path.append(directory_path)

# NOTE(review): the star import hides which names the notebook takes from
# `functions`; prefer explicit imports once the required names are confirmed.
from functions import *

# Experiment parameters; all of them need to be set before running this notebook.
# EXPERIMENT_ID and EXPERIMENT_START_TIME are passed to retrieve_experiment_df to
# select the experiment; EVENT_IDS is the list passed to filterByEventIds.
# NOTE(review): the expected format of EXPERIMENT_START_TIME is not visible here — confirm.
EXPERIMENT_ID = ""
EXPERIMENT_START_TIME = ""
EVENT_IDS = [""]

# Path to the storage folder where the experiment table(s) were stored.
# NOTE(review): the wording above describes a storage path, while the variable is
# named CLUSTER_NAME and is passed to get_cluster_metrics — confirm the expected value.
CLUSTER_NAME = ""
# Storage type that metrics should be extracted from; currently supported is 'AZURE'.
CLUSTER_TYPE = ""
# Determines which metric to display; currently supported are 'cpu', 'gb_read', and 'gb_written'.
METRIC = ""

In [None]:
# --- Create DB connection --- #
# Open the DuckDB database in read-only mode.
# NOTE(review): DUCKDB_PATH is not defined in this notebook; presumably it is
# provided by `from functions import *` — confirm.
con = duckdb.connect(DUCKDB_PATH, read_only=True)

In [None]:
# --- Data manipulations --- #

# Retrieve the experiment's data from duckdb and pass it straight through the
# event-id filter, so EXP_DATA only ever holds the filtered result.
EXP_DATA = filterByEventIds(
    retrieve_experiment_df(con, EXPERIMENT_ID, EXPERIMENT_START_TIME),
    EVENT_IDS,
)

In [None]:
# --- Cluster metrics extraction --- #
# Build the metrics object for the configured cluster type from the experiment
# data and the cluster name.
cluster_metrics = get_cluster_metrics(CLUSTER_TYPE, EXP_DATA, CLUSTER_NAME)
# Presumably loads the metrics that the plotting cell later reads through
# get_df() — TODO confirm. As the last expression of the cell, any non-None
# return value is displayed as the cell output.
cluster_metrics.fetch_metrics()

In [None]:
# --- Plot the data --- #
# Wide figure with enlarged fonts.
sns.set(rc={'figure.figsize': (20, 6)}, font_scale=2)

# Work on an explicit handle to the current axes (created here if none exists)
# and hide the x-axis ticks and tick labels.
ax = plt.gca()
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

# One line per event_id, showing the selected metric over time.
sns.lineplot(
    data=cluster_metrics.get_df(),
    x="TimeGenerated",
    y=METRIC,
    hue='event_id',
    ax=ax,
)
ax.legend(loc='upper left')
ax.set_ylabel(cluster_metrics.get_label(METRIC))

# Adjust y-axis when showing CPU utilization.
if METRIC == "cpu":
    ax.set_ylim([0, 100])