In [None]:
"""Analysis notebook for the Marconi HPC tests.
"""

import sys
import csv
import matplotlib
import matplotlib.pyplot as plt
from cycler import cycler
from matplotlib.cm import get_cmap

# Report which interpreter runs this notebook (first line of the version banner only)
pyVer = sys.version.partition("\n")[0]
print(f"Hello, we are using Python {pyVer}")

# Matplotlib defaults for every figure below: 16x10 figure size at 200 dpi
matplotlib.rcParams.update({
    "figure.figsize": [16, 10],
    "figure.dpi": 200,
})

In [None]:
def to_bool(s):
    """Interpret a string as a boolean.
       Returns True when `s`, compared case-insensitively, is one of
       "true", "yes", "on" or "1"; any other string gives False.
    """
    truthy = ("true", "yes", "on", "1")
    lowered = s.lower()
    return any(lowered == word for word in truthy)

def get_tuple(raw, fields, cond=None, cond_eval="True"):
    """Given a list of dictionaries in `raw`, returns a tuple of lists, one for each
       name in `fields`, holding that field's value for every selected record.
       A record is selected when:
         * it equals `v` for every `k: v` pair of `cond` (a key of `cond` that is
           missing from a record raises `KeyError`), and
         * the Python expression `cond_eval` is true; inside the expression the
           current record is available as `x`.
       Fields missing from a selected record yield `None`.
       When no record is selected a tuple of empty lists (one per field) is
       returned, so the result can always be unpacked.
       Example:
           a,b = get_tuple(plot_raw, ("nJobs", "wallTime"), {"nEvt": 200})
       Gets two lists: a with the list of nJobs, b with the list of wallTime.
       a,b can be passed as arguments to matplotlib plot functions.
       NOTE: `cond_eval` goes through `eval()`; only pass trusted expressions.
    """
    cond = {} if cond is None else cond
    fields = tuple(fields)
    # Parse the expression once instead of once per record
    cond_code = compile(cond_eval, "<cond_eval>", "eval") if isinstance(cond_eval, str) else cond_eval
    columns = tuple([] for _ in fields)
    # The loop variable must be called `x`: `cond_eval` refers to the record by that name
    for x in raw:
        if all(x[k] == v for k, v in cond.items()) and eval(cond_code):
            for column, name in zip(columns, fields):
                column.append(x.get(name))
    return columns

def load_csv(csv_fn, cond_eval="True"):
    """Loads from `csv_fn`, outputs a list of dictionaries (one per CSV record).
       Performs type conversions in the process: "success" and "shMem" go through
       `to_bool`, "cpuEff" becomes a `float`, every other field becomes an `int`.
       Discards records for which the Python expression `cond_eval` is false,
       printing a warning for each of them. Inside the expression the converted
       record is available as `rec` and its "nEvt" value as `nEvt` (`None` when
       the file has no such column).
       The field names are printed once, when the first record is read.
       The result is sorted by the key
           100000 * nEvt + 1000 * nInst + nProc + nJobs
       where fields missing from a record count as 0.
       NOTE: `cond_eval` goes through `eval()`; only pass trusted expressions.
    """
    conv_default = int
    conv_map = { "success": to_bool, "shMem": to_bool, "cpuEff": float }

    def sort_func(x):
        return 100000 * x.get("nEvt", 0) + \
               1000 * x.get("nInst", 0) + \
               x.get("nProc", 0) + x.get("nJobs", 0)

    # Parse the expression once instead of once per record
    cond_code = compile(cond_eval, "<cond_eval>", "eval") if isinstance(cond_eval, str) else cond_eval
    raw = []
    # newline="" lets the csv module do its own newline handling, as its docs require
    with open(csv_fn, newline="") as ch:
        cr = csv.DictReader(ch)
        for rec_no, row in enumerate(cr):
            if rec_no == 0:
                print(f"Keys from {csv_fn}: {', '.join(row.keys())}")
            # Convert field types
            rec = {f: conv_map.get(f, conv_default)(v) for f, v in row.items()}
            # `rec` and `nEvt` are the names `cond_eval` may refer to; use .get() so
            # that files without an nEvt column can still be loaded
            nEvt = rec.get("nEvt")
            if eval(cond_code):
                raw.append(rec)
            else:
                print(f"WARNING: discarded: {dict(rec)}")

    raw.sort(key=sort_func)
    return raw

In [None]:
# Plot wall time vs. number of processes
# Only records whose "success" field is true are kept
scale_raw = load_csv("scalability_proc.csv", "rec['success']")
scale_fig, scale_ax = plt.subplots()
# Calling the Cycler gives an endless iterator: the palette wraps around instead of
# raising StopIteration once there are more event sets than tab10 colors.
# plt.get_cmap is used because the matplotlib.cm.get_cmap function is deprecated.
cycol = cycler("color", plt.get_cmap("tab10").colors)()
for n_evt in sorted({x["nEvt"] for x in scale_raw}):
    prop = next(cycol)
    # Lines
    x, y = get_tuple(scale_raw, ("nJobs", "wallTime"), {"nEvt": n_evt})
    scale_ax.plot(x, y, "-", label=f"{n_evt} events", **prop)
    # Plot different dots if shMem (circles) or zmq (crosses)
    for sh_mem, size, marker in ((True, 75, "o"), (False, 100, "x")):
        pts = list(get_tuple(scale_raw, ("nJobs", "wallTime"), {"nEvt": n_evt, "shMem": sh_mem}))
        if pts:  # an event set may have no record for one of the two transports
            scale_ax.scatter(*pts, s=size, marker=marker, **prop)
scale_ax.grid(True)
scale_ax.legend()
scale_ax.set_title("Wall time vs. number of processes (one instance only)")
scale_ax.set_xlabel("processes")
scale_ax.set_ylabel("wall time [s]");
# Drop the temporaries; the loop ones only exist if at least one record was loaded
if scale_raw:
    del x, y, prop, sh_mem, size, marker, pts
del cycol

In [None]:
# Distinct nJobs values found in the scalability records
set(rec["nJobs"] for rec in scale_raw)

In [None]:
# Plot wall time vs. number of instances
# Only records whose nInstOk equals nInst are kept
mult_raw = load_csv("slurm_stats.csv", "rec['nInstOk'] == rec['nInst']")
mult_fig, mult_ax = plt.subplots()
# Calling the Cycler gives an endless iterator: the palette wraps around instead of
# raising StopIteration once there are more event sets than tab10 colors.
# plt.get_cmap is used because the matplotlib.cm.get_cmap function is deprecated.
cycol = cycler("color", plt.get_cmap("tab10").colors)()
for n_evt in sorted({x["nEvt"] for x in mult_raw}):
    x, y = get_tuple(mult_raw, ("nInst", "slurmWallTime"), {"nEvt": n_evt})
    prop = next(cycol)
    # Lines
    mult_ax.plot(x, y, "o-", label=f"{n_evt} events", **prop)
mult_ax.grid(True)
mult_ax.legend()
# The title reports nProc of the first record; guard against every record being discarded
mult_n_proc = mult_raw[0]["nProc"] if mult_raw else "?"
mult_ax.set_title(f"Wall time vs. number of instances ({mult_n_proc} processes per instance)")
mult_ax.set_xlabel("instances")
mult_ax.set_ylabel("wall time [s]");
# Drop the temporaries; the loop ones only exist if at least one record was loaded
if mult_raw:
    del x, y, prop
del cycol, mult_n_proc

In [None]:
# Performance data over time for every job
psmon_raw = load_csv("psmon.csv")

psmon_fig = []
psmon_ax = []
for n_evt in sorted({x["nEvt"] for x in psmon_raw}):
    # For each event set, do a different plot
    fig, ax = plt.subplots()
    psmon_fig.append(fig)
    psmon_ax.append(ax)
    # Calling the Cycler gives an endless iterator: the palette wraps around instead of
    # raising StopIteration once there are more instance counts than tab20 colors.
    # plt.get_cmap is used because the matplotlib.cm.get_cmap function is deprecated.
    cycol = cycler("color", plt.get_cmap("tab20").colors)()
    for n_inst in sorted({x["nInst"] for x in psmon_raw if x["nEvt"] == n_evt}):
        # For each number of instances
        x, y = get_tuple(psmon_raw, ("elapsed", "cpuEff"), {"nInst": n_inst, "nEvt": n_evt})
        prop = next(cycol)
        # Lines
        ax.plot(x, y, "-", marker="o", markersize=4, label=f"{n_inst} instances", **prop)
    ax.grid(True)
    ax.legend()
    # nProc is read from the first record of the whole file
    ax.set_title(f"Trend of CPU efficiency ({n_evt} events, {psmon_raw[0]['nProc']} processes per instance)")
    ax.set_xlabel("wall time [s]")
    ax.set_ylabel("CPU efficiency");
# The loop temporaries only exist if at least one record was loaded
if psmon_raw:
    del ax, fig, x, y, prop, cycol, n_inst, n_evt