In [None]:
import os
import re
import sys
import pandas as pd
from collections import defaultdict

Hotfix: set path to use packages in current project

In [52]:
project_root = os.path.join(os.getcwd(), "trace_parser")
data_dir = os.path.join(project_root, "data")
sys.path.append(os.path.join(project_root, "src"))

In [60]:
from darshan_trace_parser.TraceParser import TraceParser
from darshan_trace_parser.custom_types import IOModule, IOModule, IOType

In [54]:
trace_dir = "./NPB-mpi-darshan"
traces = os.listdir(trace_dir)
traces = [os.path.join(trace_dir, trace) for trace in traces]

In [55]:
traces

['./NPB-mpi-darshan/bt.B.1.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.A.16.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.C.16.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.B.25.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.B.9.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.C.1.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.C.25.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.A.9.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.A.4.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.C.4.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.B.4.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.C.9.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.A.1.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.B.16.mpi_io_full.darshan',
 './NPB-mpi-darshan/bt.A.25.mpi_io_full.darshan']

In [56]:
mod_typ_combs = [
    (IOModule.DXT_MPIIO, IOType.READ),
    (IOModule.DXT_MPIIO, IOType.WRITE),
    (IOModule.DXT_POSIX, IOType.READ),
    (IOModule.DXT_POSIX, IOType.WRITE)
]

In [57]:
agg_data = defaultdict(dict)
for trace in traces:
    parsed_trace = TraceParser(trace)
    elapsed_time = parsed_trace.aggregate_op_stat(lambda op: op.duration())
    data_size  = parsed_trace.aggregate_op_stat(lambda op: op.length)
    bandwidth = parsed_trace.aggregate_op_stat(lambda op: op.bandwidth())
    
    # Use name part bt.X.Y of trace as key
    k = re.search(r"(bt\.[A-Z]\.\d{1,2})(?:\.mpi_io_full)?", trace).group(1)

    for mod_typ in mod_typ_combs:
        agg_data[mod_typ].update({
            f"{k}_time": pd.Series(elapsed_time[mod_typ]),
            f"{k}_size": pd.Series(data_size[mod_typ]),
            f"{k}_bandwidth": pd.Series(bandwidth[mod_typ])
        })

In [58]:
# map to csv file names
csv_file = {
    (IOModule.DXT_MPIIO, IOType.READ): "MPI_READ",
    (IOModule.DXT_MPIIO, IOType.WRITE): "MPI_WRITE",
    (IOModule.DXT_POSIX, IOType.READ): "POSIX_READ",
    (IOModule.DXT_POSIX, IOType.WRITE): "POSIX_WRITE"
}

In [59]:
for k, v in agg_data.items():
    csv_out = os.path.join(data_dir, f"{csv_file[k]}.csv")
    pd.DataFrame(v).to_csv(csv_out)