In [1]:
import re
import os
import glob
import sys
import pandas as pd
from collections import defaultdict

In [2]:
recorder_path = "../../recorder-claix-2023/install"
if not os.path.exists(recorder_path):
    raise RuntimeError(f"{recorder_path} does not exist")

os.environ['RECORDER_INSTALL_PATH'] = recorder_path

In [3]:
# Hotfix for importing source code
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
data_dir = os.path.join(project_root, "data/recorder")
sys.path.append(os.path.join(project_root, "src"))

In [4]:
from recorder_trace_parser.custom_types import IOMod, IOPradigm
from recorder_trace_parser.TraceParser import TraceParser

In [5]:
trace_dir = '../../NPB-mpi-recorder'
traces = glob.glob(os.path.join(trace_dir, '*'))

In [6]:
traces

['../../NPB-mpi-recorder/bt.B.16.mpi_io_full',
 '../../NPB-mpi-recorder/bt.B.9.mpi_io_full',
 '../../NPB-mpi-recorder/bt.B.1.mpi_io_full',
 '../../NPB-mpi-recorder/bt.B.4.mpi_io_full',
 '../../NPB-mpi-recorder/bt.C.9.mpi_io_full',
 '../../NPB-mpi-recorder/bt.C.1.mpi_io_full',
 '../../NPB-mpi-recorder/bt.C.4.mpi_io_full',
 '../../NPB-mpi-recorder/bt.A.4.mpi_io_full',
 '../../NPB-mpi-recorder/bt.C.25.mpi_io_full',
 '../../NPB-mpi-recorder/bt.A.16.mpi_io_full',
 '../../NPB-mpi-recorder/bt.A.9.mpi_io_full',
 '../../NPB-mpi-recorder/bt.A.25.mpi_io_full',
 '../../NPB-mpi-recorder/bt.B.25.mpi_io_full',
 '../../NPB-mpi-recorder/bt.A.1.mpi_io_full',
 '../../NPB-mpi-recorder/bt.C.16.mpi_io_full']

In [7]:
pradigm_mod_combs = [
    (IOPradigm.POSIX, IOMod.READ),
    (IOPradigm.POSIX, IOMod.WRITE),
    (IOPradigm.MPIIO, IOMod.READ),
    (IOPradigm.MPIIO, IOMod.WRITE),
]

In [8]:
agg_data = defaultdict(dict)
for trace in traces:
    parsed_trace = TraceParser(trace)
    elapsed_time = parsed_trace.aggregate_op_stat(lambda op: op.duration)
    
    # Use name part bt.X.Y of trace as key
    k = re.search(r"(bt\.[A-Z]\.\d{1,2})(?:\.mpi_io_full)?", trace).group(1)

    for mod_typ in pradigm_mod_combs:
        # Swap the tuple order to match elapsed_time's keys
        elapsed_key = (mod_typ[1], mod_typ[0])
        if elapsed_key in elapsed_time:
            agg_data[mod_typ].update({
                f"{k}_time": pd.Series(elapsed_time[elapsed_key]),
            })

Rank: 0, intercepted calls: 17983, accessed files: 5
Rank: 1, intercepted calls: 17257, accessed files: 2
Rank: 2, intercepted calls: 17257, accessed files: 2
Rank: 3, intercepted calls: 17257, accessed files: 2
Rank: 4, intercepted calls: 17257, accessed files: 2
Rank: 5, intercepted calls: 17257, accessed files: 2
Rank: 6, intercepted calls: 17254, accessed files: 1
Rank: 7, intercepted calls: 17257, accessed files: 2
Rank: 8, intercepted calls: 17257, accessed files: 2
Rank: 9, intercepted calls: 17257, accessed files: 2
Rank: 10, intercepted calls: 17257, accessed files: 2
Rank: 11, intercepted calls: 17257, accessed files: 2
Rank: 12, intercepted calls: 17257, accessed files: 2
Rank: 13, intercepted calls: 17257, accessed files: 2
Rank: 14, intercepted calls: 17257, accessed files: 2
Rank: 15, intercepted calls: 17257, accessed files: 2
Rank: 0, intercepted calls: 13159, accessed files: 5
Rank: 1, intercepted calls: 12433, accessed files: 2
Rank: 2, intercepted calls: 12433, acces

In [9]:
# map to csv file names
csv_file = {
    (IOPradigm.MPIIO, IOMod.READ): "MPI_READ",
    (IOPradigm.MPIIO, IOMod.WRITE): "MPI_WRITE",
    (IOPradigm.POSIX, IOMod.READ): "POSIX_READ",
    (IOPradigm.POSIX, IOMod.WRITE): "POSIX_WRITE"
}

In [11]:
for k, v in agg_data.items():
    csv_out = os.path.join(data_dir, f"{csv_file[k]}.csv")
    pd.DataFrame(v).to_csv(csv_out)