- It seems like only `mpi_io_full` benchmarks with DXT have both POSIX and MPIIO function calls.
- Normal ones like `bt.A.16` don't have this feature.

In [2]:
# specify the directory
directory = './darshan-results'

# list all folders in the directory
folders = [f for f in os.listdir(directory) if os.path.isdir(os.path.join(directory, f))]
print(folders)
mpi_io_full_folders = [folder for folder in folders if 'mpi_io_full' in folder]
print(mpi_io_full_folders)

['ph077533_bt.C.1.mpi_io_full_id215749-215749_1-11-51771-17387797071829052971_1', 'ph077533_bt.C.25_id119766-119766_1-12-39692-16245044248240981643_1', 'ph077533_bt.B.25.mpi_io_full_id45335-45335_1-11-32932-1191024549193276201_1', 'ph077533_bt.B.1.mpi_io_full_id146641-146641_1-11-32941-7126310626660110613_1', 'ph077533_bt.A.25.mpi_io_full_id146213-146213_1-11-32943-18106118645460959180_1', 'ph077533_bt.C.9.mpi_io_full_id172832-172832_1-11-51771-6052134348059707871_1', 'ph077533_bt.C.25.mpi_io_full_id227757-227757_1-11-51772-17062698119136381187_1', 'ph077533_bt.B.1_id54424-54424_1-12-39699-11408100436758256178_1', 'ph077533_bt.C.4_id174069-174069_1-12-39708-11776689627096302925_1', 'ph077533_bt.B.4.mpi_io_full_id224930-224930_1-11-33232-7884968234836009012_1', 'ph077533_bt.A.1_id225269-225269_1-12-39699-194225757733429485_1', 'ph077533_bt.A.4_id264367-264367_1-12-39708-3950305670195019376_1', 'ph077533_bt.C.16_id223695-223695_1-12-39709-10948421854399242731_1', 'ph077533_bt.B.9.mpi_io_

# Parse Exported Vampir Results to Dictionary
- The CSV files are not in a consistent format, so they are parsed into dictionaries rather than a pandas DataFrame.
- The CSV files are exported through Vampir for accumulated time per function.

In [3]:
import csv

def parse_csv_to_dict(file_path):
    # Dictionary to store key-value pairs
    results = {}

    # Open the file and parse it
    with open(file_path, "r") as file:
        reader = csv.reader(file, delimiter=";")
        for row in reader:
            if len(row) == 2:  # Check if the row has exactly two columns
                key, value = row
                # Remove quotes and trim whitespace
                key = key.strip('"').strip()
                value = value.strip('"').strip()
                results[key] = value
    return results

# Example usage
file_path = os.path.join(directory, mpi_io_full_folders[1], 'Function_Summary_traces.csv')
results = parse_csv_to_dict(file_path)
print(results)


{'ClkPeriod/sec': '1000000000', 'StartTime[Clk]': '0', 'StopTime[Clk]': '10994643450', 'Function': 'Accumulated Exclusive Time', '': '[s]', 'mpi_read': '73.1135 s', 'mpi_write': '35.7373 s', 'posix_write': '0.88392 s', 'posix_read': '0.443711 s'}


In [4]:
import re

accumulated_data = []
for folder_name in mpi_io_full_folders:
    # shorten the directory name for better readability
    pattern = r'bt\.[A-C]\.\d{1,2}(\.mpi_io_full)?'
    shortened_name = re.search(pattern, folder_name).group(0)
    file_path = os.path.join(directory, folder_name, 'Function_Summary_traces.csv')
    paresed_data = parse_csv_to_dict(file_path)
    paresed_data['shortened_name'] = shortened_name
    accumulated_data.append(paresed_data)

In [17]:
io_data_list = list(
    map(
        lambda x: {key: x[key] for key in ['shortened_name', 'mpi_read', 'mpi_write', 'posix_write', 'posix_read']}, 
        accumulated_data
    )
)

In [18]:
# Extract numerical parts from shortened_name and sort based on them
df = pd.DataFrame(io_data_list)
df['sort_key'] = df['shortened_name'].apply(lambda x: [int(y) if y.isdigit() else y for y in re.split(r'(\d+)', x)])
df = df.sort_values(by='sort_key').drop(columns='sort_key')


In [19]:
df

Unnamed: 0,shortened_name,mpi_read,mpi_write,posix_write,posix_read
13,bt.A.1.mpi_io_full,0.0492167 s,0.163944 s,0.177525 s,0.116641 s
9,bt.A.4.mpi_io_full,0.951383 s,1.5034 s,0.189187 s,0.188815 s
14,bt.A.9.mpi_io_full,1.04537 s,3.03988 s,0.189409 s,0.0531504 s
10,bt.A.16.mpi_io_full,6.93704 s,5.65494 s,0.183206 s,0.379247 s
3,bt.A.25.mpi_io_full,5.24276 s,11.0625 s,0.254929 s,0.152406 s
2,bt.B.1.mpi_io_full,0.264503 s,0.60103 s,0.919374 s,0.122316 s
6,bt.B.4.mpi_io_full,3.71287 s,5.9661 s,1.03425 s,0.532271 s
7,bt.B.9.mpi_io_full,8.34542 s,11.8921 s,0.89259 s,0.596882 s
11,bt.B.16.mpi_io_full,12.2189 s,21.9259 s,0.895495 s,0.458824 s
1,bt.B.25.mpi_io_full,73.1135 s,35.7373 s,0.88392 s,0.443711 s


In [20]:
df.to_csv('darshan_results.csv', index=False)