In [1]:
import glob
import os
import pandas as pd

In [2]:
scorep_result_dir = './scorep-results'

# get all files in the directory
scorep_result_dir = './scorep-results'
scorep_result_dir = glob.glob(os.path.join(scorep_result_dir, '*'))

In [3]:
otf2_trace = "./scorep-results/bt.C.4.mpi_io_full" + '/traces.otf2'
print(otf2_trace)

./scorep-results/bt.C.4.mpi_io_full/traces.otf2


In [4]:
import otf2.reader
from otf2.events import Enter
def extract_unique_functions(trace_file_path: str) -> set:
    """
    Extract unique function names from an OTF2 trace file.
    
    Args:
        trace_file_path (str): Path to the OTF2 trace file
            Example: './path/to/traces.otf2'
    
    Returns:
        set[str]: Set of unique function names found in the trace
            Example: {'MPI_Init', 'MPI_Finalize', 'MPI_Barrier'}
    
    Raises:
        ValueError: If file path doesn't end with .otf2
        FileNotFoundError: If trace file doesn't exist
    """
    # Validate input file
    if not trace_file_path.endswith('.otf2'):
        raise ValueError("Input file must be an .otf2 trace file")
    
    if not os.path.exists(trace_file_path):
        raise FileNotFoundError(f"Trace file not found: {trace_file_path}")

    unique_functions = set()
    
    try:
        with otf2.reader.open(trace_file_path) as trace:
            for _, event in trace.events:
                if isinstance(event, Enter):
                    unique_functions.add(event.region.name)
    except Exception as e:
        raise RuntimeError(f"Error reading trace file: {str(e)}")
            
    return unique_functions

In [5]:
extract_unique_functions(otf2_trace)

{'MPI_Allreduce',
 'MPI_Barrier',
 'MPI_Bcast',
 'MPI_Comm_dup',
 'MPI_Comm_rank',
 'MPI_Comm_size',
 'MPI_File_close',
 'MPI_File_delete',
 'MPI_File_open',
 'MPI_File_read_at_all',
 'MPI_File_set_view',
 'MPI_File_write_at_all',
 'MPI_Finalize',
 'MPI_Init',
 'MPI_Irecv',
 'MPI_Isend',
 'MPI_Reduce',
 'MPI_Wait',
 'MPI_Waitall'}

In [6]:
import pandas as pd
from collections import defaultdict
from otf2.events import Enter, Leave

def calculate_accumulated_function_time(otf2_trace: str) -> pd.DataFrame:
    """
    Calculate timing metrics from an OTF2 trace file.
    
    Returns DataFrame with columns:
    - Function: Name of the function
    - Total Time (s): Total time spent in function
    - Call Count: Number of invocations
    - Average Time (s): Mean time per call
    
    Usage:
    >>> otf2_trace = "./scorep-results/bt.C.4.mpi_io_full/traces.otf2"
    >>> df = calculate_accumulated_function_time(otf2_trace)
    >>> print(df)
    
    Results:
    >>> df.head()
          Function  Total Time (s)  Call Count  Average Time (s)
    0  MPI_Init          0.123456          10          0.012346
    1  MPI_Finalize      0.234567          20          0.011728
    2  MPI_Barrier       0.345678          30          0.011523
    3  MPI_Bcast         0.456789          40          0.011420
    4  MPI_Reduce        0.567890          50          0.011358
    """
    # Input validation
    if not otf2_trace.endswith('.otf2'):
        raise ValueError("Input file must be an .otf2 trace file")
    
    # Initialize tracking dictionaries
    metrics = {
        'times': defaultdict(float),  # Track total time spent in each function
        'counts': defaultdict(int)    # Track number of calls to each function
    }
    call_stacks = defaultdict(list)
    
    # Process trace events
    with otf2.reader.open(otf2_trace) as trace:
        resolution = trace.timer_resolution
        
        for location, event in trace.events:
            if isinstance(event, Enter):
                call_stacks[location].append((event.region, event.time))
                metrics['counts'][event.region] += 1
                
            elif isinstance(event, Leave):
                if not call_stacks[location]:
                    raise RuntimeError(f"Unmatched Leave event in {location.name}")
                    
                region, start_time = call_stacks[location].pop()
                if region != event.region:
                    raise RuntimeError(f"Mismatched Enter/Leave in {location.name}")
                
                duration = (event.time - start_time) / resolution
                metrics['times'][region] += duration
    

    function_names = [fn.name for fn in metrics['times']]
    total_times = list(metrics['times'].values())
    call_counts = [metrics['counts'][fn] for fn in metrics['times']]
    avg_times = [metrics['times'][fn] / metrics['counts'][fn] for fn in metrics['times']]          
    return pd.DataFrame({
        'Function': function_names,
        'Total Time (s)': total_times,
        'Call Count': call_counts,
        'Average Time (s)': avg_times
    })


In [7]:
calculate_accumulated_function_time(otf2_trace)

Unnamed: 0,Function,Total Time (s),Call Count,Average Time (s)
0,MPI_Init,3.571143,4,0.8927858
1,MPI_Comm_size,7e-06,4,1.679048e-06
2,MPI_Comm_rank,2e-06,4,4.985714e-07
3,MPI_Comm_dup,0.002375,8,0.0002969209
4,MPI_Bcast,0.003023,32,9.445419e-05
5,MPI_File_delete,0.035998,1,0.03599807
6,MPI_Barrier,0.112276,12,0.009356348
7,MPI_File_open,0.233338,8,0.02916728
8,MPI_File_set_view,0.002926,8,0.0003657445
9,MPI_Irecv,0.013122,9672,1.356701e-06


In [8]:
def beautify_df(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """
    Beautify the DataFrame by sorting and aggregating smaller values.
    
    Parameters:
    It then aggregates the rows beyond the top_n into a single row labeled 'others'.
    The resulting DataFrame is reset to have a continuous index.
    
    Parameters:
    dataframe (pd.DataFrame): The input DataFrame with at least a 'Total Time (s)' column.
    top_n (int): The number of top rows to retain before aggregating the rest. Default is 10.
    
    Returns:
    pd.DataFrame: A beautified DataFrame with the top_n rows and an aggregated 'others' row.
    
    Usage:
    >>> df = pd.DataFrame({
    >>>     'Function': ['A', 'B', 'C', 'D', 'E'],
    >>>     'Total Time (s)': [10, 9, 8, 7, 6],
    >>>     'Call Count': [100, 90, 80, 70, 60],
    >>>     'Average Time (s)': [0.1, 0.1, 0.1, 0.1, 0.1]
    >>> })
    >>> beautified_df = beautify_df(df, top_n=3)
    >>> print(beautified_df)
    
    Results:
    >>> beautified_df
      Function  Total Time (s)  Call Count  Average Time (s)
    0        A           10.00         100              0.10
    1        B            9.00          90              0.10
    2        C            8.00          80              0.10
    3   others           22.00         270              0.10
    """
    # Sort the DataFrame by 'Total Time (s)' in descending order
    sorted_df = df.sort_values(['Total Time (s)'], ascending=[False])
    
    # Aggregate rows beyond the top_n into a single row labeled 'others'
    remaining_df = sorted_df[top_n:].drop(columns='Function').agg('sum').to_frame().T
    remaining_df.insert(0, 'Function', 'others')
    
    # Concatenate top rows with the aggregated 'others' row
    result_df = pd.concat([sorted_df.head(top_n), remaining_df]).reset_index(drop=True)

    return result_df.round(4)

In [9]:
beautify_df(calculate_accumulated_function_time(otf2_trace), top_n=5)

Unnamed: 0,Function,Total Time (s),Call Count,Average Time (s)
0,MPI_File_write_at_all,20.8652,160.0,0.1304
1,MPI_File_read_at_all,12.7506,160.0,0.0797
2,MPI_Wait,10.1933,9648.0,0.0011
3,MPI_Init,3.5711,4.0,0.8928
4,MPI_Isend,0.6676,9672.0,0.0001
5,others,1.1138,10741.0,0.0769
