In [111]:
import pandas as pd
# import modin.pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from plotly.graph_objs.layout import YAxis,XAxis
import pathlib
import numpy as np
import pyarrow as pa
import h5py
import sqlite3
import glob
import os
import re
from tqdm import tqdm
tqdm.pandas()

# Table names used to look up activity records in the profile export.
CUPTI_ACTIVITY_KIND_MEMCPY = "CUPTI_ACTIVITY_KIND_MEMCPY"
CUPTI_ACTIVITY_KIND_MEMSET = "CUPTI_ACTIVITY_KIND_MEMSET"
CUPTI_ACTIVITY_KIND_KERNEL = "CUPTI_ACTIVITY_KIND_KERNEL"
# CUPTI_ACTIVITY_KIND_KERNEL (
#      start                       INTEGER   NOT NULL,                    -- Event start timestamp (ns).
#      end                         INTEGER   NOT NULL,                    -- Event end timestamp (ns).
#      deviceId                    INTEGER   NOT NULL,                    -- Device ID.
#      contextId                   INTEGER   NOT NULL,                    -- Context ID.
#      streamId                    INTEGER   NOT NULL,                    -- Stream ID.
#      correlationId               INTEGER,                               -- REFERENCES CUPTI_ACTIVITY_KIND_RUNTIME(correlationId)
#      globalPid                   INTEGER,                               -- Serialized GlobalId.
#      demangledName               INTEGER   NOT NULL,                    -- REFERENCES StringIds(id) -- Kernel function name w/ templates
#      shortName                   INTEGER   NOT NULL,                    -- REFERENCES StringIds(id) -- Base kernel function name
#      mangledName                 INTEGER,                               -- REFERENCES StringIds(id) -- Raw C++ mangled kernel function name
#      launchType                  INTEGER,                               -- REFERENCES ENUM_CUDA_KERNEL_LAUNCH_TYPE(id)
#      cacheConfig                 INTEGER,                               -- REFERENCES ENUM_CUDA_FUNC_CACHE_CONFIG(id)
#      registersPerThread          INTEGER   NOT NULL,                    -- Number of registers required for each thread executing the kernel.
#      gridX                       INTEGER   NOT NULL,                    -- X-dimension grid size.
#      gridY                       INTEGER   NOT NULL,                    -- Y-dimension grid size.
#      gridZ                       INTEGER   NOT NULL,                    -- Z-dimension grid size.
#      blockX                      INTEGER   NOT NULL,                    -- X-dimension block size.
#      blockY                      INTEGER   NOT NULL,                    -- Y-dimension block size.
#      blockZ                      INTEGER   NOT NULL,                    -- Z-dimension block size.
#      staticSharedMemory          INTEGER   NOT NULL,                    -- Static shared memory allocated for the kernel (B).
#      dynamicSharedMemory         INTEGER   NOT NULL,                    -- Dynamic shared memory reserved for the kernel (B).
#      localMemoryPerThread        INTEGER   NOT NULL,                    -- Amount of local memory reserved for each thread (B).
#      localMemoryTotal            INTEGER   NOT NULL,                    -- Total amount of local memory reserved for the kernel (B).
#      gridId                      INTEGER   NOT NULL,                    -- Unique grid ID of the kernel assigned at runtime.
#      sharedMemoryExecuted        INTEGER,                               -- Shared memory size set by the driver.
#      graphNodeId                 INTEGER,                               -- REFERENCES CUDA_GRAPH_NODE_EVENTS(graphNodeId)
#      sharedMemoryLimitConfig     INTEGER                                -- REFERENCES ENUM_CUDA_SHARED_MEM_LIMIT_CONFIG(id)
#  );
# Further table names of the profile export.
CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = "CUPTI_ACTIVITY_KIND_SYNCHRONIZATION"
CUPTI_ACTIVITY_KIND_CUDA_EVENT = "CUPTI_ACTIVITY_KIND_CUDA_EVENT"
CUPTI_ACTIVITY_KIND_GRAPH_TRACE = "CUPTI_ACTIVITY_KIND_GRAPH_TRACE"
CUPTI_ACTIVITY_KIND_RUNTIME = "CUPTI_ACTIVITY_KIND_RUNTIME"
TARGET_INFO_GPU_METRICS = "TARGET_INFO_GPU_METRICS"
GPU_METRICS = "GPU_METRICS"
# GPU_METRICS (
#      -- GPU Metrics, events and values.
#      timestamp                   INTEGER,                               -- Event timestamp (ns).
#      typeId                      INTEGER   NOT NULL,                    -- REFERENCES TARGET_INFO_GPU_METRICS(typeId) and GENERIC_EVENT_TYPES(typeId)
#      metricId                    INTEGER   NOT NULL,                    -- REFERENCES TARGET_INFO_GPU_METRICS(metricId)
#      value                       INTEGER   NOT NULL                     -- Counter data value
#  );

# Table that maps integer string ids (e.g. shortName) to their text.
STRING_IDS = "StringIds"
# StringIds (
#      -- Consolidation of repetitive string values.
#      id                          INTEGER   NOT NULL   PRIMARY KEY,      -- ID reference value.
#      value                       TEXT      NOT NULL                     -- String value.
#  );


# Values compared against the `metricId` column of GPU_METRICS.
# NOTE(review): the id-to-metric mapping presumably comes from
# TARGET_INFO_GPU_METRICS of the profiled device — verify per export.
METRIC_ID_PCIE_TX = 0
METRIC_ID_PCIE_RX = 1
METRIC_GPC_CLOCK_FREQUENCY = 9
METRIC_SYS_CLOCK_FREQUENCY = 10
METRIC_GR_ACTIVE = 11
METRIC_SM_ACTIVE = 12
METRIC_SM_ISSUE = 13
METRIC_COMPUTE_WARPS = 17
METRIC_UNALLOCATED_WARPS = 18
METRIC_DRAM_READ = 19
METRIC_DRAM_WRITE = 20


# Kernel names that mark a group of kernels as a compression run.
COMPRESSION_KERNELS = [
    'snap_kernel',
    'lz_compression_kernel',
    'lz4CompressBatchKernel',
    'compress_kernel',
]
# Kernel names that mark a group of kernels as a decompression run.
DECOMPRESSION_KERNELS = [
    'unsnap_kernel',
    'decompression_kernel',
    'lz4DecompressBatchKernel',
    'decompress_kernel',
]

# Labels assigned to kernel groups.
CATEGORY_COMPRESSION = 'compress'
CATEGORY_DECOMPRESSION = 'decompress'
CATEGORY_OTHER = 'other'

In [112]:
# Make df from sqlite
# NOTE(review): only the path is defined here; the sqlite loading below is
# commented out, and no cell shown in this notebook reads input_file_sqlite.
input_file_sqlite = "../data/gohan/lzbench/profile/nvcomp_lz4_nsys_results_2024-03-19-121204.sqlite"

# conn = sqlite3.connect(input_file_sqlite)

# gpu_metric_df = pd.read_sql("SELECT * FROM " + GPU_METRICS, conn)


In [113]:
# Path to an alternative export of a profile (presumably Arrow format, going by
# the extension). NOTE(review): no cell shown in this notebook reads it.
input_file_arrow = "../data/gohan/lzbench/profile/nvcomp_cascaded_nsys_results_2024-03-19-121204.arrows"


In [114]:
# Make df from hdf/h5
# input_file_hdf = "../../../data/decompression/nvcomp_results_2024-03-21-123552/output_zstd_8gb_dickens_131072threads.h5"
input_file_hdf = "../../../data/decompression/nvcomp_results_2024-03-21-123552/output_snappy_8gb_mozilla_8threads.h5"
# input_file_hdf = "../data/gohan/lzbench/profile/nvcomp_cascaded_nsys_results_2024-03-19-121204.h5"

# Open the file once, read-only, and close it as soon as the tables are loaded.
# np.array(...) copies each dataset into memory, so the frames stay valid after
# the file handle is closed.
with h5py.File(input_file_hdf, "r") as hdf_file:
    gpu_metric_df = pd.DataFrame(np.array(hdf_file[GPU_METRICS]))
    kernel_event_df = pd.DataFrame(np.array(hdf_file[CUPTI_ACTIVITY_KIND_KERNEL]))
    strings_df = pd.DataFrame(np.array(hdf_file[STRING_IDS]))

# Later cells rely on chronological order (timestamp diffs, gaps between kernels).
gpu_metric_df = gpu_metric_df.sort_values(by=['timestamp'])
kernel_event_df = kernel_event_df.sort_values(by=['start'])


In [115]:
def human_format(num):
    """Format a byte count with binary (1024-based) unit prefixes.

    Args:
        num: Number of bytes (int or float, may be negative).

    Returns:
        A string such as '512B', '1.5kB' or '8GB'. Values beyond the largest
        known prefix stay in that prefix (e.g. 1024**5 -> '1024TB').
    """
    units = ['', 'k', 'M', 'G', 'T']
    magnitude = 0
    # Stop at the last unit so huge or infinite values cannot run past the
    # unit table (or loop forever).
    while abs(num) >= 1024 and magnitude < len(units) - 1:
        magnitude += 1
        num /= 1024
    text = '{}'.format(num)
    # Only trim zeros after a decimal point; trimming an integer such as
    # '100' would corrupt the value. Exponent notation is left untouched.
    if '.' in text and 'e' not in text:
        text = text.rstrip('0').rstrip('.')
    return '{}{}B'.format(text, units[magnitude])

filesize = (8 * 1024 * 1024 * 1024)
results_df = pd.read_csv("../../../data/decompression/nvcomp_results_2024-03-21-123552/results.csv")
results_df['chunksize_Bytes'] = filesize / results_df['chunks']
results_df['chunksize_Bytes_formatted'] = results_df['chunksize_Bytes'].apply(human_format)


def _format_count(count):
    """Format a plain count with human_format's scaling but without its byte unit."""
    # human_format always ends its result with the unit 'B'; a chunk count is
    # not a byte size, so drop that last character.
    return human_format(count)[:-1]


results_df['chunks_formatted'] = results_df['chunks'].apply(_format_count)
# human_format already appends 'B' to the chunk size; do not add a second one.
results_df['chunks_combined_formatted'] = results_df['chunks'].apply(
    lambda x: f"{_format_count(x)} chunks of {human_format(filesize / x)}"
)

In [116]:
def get_string(id, strings=None) -> str:
    """Look up the text for a string id.

    Args:
        id: Value to match against the 'id' column.
        strings: Optional DataFrame with 'id' and 'value' columns. Defaults to
            the notebook-level ``strings_df``.

    Returns:
        The 'value' of the first matching row as ``str``. ``bytes`` values are
        decoded as UTF-8; other values are converted with ``str``.

    Raises:
        IndexError: If no row has the requested id.
    """
    table = strings_df if strings is None else strings
    matches = table.loc[table['id'] == id, 'value'].values
    if len(matches) == 0:
        raise IndexError(f"no string found for id {id!r}")
    value = matches[0]
    # Values may be bytes or already-decoded text depending on how the table
    # was loaded; str(text, encoding=...) would raise TypeError on the latter.
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return str(value)

def get_utilizations_of_span(begin, end):
    """Return the METRIC_COMPUTE_WARPS rows of gpu_metric_df with begin <= timestamp <= end."""
    # Keep only the compute-warps metric ...
    is_compute_metric = gpu_metric_df['metricId'] == METRIC_COMPUTE_WARPS
    compute_samples = gpu_metric_df.loc[is_compute_metric]
    # ... and of those, the samples inside the closed interval [begin, end].
    stamps = compute_samples['timestamp']
    inside_span = (stamps >= begin) & (stamps <= end)
    return compute_samples[inside_span]

def get_avg_utilization_of_span(begin, end) -> float:
    """Time-weighted average of the utilization samples inside [begin, end].

    Each sample's value is weighted by the time elapsed since the previous
    sample in the span; the first sample has no predecessor and is not counted.

    Args:
        begin: Start timestamp of the span (inclusive).
        end: End timestamp of the span (inclusive).

    Returns:
        The weighted average, the single sample's value when exactly one
        sample falls in the span, or -1 when there is no sample or the
        samples span zero time.
    """
    samples = get_utilizations_of_span(begin, end)
    if len(samples) < 1:
        # No utilization data in the requested range.
        return -1
    if len(samples) == 1:
        return samples['value'].iloc[0]
    # Time elapsed since the previous sample. Kept as a local Series rather
    # than a new column, so the filtered frame is never written to (this is
    # what triggered pandas' SettingWithCopyWarning).
    intervals = samples['timestamp'].diff()
    # Drop rows without a usable interval or value (the first row always has NaN).
    usable = intervals.notna() & samples['value'].notna()
    intervals = intervals[usable]
    values = samples.loc[usable, 'value']
    # Total time covered by the remaining samples.
    covered_time = intervals.sum()
    if covered_time == 0:
        # All samples share one timestamp; no weighting is possible.
        return -1
    # Weighted average of each sample value and its interval.
    return (intervals * values).sum() / covered_time

# print(str(get_utilizations_of_span(2097867769.0, 2097917433.0)))
# print(str(get_avg_utilization_of_span(2098191128.0, 2105510856.0)))

# Average utilization over each kernel's [start, end] span (-1 when unavailable).
kernel_event_df['utilization'] = kernel_event_df.progress_apply(
    lambda kernel: get_avg_utilization_of_span(kernel['start'], kernel['end']),
    axis=1,
)
# Resolve the shortName string id to the kernel's name.
kernel_event_df['name'] = kernel_event_df['shortName'].progress_apply(get_string)
file_size = 8 * 1024 ** 3  # 8589934592
# presumably one gridX block per chunk of the input file — TODO confirm
kernel_event_df['chunk_size'] = (file_size / kernel_event_df['gridX']).astype(int)
kernel_event_df['duration'] = kernel_event_df['end'] - kernel_event_df['start']

100%|██████████| 10/10 [00:01<00:00,  6.72it/s]
100%|██████████| 10/10 [00:00<00:00, 8598.41it/s]


In [117]:
compute_metrics = gpu_metric_df.loc[gpu_metric_df['metricId'] == METRIC_COMPUTE_WARPS]

# Plot a 1% sample of the points. The seed is fixed so that re-running the
# notebook reproduces the same figure.
fig1 = px.scatter(
    compute_metrics.sample(frac=0.01, random_state=0),
    x="timestamp",
    y="value",
)

fig1.update_layout(title='Utilization over time',
                  xaxis_title='Timestamp (ns)',
                  yaxis_title='Compute in flight')

fig1.update_traces(marker=dict(size=2))
fig1.show()


In [118]:
# Plot only kernels that have utilization data
# (-1 is the "no data" marker returned by get_avg_utilization_of_span).
kernel_events = kernel_event_df
kernel_events = kernel_events[kernel_events['utilization'] != -1]

# Use the filtered frame: plotting kernel_event_df directly would draw the -1
# markers as if they were measured utilization.
fig2 = px.scatter(kernel_events, y="utilization", x="gridX", color="name", size="duration", log_x=True, size_max=40)
fig2.update_layout(title='Utilization per grid size',
                  xaxis_title='gridX',
                  yaxis_title='Average utilization',
                  scattermode="group")
fig2.update_traces(marker_sizemin=4, selector=dict(type='scatter'))
fig2.show()



In [119]:
# gridX = filesize / chunk size
# chunk size = filesize / gridX

# Leave out kernels without utilization data (-1 marker) so they are not drawn
# as measured values.
fig = px.scatter(
    kernel_event_df[kernel_event_df['utilization'] != -1],
    y="utilization",
    x="chunk_size",
    color="name",
    size="duration",
    log_x=True,
    size_max=40,
)
fig.update_traces(marker_sizemin=4, selector=dict(type='scatter'))
fig.update_layout(title='Utilization per chunk size',
                  xaxis_title='Chunk size (file size / gridX)',
                  yaxis_title='Average utilization',
                  scattermode="group")
fig.show()


In [122]:
# Collect all kernels that work together to do 1 (de)compression for a file at a chunk size
# Gap threshold in nanoseconds: a kernel that starts more than this long after
# the kernel in the preceding row ended opens a new group.
time_splitter_ns = 5_000_000

def weighted_mean(df):
    """Duration-weighted mean of the 'utilization' column.

    Rows whose utilization is -1 (the notebook's "no data" marker) are left
    out, so they no longer pull the average towards -1.

    Args:
        df: DataFrame with 'utilization' and 'duration' columns.

    Returns:
        The weighted mean, or -1 when no row has utilization data or the
        remaining durations sum to zero.
    """
    measured = df[df['utilization'] != -1]
    total_duration = measured['duration'].sum()
    if len(measured) == 0 or total_duration == 0:
        return -1
    return (measured['utilization'] * measured['duration']).sum() / total_duration

# Gap between each kernel's start and the end of the kernel in the preceding row
# (NaN for the first row, which therefore never opens a new group).
kernel_event_df['time_since_previous'] = (
    kernel_event_df['start'] - kernel_event_df['end'].shift(1)
)
# Every gap larger than the threshold bumps the running group number.
kernel_event_df['group'] = (
    kernel_event_df['time_since_previous'] > time_splitter_ns
).cumsum()


def categorize_group(group):
    """Label a group of kernel rows by the kernel names it contains.

    Compression is checked first, so a group containing both compression and
    decompression kernels is labelled as compression.

    Args:
        group: DataFrame with a 'name' column.

    Returns:
        CATEGORY_COMPRESSION, CATEGORY_DECOMPRESSION or CATEGORY_OTHER.
    """
    kernel_names = group['name'].values
    checks = (
        (CATEGORY_COMPRESSION, COMPRESSION_KERNELS),
        (CATEGORY_DECOMPRESSION, DECOMPRESSION_KERNELS),
    )
    for category, known_kernels in checks:
        if any(kernel_name in known_kernels for kernel_name in kernel_names):
            return category
    return CATEGORY_OTHER


# Aggregate the kernels of each group.
groups_durations = kernel_event_df.groupby('group')['duration'].sum().rename('duration').reset_index()
# Select the needed columns before apply() so the helpers never operate on the
# grouping column itself (pandas >= 2.2 warns about that).
groups_categories = kernel_event_df.groupby('group')[['name']].apply(categorize_group).rename('category')
groups_utilizations = kernel_event_df.groupby('group')[['utilization', 'duration']].apply(weighted_mean).rename('utilization')
groups_num_kernels = kernel_event_df.groupby('group').size().rename('num_kernels')

# groups_durations carries a 0..n-1 row index while the other pieces are indexed
# by group number; group numbers are a gap-free 0..n-1 count, so the index
# alignment done by concat lines the rows up.
group_data = pd.concat([groups_categories, groups_durations, groups_utilizations, groups_num_kernels], axis=1)
print(group_data)


# print(grouped_data)
# One bar per group, coloured and labelled by category. Passing
# x=['category', 'group'] would be treated as wide-form data with columns of
# different types (text and integer).
fig4 = px.bar(group_data, x='group', y='utilization', color='category',
              title='Weighted Average Values per Group', text='category')
fig4.show()



     category  group      duration  utilization  num_kernels
0       other      0      15262243    40.677781            1
1       other      1      14508485    40.815689            1
2    compress      2  106890760416     1.000000            1
3       other      3      12381903    40.950000            1
4  decompress      4   36417336311     1.000000            1
5       other      5      15273835    40.760334            1
6    compress      6  106950778942     1.005399            2
7       other      7      12383025    40.975010            1
8  decompress      8   36412059269     1.000000            1








In [121]:
# TODO: Split into compression utilization and decompression utilization
# TODO: Combine utilization data with throughput data

# Mean duration and mean utilization of the groups in each category,
# ignoring the groups labelled as "other".
group_data_compress = (
    group_data.loc[group_data['category'] != CATEGORY_OTHER]
    .groupby('category')[['duration', 'utilization']]
    .agg('mean')
)

print(group_data_compress)

                duration  utilization
category                             
compress    1.069208e+11       1.0027
decompress  3.641470e+10       1.0000
