In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import numpy as np
import pyarrow as pa
import nsys_compression


# Directory passed to the utilization loader; the loader's log output shows it
# looking for `output_<compressor>_<file>_<threads>threads.h5` files in here.
input_dir = '../data/compare_gpu/a100/'

# Compressors to compare.
compressors = [
    'snappy',
    'zstd',
    'gdeflate',
    'lz4',
    'cascaded',
    'bitcomp',
    'deflate',
    'ans',
]

# Benchmark inputs, all sharing the "500mb_" prefix.
compression_files = [
    f'500mb_{dataset}'
    for dataset in ('dickens', 'mozilla', 'mr', 'nci', 'samba', 'sao', 'xml')
]
# Alternative "1gb_" input set (goes together with the 1 GiB FILE_SIZE below):
# compression_files = ['1gb_dickens', '1gb_mozilla', '1gb_mr', '1gb_nci', '1gb_samba', '1gb_sao', '1gb_silesia.zip', '1gb_xml', '1gb_lineitem.parquet']

# Approximate thread counts: 8, 32, 128, ..., 8388608 (each step multiplies by 4).
approx_number_of_threads = [8 * 4 ** exponent for exponent in range(11)]

# 500 MiB expressed in bytes (524288000); handed to the utilization loader.
# For the "1gb_" inputs use 1024 ** 3 (1073741824) instead.
FILE_SIZE = 500 * 1024 * 1024


In [2]:
# Build the utilization dataframe for every configured file / compressor /
# thread-count combination.
# NOTE(review): judging by the captured output, the loader logs combinations
# whose .h5 file is missing ("Failed getting utilization ...") instead of
# raising — confirm in nsys_compression.
result_df = nsys_compression.get_utilisation_df(
    input_dir,
    FILE_SIZE,
    compression_files,
    compressors,
    approx_number_of_threads,
)

# Fail early, with a readable message, if the frame lacks a column that the
# plotting cells below pass to plotly.
required_columns = {
    "file",
    "standard",
    "chunk_size",
    "compression_utilization",
    "decompression_utilization",
}
missing_columns = required_columns - set(result_df.columns)
assert not missing_columns, f"result_df is missing expected columns: {sorted(missing_columns)}"

# Preview the first rows through the notebook's rich display rather than
# printing a plain-text dump of the whole frame.
result_df.head()


500mb_dickens  snappy  8  1.0  1.0
500mb_dickens  snappy  32  1.0  1.0
500mb_dickens  snappy  128  1.0  1.0
500mb_dickens  snappy  512  1.0  1.0
500mb_dickens  snappy  2048  1.0  1.0
500mb_dickens  snappy  8192  4.6598186471508445  4.592107155622175
500mb_dickens  snappy  32768  16.446449571398187  15.40027323174332
500mb_dickens  snappy  131072  30.534915522398933  49.53148608637113
Failed getting utilization of snappy 500mb_dickens 524288 threads: [Errno 2] Unable to synchronously open file (unable to open file: name = '../data/compare_gpu/a100/output_snappy_500mb_dickens_524288threads.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
Failed getting utilization of snappy 500mb_dickens 2097152 threads: [Errno 2] Unable to synchronously open file (unable to open file: name = '../data/compare_gpu/a100/output_snappy_500mb_dickens_2097152threads.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
Failed getting utilizatio

In [3]:
# Compression utilization against chunk size (logarithmic x axis):
# one facet row per value of the "standard" column, one coloured line per file.
fig_utilization_standard = px.line(
    data_frame=result_df,
    x="chunk_size",
    y="compression_utilization",
    color="file",
    facet_row="standard",
    markers=True,
    log_x=True,
    width=800,
    height=1000,
    title="GPU utilization per compressor at different chunk sizes",
)
fig_utilization_standard.show()

In [4]:
# Decompression utilization against chunk size (logarithmic x axis):
# one facet row per value of the "standard" column, one coloured line per file.
# Note: this rebinds fig_utilization_standard, replacing the compression figure.
fig_utilization_standard = px.line(
    data_frame=result_df,
    x="chunk_size",
    y="decompression_utilization",
    color="file",
    facet_row="standard",
    markers=True,
    log_x=True,
    width=800,
    height=1000,
    title="GPU utilization per decompressor at different chunk sizes",
)
fig_utilization_standard.show()

In [5]:
# Compression utilization against chunk size (logarithmic x axis):
# one facet row per value of the "file" column, one coloured line per standard.
fig_utilization_file = px.line(
    data_frame=result_df,
    x="chunk_size",
    y="compression_utilization",
    color="standard",
    facet_row="file",
    markers=True,
    log_x=True,
    width=800,
    height=1200,
    title="GPU utilization during compression per file at different chunk sizes",
)
fig_utilization_file.show()

In [6]:
# Decompression utilization against chunk size (logarithmic x axis):
# one facet row per value of the "file" column, one coloured line per standard.
# Note: this rebinds fig_utilization_file, replacing the compression figure.
fig_utilization_file = px.line(
    data_frame=result_df,
    x="chunk_size",
    y="decompression_utilization",
    color="standard",
    facet_row="file",
    markers=True,
    log_x=True,
    width=800,
    height=1200,
    title="GPU utilization during decompression per file  at different chunk sizes",
)
fig_utilization_file.show()