In [6]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import glob
import re


# --- Input selection -------------------------------------------------------
# multi_file=True : load every CSV matched by the glob below.
# multi_file=False: load the single CSV named by `input_file`.
multi_file = True

# All three names are defined regardless of the mode so that flipping
# `multi_file` never leaves a later cell referencing an undefined name.
input_file = "../data/gohan/lzbench/cuda_all.csv"
illegal_file = ".*_all.csv"  # regex for files that must not be loaded in multi-file mode

if multi_file:
    # Alternative data set: "../data/cuda01/silesia/*.csv"
    # Sorted so the load order does not depend on filesystem listing order.
    interesting_files = sorted(glob.glob("../data/gohan/lzbench/profile/*.csv"))
else:
    # Single-file mode does not use the list; keep it empty but defined.
    interesting_files = []


# Look up the colormap registered under `name` in matplotlib's colormap
# registry and keep its `colors` attribute for later use.
# NOTE(review): the plotly cells visible in this notebook do not reference
# `cmap` or `colors` - confirm whether another cell still needs them.
name = "tab20"
cmap = mpl.colormaps[name]
colors = cmap.colors

# Exclude every path that matches the `illegal_file` regex from the load list.
# Only multi-file mode builds a file list, so there is nothing to filter otherwise.
if multi_file:
    # Calling list.remove() while iterating over the same list skips the
    # element that follows each removed one. Build the filtered list first and
    # assign it through a slice so the existing list object is updated in place.
    interesting_files[:] = [
        candidate
        for candidate in interesting_files
        if not re.match(illegal_file, candidate)
    ]
 
def fix_chunk_value(chunk_data):
    """Turn a chunk-size parameter into a size computed as a power of two.

    The sign of the parameter is ignored; the result is
    ``2 ** int(abs(chunk_data) + 6)`` (e.g. 5 -> 2048).
    """
    shift = int(abs(chunk_data) + 6)
    return 1 << shift

def fix_compression_ratio(data):
    """Map a compression percentage onto a compression ratio.

    Computes ``100 / data``; works for scalars and, element-wise, for
    pandas Series (e.g. 50 -> 2.0).
    """
    full_percentage = 100
    return full_percentage / data

def get_chunk_size_from_name(name):
    """Return the last run of digits in `name` as an int.

    Example: "nvcomp_zstd 3.0.5 -10" -> 10 (the leading '-' is not part of
    the match, so the result is never negative).
    """
    digit_runs = re.findall(r'\d+', name)
    last_run = digit_runs[-1]
    return int(last_run)

def make_groupable_name(name):
    """Reduce a full compressor label to a name shared by all its variants.

    Keeps only the first whitespace-separated token and replaces its
    underscores with spaces, e.g. "nvcomp_zstd 3.0.5 -10" -> "nvcomp zstd".
    """
    tokens = name.split()
    compressor = tokens[0]
    return compressor.replace('_', ' ')

def fix_chunk_sizes(df: pd.DataFrame):
    """Make sure `df` carries a converted 'Chunk size' column for plotting.

    If the column is absent it is first derived from 'Compressor name' with
    get_chunk_size_from_name. Every value is then passed through
    fix_chunk_value. The frame is modified in place and also returned.
    """
    chunk_col = 'Chunk size'
    if chunk_col not in df.columns:
        df[chunk_col] = df['Compressor name'].apply(get_chunk_size_from_name)
    df[chunk_col] = df[chunk_col].apply(fix_chunk_value)
    return df

In [7]:
# Read files

if multi_file:
    # Read every CSV first and concatenate once: growing a DataFrame with
    # pd.concat inside the loop re-copies all previously loaded rows each time.
    frames = [pd.read_csv(in_file) for in_file in interesting_files]
    if not frames:
        # Fail with a clear message instead of a KeyError on a missing column.
        raise FileNotFoundError("No benchmark CSV files found to load")
    all_df = pd.concat(frames)
else:
    all_df = pd.read_csv(input_file)


# Drop rows whose compressor name contains 'emcpy'
# (presumably the memcpy baseline - confirm against the CSV contents).
mask = all_df['Compressor name'].str.contains('emcpy')
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of the unfiltered data (avoids SettingWithCopyWarning).
all_df = all_df[~mask].copy()
all_df = fix_chunk_sizes(all_df)

# Derived columns used by the plots: a per-compressor grouping label and the
# compression ratio computed from the 'Ratio' percentage column.
all_df['Compressor parsed name'] = all_df['Compressor name'].apply(make_groupable_name)
all_df['ratio parsed'] = fix_compression_ratio(all_df['Ratio'])

all_df

Unnamed: 0,Compressor name,Compression speed,Decompression speed,Original size,Compressed size,Ratio,Filename,Chunk size,Compressor parsed name,ratio parsed
1,nvcomp_ans 3.0.5 -5,4810.19,4792.34,211957760,184551816,87.07,/home/robin/dataset/silesia.tar,2048,nvcomp ans,1.148501
2,nvcomp_ans 3.0.5 -6,5326.15,5248.82,211957760,161362632,76.13,/home/robin/dataset/silesia.tar,4096,nvcomp ans,1.313543
3,nvcomp_ans 3.0.5 -7,5610.25,5498.29,211957760,149984936,70.76,/home/robin/dataset/silesia.tar,8192,nvcomp ans,1.413228
4,nvcomp_ans 3.0.5 -8,5839.57,5692.96,211957760,139864144,65.99,/home/robin/dataset/silesia.tar,16384,nvcomp ans,1.515381
5,nvcomp_ans 3.0.5 -9,5953.35,5800.60,211957760,135283908,63.83,/home/robin/dataset/silesia.tar,32768,nvcomp ans,1.566661
...,...,...,...,...,...,...,...,...,...,...
12,nvcomp_zstd 3.0.5 -14,1210.46,2749.08,211957760,70561872,33.29,/home/robin/dataset/silesia.tar,1048576,nvcomp zstd,3.003905
13,nvcomp_zstd 3.0.5 -15,765.55,1784.87,211957760,70487688,33.26,/home/robin/dataset/silesia.tar,2097152,nvcomp zstd,3.006615
14,nvcomp_zstd 3.0.5 -16,449.70,1073.83,211957760,70403190,33.22,/home/robin/dataset/silesia.tar,4194304,nvcomp zstd,3.010235
15,nvcomp_zstd 3.0.5 -17,254.61,702.72,211957760,70363095,33.20,/home/robin/dataset/silesia.tar,8388608,nvcomp zstd,3.012048


In [8]:
# Decompression throughput as a function of chunk size, one line per compressor.
# The title previously read "compression ratio" although the y data and the
# y-axis label are decompression throughput; the title now matches the data.
# NOTE(review): the variable name still says "compression_ratio"; it is kept
# unchanged in case other cells reference it.
fig_compression_ratio = px.line(
    all_df,
    title="nvCOMP decompression throughput per chunk size",
    x="Chunk size",
    y="Decompression speed",
    color='Compressor parsed name',
    log_x=True,  # chunk sizes are powers of two
    markers=True,
)
fig_compression_ratio.update_layout(
    yaxis_title="Decompression throughput (MB/s)",
    xaxis_title="Chunk size (Bytes)",
)

fig_compression_ratio.show()







In [9]:
# Compression ratio as a function of chunk size, one line per compressor.
# The title previously read "decompression throughput" although the y data and
# the y-axis label are the compression ratio; the title now matches the data.
# NOTE(review): the variable name still says "decompression_throughput"; it is
# kept unchanged in case other cells reference it.
fig_decompression_throughput = px.line(
    all_df,
    title="nvCOMP compression ratio per chunk size",
    x="Chunk size",
    y="ratio parsed",
    color='Compressor parsed name',
    log_x=True,  # chunk sizes are powers of two
    markers=True,
)
fig_decompression_throughput.update_layout(
    yaxis_title="Compression ratio",
    xaxis_title="Chunk size (Bytes)",
)
fig_decompression_throughput.show()

In [10]:
# Trade-off view: decompression throughput plotted against the achieved
# compression ratio, one line (with markers) per compressor; linear axes.
fig_throughput_ratio = px.line(
    data_frame=all_df,
    x="ratio parsed",
    y="Decompression speed",
    color='Compressor parsed name',
    markers=True,
    title="nvCOMP decompression throughput per compression ratio",
)
fig_throughput_ratio.update_layout(
    xaxis_title="Compression ratio",
    yaxis_title="Decompression throughput (MB/s)",
)
fig_throughput_ratio.show()