In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from paretoset import paretoset
import plotly.express as px

ENWIK8_SIZE=1e8
ENWIK9_SIZE=1e9



hutter_csv = "../data/hutter/large_text_compression_benchmark.csv"

hutter_df = pd.read_csv(hutter_csv)



def ns_byte_to_mb_s(data):
    return 1/(data*(pow(10,-3)))


hutter_df['ratio_enwik8'] = ENWIK8_SIZE/hutter_df['enwik8']
hutter_df['ratio_enwik9'] = ENWIK9_SIZE/hutter_df['enwik9']

hutter_df['decomp_throughput'] = ns_byte_to_mb_s(hutter_df['decomp time'])
hutter_df['comp_throughput'] = ns_byte_to_mb_s(hutter_df['comp time'])

hutter_df.sort_values(["decomp time"],  
                    axis=0, 
                    ascending=[True],  
                    inplace=True, ignore_index=True)


hutter_df

Unnamed: 0,program,version,args,enwik8,enwik9,decompresser,Unnamed: 6,total size enwik9+prog,comp time,decomp time,mem,alg,Note,ratio_enwik8,ratio_enwik9,decomp_throughput,comp_throughput
0,ulz,0.06,c9,32945292.0,291028084,49450,x,291077534.0,325.0,1.1,490,LZ77,82.0,3.035335,3.436095,909.090909,3.076923
1,nakamichi,2019-Jul-01,,32917888.0,277293058,112899,s,277405957.0,8200000.0,1.3,302000,LZSS,85.0,3.037862,3.606293,769.230769,0.000122
2,lz4x,1.02,c4,41950112.0,372068437,48609,x,372117046.0,79.0,1.4,114,LZ77,68.0,2.383784,2.687678,714.285714,12.658228
3,lzf,1.02,cx,45198298.0,406805983,48359,x,406854342.0,68.0,2.2,151,LZ77,68.0,2.212473,2.458174,454.545455,14.705882
4,zstd,0.6.0,-22 --ultra,25405601.0,215674670,69687,s,215744357.0,701.0,2.2,792,LZ77,76.0,3.936140,4.636613,454.545455,1.426534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,cmix,v20,,14760552.0,109877715,241725,sd,110119440.0,621780.0,619024.0,31650,CM,83.0,6.774814,9.101027,0.001615,0.001608
203,cmve,0.2.0,-m230x7fed7dfd,16424248.0,129876858,307787,x,130184645.0,1140801.0,,19963,CM,81.0,6.088559,7.699601,,0.000877
204,WinRK,3.03,pwcm +td 800MB SFX,18612453.0,156291924,99665,xd,156391589.0,68555.0,,800,CM,10.0,5.372747,6.398283,,0.014587
205,bwtdisk,0.9.0,-b 2 -m 3500,24725277.0,190004306,169579,s,190173885.0,1124.0,,3500,BWT,48.0,4.044444,5.263039,,0.889680


In [17]:
custom_csv = "../data/hutter/customs.csv"
custom_df = pd.read_csv(custom_csv)


custom_df['ratio_enwik8'] = ENWIK8_SIZE/custom_df['enwik8']
custom_df['ratio_enwik9'] = ENWIK9_SIZE/custom_df['enwik9']

custom_df['decomp_throughput'] = ns_byte_to_mb_s(custom_df['decomp time'])
custom_df['comp_throughput'] = ns_byte_to_mb_s(custom_df['comp time'])

custom_df.sort_values(["decomp time"],  
                    axis=0, 
                    ascending=[True],  
                    inplace=True, ignore_index=True)

In [18]:

hutter_top_decomp_df = hutter_df.sort_values(['decomp_throughput'], ascending=False).head(10)



fig_hutter_top_decompression_8 = px.scatter(
    hutter_top_decomp_df,
    title=f"Hutter top decompression throughput", 
    x="ratio_enwik8",
    y="decomp_throughput",
    labels="program",
    color="program",
    width=800,
)


fig_hutter_top_decompression_9 = px.scatter(
    hutter_top_decomp_df,
    title=f"Hutter top decompression throughput", 
    x="ratio_enwik9",
    y="decomp_throughput",
    labels="program",
    color="program",
    width=800,
)


fig_hutter_top_decompression_8.update_layout(
    yaxis_title="Decompression throughput (MB/s)",
    xaxis_title="Compression ratio (enwik8)",
)

fig_hutter_top_decompression_9.update_layout(
    yaxis_title="Decompression throughput (MB/s)",
    xaxis_title="Compression ratio (enwik9)",
)

fig_hutter_top_decompression_8.show()
fig_hutter_top_decompression_9.show()


In [19]:

hutter_top_comp_df = hutter_df.sort_values(['decomp_throughput'], ascending=False).head(10)



fig_hutter_top_compression_8 = px.scatter(
    hutter_top_comp_df,
    title=f"Hutter top compression throughput", 
    x="ratio_enwik8",
    y="comp_throughput",
    labels="program",
    color="program",
    width=800,
)

fig_hutter_top_compression_9 = px.scatter(
    hutter_top_comp_df,
    title=f"Hutter top compression throughput", 
    x="ratio_enwik9",
    y="comp_throughput",
    labels="program",
    color="program",
    width=800,
)


fig_hutter_top_compression_8.update_layout(
    yaxis_title="Compression throughput (MB/s)",
    xaxis_title="Compression ratio (enwik8)",
)


fig_hutter_top_compression_9.update_layout(
    yaxis_title="Compression throughput (MB/s)",
    xaxis_title="Compression ratio (enwik9)",
)

fig_hutter_top_compression_8.show()
fig_hutter_top_compression_9.show()



In [20]:


hutter_comp_all_df = hutter_df.sort_values(['alg'], ascending=True)

# Count occurrences of each category
category_counts = hutter_df['alg'].value_counts()
hutter_comp_all_df['alg_count'] = hutter_comp_all_df['alg'].map(category_counts)

hutter_comp_all_df = hutter_comp_all_df.sort_values(['alg_count'], ascending=False)


mask = paretoset(hutter_comp_all_df[['ratio_enwik8', 'comp_throughput']], sense=["max", "max"])

pareto_points = hutter_comp_all_df[mask]
pareto_points.sort_values(['comp_throughput'], inplace=True)

hutter_comp_all_df['pareto'] = mask*4 +1


fig_hutter_compression_all = px.scatter(
    hutter_comp_all_df,
    title=f"Large Text Compression Benchmark compression throughput: Algorithms", 
    x="ratio_enwik8",
    y="comp_throughput",
    log_y=True,
    # labels="program",
    hover_data=['program'],
    color="alg",
    size='pareto',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    width=800,
    height=650,
)
fig_hutter_compression_all.add_scatter(
    x=pareto_points['ratio_enwik8'], 
    y=pareto_points['comp_throughput'], 
    mode='lines',
    line=dict(color="#000000"),
    name="Pareto front"
)

fig_hutter_compression_all.update_layout(
    yaxis_title="Compression throughput (MB/s)",
    xaxis_title="Compression ratio",
)

fig_hutter_compression_all.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [21]:
hutter_decomp_all_df = hutter_df.sort_values(['alg'], ascending=True)

# Count occurrences of each category
category_counts = hutter_df['alg'].value_counts()
hutter_decomp_all_df['alg_count'] = hutter_decomp_all_df['alg'].map(category_counts)

hutter_decomp_all_df = hutter_decomp_all_df.sort_values(['alg_count'], ascending=False)

mask = paretoset(hutter_decomp_all_df[['ratio_enwik8', 'decomp_throughput']], sense=["max", "max"])
pareto_points = hutter_decomp_all_df[mask]
pareto_points.sort_values(['decomp_throughput'], inplace=True)

hutter_decomp_all_df['pareto'] = mask*4 +1


fig_hutter_decompression_all = px.scatter(
    hutter_decomp_all_df,
    title=f"Large Text Compression Benchmark decompression throughput", 
    x="ratio_enwik8",
    y="decomp_throughput",
    log_y=True,
    hover_data=['program'],
    # labels="program",
    color="alg",
    size='pareto',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    width=800,
    height=650,
)

fig_hutter_decompression_all.add_scatter(
    x=pareto_points['ratio_enwik8'], 
    y=pareto_points['decomp_throughput'], 
    mode='lines',
    line=dict(color="#000000"),
    name="Pareto front"
)




fig_hutter_decompression_all.update_layout(
    yaxis_title="Decompression throughput (MB/s)",
    xaxis_title="Compression ratio",
)

fig_hutter_decompression_all.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
custom_comp_all_df = custom_df.sort_values(['alg'], ascending=True)

# Count occurrences of each category
category_counts = custom_df['alg'].value_counts()
custom_comp_all_df['alg_count'] = custom_comp_all_df['alg'].map(category_counts)

custom_comp_all_df = custom_comp_all_df.sort_values(['alg_count'], ascending=False)


mask = paretoset(custom_comp_all_df[['ratio_enwik8', 'comp_throughput']], sense=["max", "max"])

pareto_points = custom_comp_all_df[mask]
pareto_points.sort_values(['comp_throughput'], inplace=True)

custom_comp_all_df['pareto'] = mask*4 +1


fig_custom_compression_all = px.scatter(
    custom_comp_all_df,
    title=f"Large Text Compression Benchmark compression throughput: Algorithms", 
    x="ratio_enwik8",
    y="comp_throughput",
    hover_data=['program'],
    # log_y=True,
    # labels="program",
    color="program",
    size='pareto',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    width=800,
    height=650,
)
fig_custom_compression_all.add_scatter(
    x=pareto_points['ratio_enwik8'], 
    y=pareto_points['comp_throughput'], 
    mode='lines',
    line=dict(color="#000000"),
    name="Pareto front"
)

fig_custom_compression_all.update_layout(
    yaxis_title="Compression throughput (MB/s)",
    xaxis_title="Compression ratio",
)

fig_custom_compression_all.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
custom_decomp_all_df = custom_df.sort_values(['alg'], ascending=True)

# Count occurrences of each category
category_counts = custom_df['alg'].value_counts()
custom_decomp_all_df['alg_count'] = custom_decomp_all_df['alg'].map(category_counts)

custom_decomp_all_df = custom_decomp_all_df.sort_values(['alg_count'], ascending=False)

mask = paretoset(custom_decomp_all_df[['ratio_enwik8', 'decomp_throughput']], sense=["max", "max"])
pareto_points = custom_decomp_all_df[mask]
pareto_points.sort_values(['decomp_throughput'], inplace=True)

custom_decomp_all_df['pareto'] = mask*4 +1


fig_custom_decompression_all = px.scatter(
    custom_decomp_all_df,
    title=f"Large Text Compression Benchmark decompression throughput", 
    x="ratio_enwik8",
    y="decomp_throughput",
    hover_data=['program'],
    # log_y=True,
    # labels="program",
    color="program",
    size='pareto',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    width=800,
    height=650,
)

fig_custom_decompression_all.add_scatter(
    x=pareto_points['ratio_enwik8'], 
    y=pareto_points['decomp_throughput'], 
    mode='lines',
    line=dict(color="#000000"),
    name="Pareto front"
)




fig_custom_decompression_all.update_layout(
    yaxis_title="Decompression throughput (MB/s)",
    xaxis_title="Compression ratio",
)

fig_custom_decompression_all.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

