In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
# Load compression_test_results.csv into `df` and display its first ten rows.
df = pd.read_csv("compression_test_results.csv")
df.head(10)

Unnamed: 0,category,file,bwt,mtf,rle,original_size,compressed_size,compression_ratio,time_seconds
0,base64_noise,base64_11.txt,0,0,0,1000000,750153,0.750153,0.041991
1,base64_noise,base64_11.txt,0,0,1,1000000,838169,0.838169,0.045911
2,base64_noise,base64_11.txt,0,1,0,1000000,752119,0.752119,0.091623
3,base64_noise,base64_11.txt,0,1,1,1000000,840200,0.8402,0.097812
4,base64_noise,base64_11.txt,1,0,0,1000000,752079,0.752079,0.089726
5,base64_noise,base64_11.txt,1,0,1,1000000,840094,0.840094,0.094161
6,base64_noise,base64_11.txt,1,1,0,1000000,752121,0.752121,0.136637
7,base64_noise,base64_11.txt,1,1,1,1000000,840190,0.84019,0.144113
8,base64_noise,base64_0.txt,0,0,0,1000000,750153,0.750153,0.042071
9,base64_noise,base64_0.txt,0,0,1,1000000,838315,0.838315,0.046303


In [3]:
# Display every row whose category is anything other than 'structured_size'.
df[df['category'] != 'structured_size']

Unnamed: 0,category,file,bwt,mtf,rle,original_size,compressed_size,compression_ratio,time_seconds
0,base64_noise,base64_11.txt,0,0,0,1000000,750153,0.750153,0.041991
1,base64_noise,base64_11.txt,0,0,1,1000000,838169,0.838169,0.045911
2,base64_noise,base64_11.txt,0,1,0,1000000,752119,0.752119,0.091623
3,base64_noise,base64_11.txt,0,1,1,1000000,840200,0.840200,0.097812
4,base64_noise,base64_11.txt,1,0,0,1000000,752079,0.752079,0.089726
...,...,...,...,...,...,...,...,...,...
1683,xml_structured,xml_8.txt,0,1,1,721060,350616,0.486251,0.027088
1684,xml_structured,xml_8.txt,1,0,0,721060,393382,0.545561,0.072223
1685,xml_structured,xml_8.txt,1,0,1,721060,74392,0.103170,0.065132
1686,xml_structured,xml_8.txt,1,1,0,721060,129432,0.179502,0.069721


In [None]:
# One DataFrame per category, keyed by category name, so a single category can
# be looked up by name and a missing one shows up as None.
category_frames = {cat: df_cat for cat, df_cat in df.groupby("category")}

# The plot title and the fallback message below both refer to 'base64_noise',
# so select exactly that category. .get() returns None when it is absent,
# which is what the `is not None` check relies on.
# NOTE(review): this cell previously plotted every category except
# 'structured_size' under the 'base64_noise' title; confirm this selection is
# the intended one.
base64_df = category_frames.get("base64_noise")
if base64_df is not None:
    # .assign builds a new frame, so the groupby slice is never written to
    # (avoids SettingWithCopyWarning). The pipeline id is the three 0/1 flags
    # concatenated in bwt, mtf, rle order, e.g. "101".
    base64_df = base64_df.assign(
        pipeline=(
            base64_df["bwt"].astype(str)
            + base64_df["mtf"].astype(str)
            + base64_df["rle"].astype(str)
        )
    )
    plt.figure(figsize=(10, 6))
    ax = sns.scatterplot(
        data=base64_df,
        x="compression_ratio",
        y="time_seconds",
        hue="pipeline",
        palette="tab10",
        s=200,
        alpha=0.5,
    )
    # move_legend needs the object that owns the legend as its first argument.
    # Anchor the legend just outside the top-right corner of the axes.
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    plt.title("Compression Ratio vs Time (base64_noise)")
    plt.xlabel("Compression Ratio (compressed/original)")
    plt.ylabel("Compression Time (seconds)")
    plt.tight_layout()
    plt.show()
else:
    print("Category 'base64_noise' not found in data.")

In [None]:
# Reload the results so this cell does not depend on columns added elsewhere.
csv_file = "compression_test_results.csv"
df = pd.read_csv(csv_file)

# Pipeline id: the three 0/1 flags concatenated in bwt, mtf, rle order.
df["pipeline"] = (
    df["bwt"].astype(str) +
    df["mtf"].astype(str) +
    df["rle"].astype(str)
)

plt.figure(figsize=(14, 6))
# Bars show the mean compression ratio per (category, pipeline); the error
# bars show one standard deviation. Only `errorbar` is passed: `ci` is the
# deprecated spelling of the same option and merely triggers a FutureWarning.
ax = sns.barplot(
    data=df,
    x="category",
    y="compression_ratio",
    hue="pipeline",
    errorbar="sd",
    alpha=0.3
)
plt.title("Average Compression Ratio per Category by Pipeline")
plt.xlabel("Test Category")
plt.ylabel("Compression Ratio (compressed/original)")
plt.xticks(rotation=45)
# Place the legend just outside the top-right corner of the axes.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()

In [None]:
def describe_pipeline(flags):
    """Turn a three-element flag sequence into a readable pipeline name.

    Parameters
    ----------
    flags : sequence of exactly three items
        The BWT, MTF and RLE switches, in that order. A stage is considered
        enabled only when its item equals the string '1'.

    Returns
    -------
    str
        'Huffman only' when no stage is enabled, otherwise the enabled stage
        names joined by ' + ' and followed by ' + Huffman'.

    Raises
    ------
    ValueError
        If ``flags`` does not unpack into exactly three items.
    """
    bwt, mtf, rle = flags
    switches = ((bwt, 'BWT'), (mtf, 'MTF'), (rle, 'RLE'))
    enabled = [label for value, label in switches if value == '1']
    if enabled:
        return ' + '.join(enabled) + ' + Huffman'
    return 'Huffman only'

# Attach a readable pipeline label to every row: concatenate the three 0/1
# flags (bwt, mtf, rle order) and translate them with describe_pipeline.
df['flags'] = df['bwt'].astype(str) + df['mtf'].astype(str) + df['rle'].astype(str)
df['pipeline_text'] = df['flags'].apply(describe_pipeline)
df['time_seconds'] = df['time_seconds'].astype(float)
pastel_colors = px.colors.qualitative.Pastel

# One bar chart per category: mean compression time for each pipeline, with
# the standard deviation as the error bar.
for cat in df['category'].unique():
    df_cat = df[df['category'] == cat]
    agg = (
        df_cat
        .groupby('pipeline_text')['time_seconds']
        .agg(mean='mean', std='std')
        .reset_index()
    )

    # The aggregated quantity is time_seconds, so the axis label and title
    # say "time" (they previously said "Compression Ratio", which mislabelled
    # the plotted values).
    fig = px.bar(
        agg,
        x='pipeline_text',
        y='mean',
        error_y='std',
        color='pipeline_text',
        color_discrete_sequence=pastel_colors,
        labels={
            'pipeline_text': 'Pipeline Configuration',
            'mean': 'Avg. Compression Time (s)'
        },
        title=f"Compression Time by Pipeline — {cat}"
    )

    fig.update_layout(
        title_x=0.5,
        legend_title_text='Pipeline',
        xaxis_tickangle=45,
        showlegend=False,
        font_family="serif"
    )

    # Save vector SVG
    # fname = f"compression_time_{cat.lower().replace(' ', '_')}.svg"
    # fig.write_image(fname)

    fig.show()

In [40]:
import plotly.graph_objects as go
# Read the results file and keep only the compression-phase rows, with the
# two plotted columns converted to float.
df = pd.read_csv("results.csv")
dfc = df[df.phase == "compress"].copy()
dfc = dfc.astype({"size_MB": float, "time_s": float})

# One marker trace per distinct value of the `type` column.
fig = go.Figure()
for algo in dfc["type"].unique():
    rows = dfc.loc[dfc["type"] == algo]
    points = go.Scatter(
        x=rows["size_MB"],
        y=rows["time_s"],
        mode="markers",
        name=algo,
        marker={"size": 10},
    )
    fig.add_trace(points)

# Evaluate the reference curves on 200 evenly spaced sizes across the data.
x_min = dfc["size_MB"].min()
x_max = dfc["size_MB"].max()
x_line = np.linspace(x_min, x_max, num=200)

# Linearithmic reference: y1 = a*x * log(b*x) + c
a, b, c = 0.08, 2.25, 0.7
y1 = a * x_line * np.log(b * x_line) + c

# Linear reference: y2 = m*x + d
m, d = 0.4, 0.0
y2 = m * x_line + d

# Overlay both reference curves, distinguished by their dash pattern.
for label, y_vals, dash in (("linearithmic", y1, "dash"), ("linear", y2, "dot")):
    fig.add_trace(
        go.Scatter(
            x=x_line,
            y=y_vals,
            mode="lines",
            name=label,
            line={"width": 3, "dash": dash},
        )
    )

layout_options = {
    "title": "Compression Run Time (s) vs Size (MB)",
    "title_x": 0.5,
    "xaxis_title": "File Size (MB)",
    "yaxis_title": "Compression Time (s)",
    "legend_title": "Legend",
    "template": "plotly_white",
    "font_family": "serif",
}
fig.update_layout(**layout_options)
fig.show()


In [41]:
# Write whatever `fig` currently refers to out as Runtime_Size.svg.
# NOTE(review): plotly static image export usually needs an extra export
# backend (e.g. kaleido) installed — confirm for this environment.
fig.write_image("Runtime_Size.svg")

In [42]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
# Read the results file and keep only the decompression-phase rows, with the
# two plotted columns converted to float.
df = pd.read_csv("results.csv")
dfc = df[df.phase == "decompress"].copy()
dfc = dfc.astype({"size_MB": float, "time_s": float})

# One marker trace per distinct value of the `type` column.
fig = go.Figure()
for algo in dfc["type"].unique():
    rows = dfc.loc[dfc["type"] == algo]
    points = go.Scatter(
        x=rows["size_MB"],
        y=rows["time_s"],
        mode="markers",
        name=algo,
        marker={"size": 10},
    )
    fig.add_trace(points)

# Evaluate the reference line on 200 evenly spaced sizes across the data.
x_min = dfc["size_MB"].min()
x_max = dfc["size_MB"].max()
x_line = np.linspace(x_min, x_max, num=200)

# Linear reference: y2 = m*x + d
m, d = 0.18, -0.5
y2 = m * x_line + d
linear_reference = go.Scatter(
    x=x_line,
    y=y2,
    mode="lines",
    name="linear",
    line={"width": 3, "dash": "dot"},
)
fig.add_trace(linear_reference)

layout_options = {
    "title": "Decompression Run Time vs Size",
    "title_x": 0.5,
    "xaxis_title": "File Size (MB)",
    "yaxis_title": "Decompression Time (s)",
    "legend_title": "Legend",
    "font_family": "serif",
    "template": "plotly_white",
}
fig.update_layout(**layout_options)
fig.show()

In [43]:
# Write whatever `fig` currently refers to out as Decomp_Runtime_Size.svg.
# NOTE(review): plotly static image export usually needs an extra export
# backend (e.g. kaleido) installed — confirm for this environment.
fig.write_image("Decomp_Runtime_Size.svg")