In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import plotly.graph_objects as go
import plotly.express as px
from collections import defaultdict
from typing import Union

%matplotlib inline

# Font settings for exported matplotlib figures: fonttype 42 makes the PDF/PS
# backends embed TrueType (Type 42) fonts, and all text uses Times New Roman.
matplotlib.rcParams.update({
    "pdf.fonttype": 42,
    "ps.fonttype": 42,
    "font.family": "Times New Roman",
})

In [2]:
# One row per dataset:
#   (size in KB, number of classes, train samples, test samples, balanced "yes"/"no")
# Keeping the attributes side by side guarantees that every lookup table
# derived below has the same keys in the same order.
_DATASET_TABLE = {
    "breast_cancer": (144, 2, 398, 171, "no"),
    "glass": (24, 6, 149, 65, "no"),
    "iris": (12, 3, 105, 45, "yes"),
    # NOTE(review): this is the only dataset with fewer train than test
    # samples (1400 vs 6000) -- confirm that 1400 is not a typo.
    "letter": (2664, 26, 1400, 6000, "yes"),
    "mnist": (53672, 10, 60000, 10000, "yes"),
    "motion_sense": (12512, 6, 3414, 1020, "yes"),
    "satimage": (1864, 6, 4501, 1929, "no"),
    "segment": (368, 7, 1617, 693, "yes"),
    "vehicle": (132, 3, 676, 170, "yes"),
    "wine": (24, 3, 124, 54, "no"),
}

# Names of all datasets, in the order in which their results are loaded.
datasets = list(_DATASET_TABLE)

# Per-attribute lookup tables, keyed by dataset name.
datasets_sizes = {name: row[0] for name, row in _DATASET_TABLE.items()}
dataset_classes = {name: row[1] for name, row in _DATASET_TABLE.items()}
dataset_train_samples = {name: row[2] for name, row in _DATASET_TABLE.items()}
dataset_test_samples = {name: row[3] for name, row in _DATASET_TABLE.items()}
dataset_balanced = {name: row[4] for name, row in _DATASET_TABLE.items()}

# Result column used to score each dataset: accuracy for balanced datasets,
# weighted F1 for unbalanced ones.
metrics_to_use = {
    name: "accuracy" if balanced == "yes" else "f1 weighted"
    for name, balanced in dataset_balanced.items()
}


# Collect every result CSV from each "<dataset> experiment" directory.
# Path.glob gives no ordering guarantee, so the paths are sorted to keep the
# row order of `df` (and anything downstream that depends on it, such as
# tie-breaking between equally good runs) reproducible across machines.
csv_paths = [
    csv_path
    for dset in datasets
    for csv_path in sorted(Path(f"{dset} experiment").glob("*.csv"))
]
if not csv_paths:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No result CSVs found: expected files matching "
        "'<dataset> experiment/*.csv' relative to the working directory."
    )

dfs = [pd.read_csv(csv_path) for csv_path in csv_paths]

# Each file keeps its own row index, so index labels repeat in the
# concatenated frame. Every cell equal to "mnist-dist-16" is relabelled
# "mnist" (presumably the dataset name, so that it matches the lookup-table
# keys -- verify against the CSVs).
df = pd.concat(dfs).replace({"mnist-dist-16": "mnist"})
df

Unnamed: 0,bleach,accuracy,f1 weighted,f1 macro,f1 micro,ties,run,train time,predict time,ram name,...,encoder kwargs,experiment name,model size,train samples,test samples,classes,rams per discriminator,discriminators,seed,indices
0,2,0.918129,0.918781,0.914107,0.918129,18,1,1.812207,0.481705,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9624,398,171,2,30,2,1670370805,480
1,5,0.929825,0.929825,0.925088,0.929825,7,1,1.812207,0.338999,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9624,398,171,2,30,2,1670370805,480
2,10,0.947368,0.947101,0.943263,0.947368,3,1,1.812207,0.338749,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9624,398,171,2,30,2,1670370805,480
3,2,0.929825,0.930384,0.926378,0.929825,26,2,0.497607,0.338319,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9720,398,171,2,30,2,1670370808,480
4,5,0.941520,0.941694,0.937954,0.941520,13,2,0.497607,0.339714,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9720,398,171,2,30,2,1670370808,480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,5,0.870370,0.869781,0.873519,0.870370,3,3,0.071017,0.087331,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208
432,8,0.888889,0.888060,0.892690,0.888889,6,3,0.071017,0.086613,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208
433,10,0.814815,0.812150,0.816460,0.814815,5,3,0.071017,0.081598,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208
434,15,0.814815,0.798173,0.808999,0.814815,4,3,0.071017,0.081574,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208


In [3]:
# Gather the per-dataset descriptors into one record per dataset.
meta = defaultdict(dict)
for dset in datasets:
    meta[dset].update({
        "size (KB)": datasets_sizes[dset],
        "classes": dataset_classes[dset],
        "train samples": dataset_train_samples[dset],
        "test samples": dataset_test_samples[dset],
        "balanced": dataset_balanced[dset],
    })

# pd.DataFrame(meta) puts the datasets on the columns; transpose so that each
# dataset becomes a row and each descriptor a column.
meta_df = pd.DataFrame(meta).T
meta_df

Unnamed: 0,size (KB),classes,train samples,test samples,balanced
breast_cancer,144,2,398,171,no
glass,24,6,149,65,no
iris,12,3,105,45,yes
letter,2664,26,1400,6000,yes
mnist,53672,10,60000,10000,yes
motion_sense,12512,6,3414,1020,yes
satimage,1864,6,4501,1929,no
segment,368,7,1617,693,yes
vehicle,132,3,676,170,yes
wine,24,3,124,54,no


In [4]:
# Default output directories, relative to the working directory. They are the
# default `path` arguments of write_latex_table and write_figure respectively.
latex_tables_path = Path("tables")
figures_path = Path("figures")

def write_figure(filename: str, fig: go.Figure, path: Union[Path, str] = figures_path):
    """Export a plotly figure to ``path/filename`` and report where it went.

    The target directory (including missing parents) is created if it does
    not exist yet. After writing, the full path, the file name and the file
    name with every ".pdf" occurrence stripped (printed as the LaTeX label)
    are echoed to stdout.

    Parameters
    ----------
    filename : str
        Name of the file to create inside ``path``.
    fig : go.Figure
        The plotly figure to export.
    path : Union[Path, str], optional
        Directory that receives the file, by default figures_path
    """
    out_dir = Path(path)
    out_dir.mkdir(parents=True, exist_ok=True)

    target = out_dir / filename
    fig.write_image(target)

    print(f"Figure written to: {target}")
    print("Filename   :", filename)
    print("Latex label:", filename.replace(".pdf", ""))
    
def write_latex_table(filename: str, table: str, path: Union[Path, str] = latex_tables_path):
    """Write a latex table to a file, preceded by a "generated file" banner.

    The target directory (including missing parents) is created if needed and
    an existing file of the same name is overwritten.

    Parameters
    ----------
    filename : str
        The name of the file to write to.
    table : str
        The table, as a string.
    path : Union[Path, str], optional
        The path where the file will be stored, by default latex_tables_path
    """
    path = Path(path)
    path.mkdir(exist_ok=True, parents=True)
    fname = path/filename
    # Explicit UTF-8: without it the platform's locale encoding is used, so
    # the same table could produce different bytes (or fail to encode) on
    # another machine.
    with fname.open("w", encoding="utf-8") as f:
        # LaTeX comment banner telling readers not to edit the file by hand.
        f.write("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")
        f.write("%% WARNING: DO NOT CHANGE THIS FILE. IT IS GENERATED AUTOMATICALLY %\n")
        f.write("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n")
        f.write(table)
    print(f"Table written to: {fname}")



In [5]:
# Rows left out of the analysis: everything from the "segment" dataset, plus
# the est_elements=1000 count-bloom and width=1000 count-min-sketch settings.
excluded_ram_kwargs = [
    '{"est_elements": 1000, "false_positive_rate": 0.05}',
    '{"width": 1000, "depth": 5}',
    '{"width": 1000, "depth": 3}',
]
keep_rows = (df["dataset name"] != "segment") & ~df["ram kwargs"].isin(excluded_ram_kwargs)
df = df[keep_rows]
df


Unnamed: 0,bleach,accuracy,f1 weighted,f1 macro,f1 micro,ties,run,train time,predict time,ram name,...,encoder kwargs,experiment name,model size,train samples,test samples,classes,rams per discriminator,discriminators,seed,indices
0,2,0.918129,0.918781,0.914107,0.918129,18,1,1.812207,0.481705,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9624,398,171,2,30,2,1670370805,480
1,5,0.929825,0.929825,0.925088,0.929825,7,1,1.812207,0.338999,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9624,398,171,2,30,2,1670370805,480
2,10,0.947368,0.947101,0.943263,0.947368,3,1,1.812207,0.338749,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9624,398,171,2,30,2,1670370805,480
3,2,0.929825,0.930384,0.926378,0.929825,26,2,0.497607,0.338319,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9720,398,171,2,30,2,1670370808,480
4,5,0.941520,0.941694,0.937954,0.941520,13,2,0.497607,0.339714,DictRam,...,"{""resolution"": 16}",breast_cancer experiment,9720,398,171,2,30,2,1670370808,480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,5,0.870370,0.869781,0.873519,0.870370,3,3,0.071017,0.087331,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208
432,8,0.888889,0.888060,0.892690,0.888889,6,3,0.071017,0.086613,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208
433,10,0.814815,0.812150,0.816460,0.814815,5,3,0.071017,0.081598,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208
434,15,0.814815,0.798173,0.808999,0.814815,4,3,0.071017,0.081574,stream-threshold,...,"{""resolution"": 16}",wine experiment,117624,124,54,3,13,3,1670371352,208


In [6]:
# Maps the raw "<ram name> <ram kwargs>" column labels built by get_best to
# the short names shown in figures and tables. Insertion order doubles as the
# display order. The count-min-sketch width=500 and width=1000 settings are
# not mapped and therefore never selected for display.
rename_elements = {
    'DictRam {}': "Dict-WiSARD",
    'count-bloom {"est_elements": 100, "false_positive_rate": 0.02}': 'CB FPR=0.02',
    'count-bloom {"est_elements": 100, "false_positive_rate": 0.05}': 'CB FPR=0.05',
    'count-bloom {"est_elements": 100, "false_positive_rate": 0.08}': 'CB FPR=0.08',
    'count-min-sketch {"width": 20, "depth": 2}': 'CMS W=20 D=2',
    'count-min-sketch {"width": 20, "depth": 3}': 'CMS W=20 D=3',
    'count-min-sketch {"width": 50, "depth": 2}': 'CMS W=50 D=2',
    'count-min-sketch {"width": 50, "depth": 3}': 'CMS W=50 D=3',
    'count-min-sketch {"width": 100, "depth": 3}': 'CMS W=100 D=3',
    'count-min-sketch {"width": 100, "depth": 5}': 'CMS W=100 D=5',
}

# Display order of the configurations. "Dict-WiSARD" must stay first:
# do_rename_and_reorder drops it with element_order[1:].
element_order = list(rename_elements.values())

# Display order of the datasets. "segment" is absent because its rows are
# filtered out of df.
datasets_order = [
    "iris", "glass", "wine", "vehicle", "breast_cancer",
    "satimage", "letter", "motion_sense", "mnist",
]

In [7]:
def get_best(df: pd.DataFrame, metric: str = None, improvement_col: str = "DictRam {}", dataset_metrics: dict = None):
    """Summarise the best run of every RAM configuration on every dataset.

    For each (ram name, ram kwargs) configuration and each dataset, the run
    with the highest value of that dataset's selection metric is picked and a
    single value from that run is reported.

    Parameters
    ----------
    df : pd.DataFrame
        Experiment results. Must contain the columns "ram name", "ram kwargs"
        and "dataset name", every selection metric column and, when given,
        the ``metric`` column.
    metric : str, optional
        Column to report from the best run (e.g. "model size"). When None,
        each dataset reports its own selection metric.
    improvement_col : str, optional
        Column label ("<ram name> <ram kwargs>") of the baseline
        configuration; every column is divided by it, so the baseline itself
        becomes 1. Pass None to get the raw values. By default "DictRam {}".
    dataset_metrics : dict, optional
        Mapping from dataset name to the column used to rank its runs. When
        None, the module-level ``metrics_to_use`` mapping is used.

    Returns
    -------
    pd.DataFrame
        One row per dataset and one column per configuration. A
        configuration without any run on a dataset yields NaN.
    """
    if dataset_metrics is None:
        dataset_metrics = metrics_to_use

    # Only these three RAM types take part in the comparison.
    tracked = df[df["ram name"].isin(["DictRam", "count-min-sketch", "count-bloom"])]
    configs = [key for key, _ in tracked.groupby(["ram name", "ram kwargs"])]
    dataset_names = list(df["dataset name"].unique())

    results = defaultdict(dict)
    for ram_name, ram_kwargs in configs:
        label = f"{ram_name} {ram_kwargs}"
        config_rows = df[(df["ram name"] == ram_name) & (df["ram kwargs"] == ram_kwargs)]
        for dset in dataset_names:
            selection_metric = dataset_metrics[dset]
            # Resolve the reported column per dataset. Overwriting `metric`
            # here would freeze the first dataset's metric for all the others.
            report_metric = selection_metric if metric is None else metric
            runs = config_rows[config_rows["dataset name"] == dset]
            if runs.empty:
                # No run of this configuration on this dataset: leave a gap
                # instead of failing on .iloc[0].
                results[label][dset] = float("nan")
                continue
            best = runs.sort_values(by=selection_metric, ascending=False).iloc[0]
            results[label][dset] = best[report_metric]

    table = pd.DataFrame(results)
    if improvement_col is not None:
        # Row-wise division: each dataset is normalised by its baseline value.
        table = table.div(table[improvement_col], axis=0)

    return table

def do_rename_and_reorder(d, remove_dict: bool = False):
    """Relabel the configuration columns and put rows and columns in display order.

    Parameters
    ----------
    d : pd.DataFrame
        Frame with raw configuration labels as columns and datasets as rows.
    remove_dict : bool, optional
        When True, the first entry of ``element_order`` is left out of the
        result, by default False

    Returns
    -------
    pd.DataFrame
        Transposed frame: one row per configuration, ordered as in
        ``element_order``, and one column per dataset, ordered as in
        ``datasets_order``.
    """
    wanted_elements = element_order[1:] if remove_dict else element_order
    renamed = d.rename(columns=rename_elements)
    by_element = renamed[wanted_elements].T
    return by_element[datasets_order]


In [8]:
# Model size of the best run of every filter-based configuration, relative to
# Dict-WiSARD. After the transpose, datasets are the rows and configurations
# the columns.
d = do_rename_and_reorder(get_best(df, metric="model size"), remove_dict=True).T

# Green only at the very bottom of the range, then a yellow-to-red ramp.
heatmap_colorscale = [(0.0, "lightgreen"), (0.01, "lightyellow"), (1.0, "red")]

fig = px.imshow(
    d,
    zmin=0,
    color_continuous_midpoint=1.0,
    color_continuous_scale=heatmap_colorscale,
    aspect="auto",
    text_auto=".2f",
)

layout_options = dict(
    width=1000,
    height=200,
    font_family="Times New Roman",
    font_size=12,
    margin=dict(l=0, r=0, t=10, b=0),
    xaxis=dict(tickangle=0, tickfont=dict(size=12)),
    yaxis=dict(tickangle=0, tickfont=dict(size=12)),
)
fig.update_layout(**layout_options)

write_figure("size_improvement.pdf", fig)

fig.show()

Figure written to: figures/size_improvement.pdf
Filename   : size_improvement.pdf
Latex label: size_improvement


In [9]:
# Best score of every filter-based configuration, relative to Dict-WiSARD.
# After the transpose, datasets are the rows and configurations the columns.
d = get_best(df, metric=None)
d = do_rename_and_reorder(d, remove_dict=True)
d = d.T

dsets = list(d.index)
palette = px.colors.qualitative.Plotly

# One bar series per configuration, grouped by dataset on the x axis.
fig = go.Figure()
for i, c in enumerate(d.columns):
    fig.add_trace(
        go.Bar(
            x=dsets,
            y=d[c],
            name=c,
            # Cycle through the palette so that having more configurations
            # than colours does not raise an IndexError.
            marker_color=palette[i % len(palette)],
        )
    )

# Reference line at 1 (parity with the Dict-WiSARD baseline).
fig.add_hline(y=1, line_width=1, line_dash="dash", line_color="red")

fig.update_yaxes(range=[0.60, 1.15])

fig.update_layout(
    yaxis_title="Performance improvement",
    width=800,
    height=200,
    font_family="Times New Roman",
    font_size=12,
    margin=dict(l=0, r=0, t=10, b=0),
    xaxis=dict(tickfont=dict(size=12)),
)

write_figure("accuracy_improvement.pdf", fig)

fig.show()

Figure written to: figures/accuracy_improvement.pdf
Filename   : accuracy_improvement.pdf
Latex label: accuracy_improvement


In [10]:
# Raw (not baseline-normalised) best score and model size per dataset and
# configuration; datasets are the rows after the transpose.
d = do_rename_and_reorder(
    get_best(df, metric=None, improvement_col=None), remove_dict=False
).T
d_m = do_rename_and_reorder(
    get_best(df, metric="model size", improvement_col=None), remove_dict=False
).T

dict_score = d[["Dict-WiSARD"]].rename(columns={"Dict-WiSARD": "Score"})
# NOTE(review): floor division drops the fractional part before the "%.2f"
# formatting below (values print as e.g. "9.00"), and the label says "Kb"
# while the dataset size column says "KB" -- confirm both are intended.
dict_footprint = (d_m[["Dict-WiSARD"]] // 1e3).rename(
    columns={"Dict-WiSARD": "Memory Footprint (Kb)"}
)

# Inner merges on the dataset name: datasets without results drop out.
acc_meta = (
    meta_df
    .merge(dict_score, left_index=True, right_index=True)
    .merge(dict_footprint, left_index=True, right_index=True)
    .reset_index()
    .rename(columns={
        "index": "Dataset",
        "size (KB)": "Size (KB)",
        "classes": "Classes",
        "train samples": "#Train",
        "test samples": "#Test",
        "balanced": "Balanced",
    })
)

latex_str = acc_meta.to_latex(
    index=False,
    float_format="%.2f",
    label="tab:datasets",
    caption="Description of the datasets used in the experiments and the performance obtained using Dict WiSARD. For unbalanced datasets, the f1-score is reported as score, else the accuracy is reported.",
)

write_latex_table("datasets.tex", latex_str)

print(latex_str)

Table written to: tables/datasets.tex
\begin{table}
\centering
\caption{Description of the datasets used in the experiments and the performance obtained using Dict WiSARD. For unbalanced datasets, the f1-score is reported as score, else the accuracy is reported.}
\label{tab:datasets}
\begin{tabular}{llllllrr}
\toprule
      Dataset & Size (KB) & Classes & \#Train & \#Test & Balanced &  Score &  Memory Footprint (Kb) \\
\midrule
breast\_cancer &       144 &       2 &    398 &   171 &       no &   0.95 &                   9.00 \\
        glass &        24 &       6 &    149 &    65 &       no &   0.64 &                  12.00 \\
         iris &        12 &       3 &    105 &    45 &      yes &   0.96 &                   1.00 \\
       letter &      2664 &      26 &   1400 &  6000 &      yes &   0.86 &                 301.00 \\
        mnist &     53672 &      10 &  60000 & 10000 &      yes &   0.91 &                8975.00 \\
 motion\_sense &     12512 &       6 &   3414 &  1020 &      y


In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.

