In [371]:
import os
from typing import List, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

WIDTH = 380
HEIGHT = 450

In [393]:
RESULTS_LOCATION = './comparison_results/'
ENSEMBLES = ['ONSBoost', 'OB', 'OnlineBoosting', 'OOB', 'UOB']
METRICS = ['recall', 'specificity', 'precision', 'balanced_accuracy_score', 'f1_score', 'geometric_mean_score_1']
RST: list[int] = [1000, 100000, 101010, 10110, 101101, 1001, 10101010, 101, 110, 1337]
BASE: list[str] = ['GaussianNB', 'SGDClassifier']
BASE_SHORT=dict(GaussianNB='GNB', SGDClassifier='SGD')
STREAMS: list[str] = ['cdi__w2_4_0,9', 'cdi__w2_5_0,75', 'disco__w2_5_0,9']
STREAMS_NAMES: dict[str, str] = {
    'cdi__w2_4_0,9': 'CDI1',
    'cdi__w2_5_0,75': 'CDI2',
    'disco__w2_5_0,9': 'DISCO1'
}
STREAMS_LATEX: dict[str, str] = {
    'cdi__w2_4_0,9': 'CDI\\textsubscript{1}',
    'cdi__w2_5_0,75': 'CDI\\textsubscript{2}',
    'disco__w2_5_0,9': 'DISCO\\textsubscript{1}'
}
METRICS_NAMES: dict[str, str] = {
    'recall': 'czułość (recall)',
    'specificity': 'swoistość (specificity)',
    'precision': 'precyzja (precision)',
    'balanced_accuracy_score': 'zbalansowana dokładność (BAC)',
    'f1_score': 'F1',
    'geometric_mean_score_1': 'Gmean'
}
METRICS_NAMES_EN: dict[str, str] = {
    'recall': 'recall',
    'specificity': 'specificity',
    'precision': 'precision',
    'balanced_accuracy_score': 'BAC',
    'f1_score': 'F1',
    'geometric_mean_score_1': 'Gmean'
}
METRICS_NAMES_SHORT: dict[str, str] = {
    'recall': 'czułość',
    'specificity': 'swoistość',
    'precision': 'precyzja',
    'balanced_accuracy_score': 'BAC',
    'f1_score': 'F1',
    'geometric_mean_score_1': 'Gmean'
}
METRIC_SS: dict[str, str] = dict(
    recall='TPR',
    specificity='TNR',
    precision='PPV',
    balanced_accuracy_score='BAC',
    f1_score='F1',
    geometric_mean_score_1='Gmean'
)
ENSEMBLES_NAMES_SHORT: dict[str, str] = {
    'ONSBoost': 'ONS',
    'OnlineBoosting': 'OBo',
    'OB': 'OBa',
    'OOB': 'OOB',
    'UOB': 'UOB'
}
ENAME_RANKS: dict[str, str] = {
    'ONSBoost': 'ONS\\textsubscript{1}',
    'OnlineBoosting': 'OBo\\textsubscript{3}',
    'OB': 'OBa\\textsubscript{2}',
    'OOB': 'OOB\\textsubscript{4}',
    'UOB': 'UOB\\textsubscript{5}'
}
ENAME_RANKS_ORDER = [ENAME_RANKS[name] for name in ENSEMBLES]
ENAME_SHORT_ORDER = [ENSEMBLES_NAMES_SHORT[name] for name in ENSEMBLES]
METRICS_LATEX = {
    'recall': 'TPR',
    'specificity': 'TNR',
    'precision': 'PPV',
    'balanced_accuracy_score': 'BAC',
    'f1_score': 'F\\textsubscript{1}',
    'geometric_mean_score_1': 'G\\textsubscipt{mean}'
}

In [373]:
def load_all_data(directory: str):
    _results_files = [file for file in os.listdir(directory)
                      if not os.path.isdir(os.path.join(directory, file))
                      and file.startswith('v1')]
    _files = [(file, np.load(os.path.abspath(os.path.join(RESULTS_LOCATION, file)))) for file in _results_files]
    acc = []
    for file, data in _files:
        rst = file.split('__RST_')[1].split('.')[0]
        stream = file.split('++')[1].split('__NC')[0]
        base_clf = file.split('++')[0].split('v1')[1]
        for i, ensemble in enumerate(ENSEMBLES):
            current = {
                'base': base_clf,
                'rst': rst,
                'stream': stream,
                'ensemble': ensemble
            }
            for j, metric in enumerate(METRICS):
                mean = data[i, :, j].mean(axis=0)
                current[metric] = mean
            acc.append(current)

    return pd.DataFrame(acc)


all_data = load_all_data(RESULTS_LOCATION)
all_data

Unnamed: 0,base,rst,stream,ensemble,recall,specificity,precision,balanced_accuracy_score,f1_score,geometric_mean_score_1
0,GaussianNB,1000,"cdi__w2_4_0,9",ONSBoost,0.892209,0.876253,0.925467,0.884231,0.906098,0.876541
1,GaussianNB,1000,"cdi__w2_4_0,9",OB,0.811745,0.886501,0.812915,0.849123,0.802972,0.845808
2,GaussianNB,1000,"cdi__w2_4_0,9",OnlineBoosting,0.885987,0.879433,0.912265,0.882710,0.895734,0.875945
3,GaussianNB,1000,"cdi__w2_4_0,9",OOB,0.853600,0.864099,0.779707,0.858850,0.798970,0.857634
4,GaussianNB,1000,"cdi__w2_4_0,9",UOB,0.867418,0.852961,0.764479,0.860190,0.792685,0.859246
...,...,...,...,...,...,...,...,...,...,...
295,SGDClassifier,1337,"disco__w2_5_0,9",ONSBoost,0.817838,0.792768,0.849088,0.805303,0.825663,0.787404
296,SGDClassifier,1337,"disco__w2_5_0,9",OB,0.840182,0.801980,0.884567,0.821081,0.854779,0.802838
297,SGDClassifier,1337,"disco__w2_5_0,9",OnlineBoosting,0.817117,0.794560,0.848100,0.805838,0.824475,0.788723
298,SGDClassifier,1337,"disco__w2_5_0,9",OOB,0.868140,0.846404,0.807871,0.857272,0.826707,0.854027


In [374]:
def extract_data_for(df: pd.DataFrame, base: str, stream: str, ensemble: str):
    return df[(df['base'] == base) & (df['stream'] == stream) & (df['ensemble'] == ensemble)]


def draw_boxplot_for(df: pd.DataFrame, base: str, stream: str, metric: str) -> plt.Figure:
    fig, ax = plt.subplots()
    boxplot_data = np.zeros((len(RST), len(ENSEMBLES))).T
    for i, ensemble in enumerate(ENSEMBLES):
        ensemble_scores = extract_data_for(df, base, stream, ensemble)
        ensemble_scores = ensemble_scores[metric].to_numpy()
        boxplot_data[i] = ensemble_scores
    ax.boxplot(boxplot_data.T, labels=ENSEMBLES)
    ax.set_ylabel(METRICS_NAMES[metric])
    ax.set_xlabel('zespół klasyfikatorów')
    fig.suptitle(f'{base} / {STREAMS_NAMES[stream]} / {METRICS_NAMES[metric]}')
    return fig


def draw_all_boxplots(df: pd.DataFrame):
    for base in BASE:
        for stream in STREAMS:
            for metric in METRICS:
                # fig = draw_boxplot_for(df, base, stream, metric)
                # fig.savefig(f'{base}-{STREAMS_NAMES[stream]}-{metric}.svg')
                pass


draw_all_boxplots(all_data)

In [375]:
def draw_polar_px(df: pd.DataFrame, base, stream):
    _df = df.groupby(by=['base', 'stream', 'ensemble'], group_keys=False).mean()
    _df = _df.loc[base, stream].drop(['rst'], axis=1)
    _title = f'{base} / {STREAMS_NAMES[stream]}'
    _metric_values = []
    _metric_names = []
    _ensembles = []
    for metric_name, scores in _df.to_dict().items():
        for ensemble, metric_value in list(sorted(scores.items())):
            _metric_values.append(metric_value)
            _metric_names.append(METRIC_SS[metric_name])
            _ensembles.append(ENSEMBLES_NAMES_SHORT[ensemble])
    _plot_data = pd.DataFrame({'value': _metric_values, 'variable': _metric_names, 'group': _ensembles})
    fig = px.line_polar(data_frame=_plot_data,
                        r='value',
                        theta='variable',
                        color='group',
                        line_dash='group',
                        line_close=True,
                        range_r=[.7, .9],
                        title=_title,
                        category_orders=dict(variable=list(METRIC_SS.values()),
                                             group=list(ENSEMBLES_NAMES_SHORT.values())),
                        width=WIDTH,
                        height=HEIGHT)
    fig.update_polars(radialaxis=dict(
                          tickfont=dict(
                              size=9
                          )
                      ))
    fig.update_layout(legend=dict(orientation='h',
                                  xanchor='center',
                                  x=.5,
                                  title=None),
                      title=dict(automargin=False),
                      template='gridon',
                      polar=dict(gridshape='circular'),
                      margin=dict(r=0, l=0, t=60, b=0))

    return fig


draw_polar_px(all_data, 'SGDClassifier', 'cdi__w2_4_0,9')

In [376]:
WIDTH = 380
HEIGHT = 410
def draw_all_polars(df: pd.DataFrame):
    for base in BASE:
        for stream in STREAMS:
            draw_polar_px(df, base, stream).write_image(
                f'./comparison_plots/{base}--{STREAMS_NAMES[stream]}.svg', width=WIDTH, height=HEIGHT
            )



draw_all_polars(all_data)

In [377]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

def draw_cd(plot_values, cd_value, title=None):
    limits = (1, len(plot_values))
    fig, ax = plt.subplots(figsize=(7.5, 1.8))

    plot_values_ = sorted(plot_values, key=lambda l: l[1])
    # set up plot
    ax.set_xlim(limits)
    ax.set_ylim(0, 1)
    ax.spines['top'].set_position(('axes', 0.6))
    ax.xaxis.set_ticks_position('top')
    ax.yaxis.set_visible(False)
    k_ = len(plot_values)

    for pos in ["bottom", "left", "right"]:
        ax.spines[pos].set_visible(False)
    # CD bar
    ax.plot([limits[0], limits[0] + cd_value], [.9, .9], color="k")
    ax.plot([limits[0], limits[0]], [.9 - 0.03, .9 + 0.03], color="k")
    ax.plot([limits[0] + cd_value, limits[0] + cd_value], [.9 - 0.03, .9 + 0.03], color="k")
    ax.text(limits[0] + cd_value / 2., 0.92, "CD", ha="center", va="bottom")
    if title:
        ax.text(limits[1] - 1, 0.98, title)
    #bars
    bars = []
    for i_ in range(k_):
        bar = list(range(i_, k_))
        for j_ in range(k_):
            if np.abs(plot_values_[i_][1] - plot_values_[j_][1]) >= cd_value:
                try:
                    bar.pop(bar.index(j_))
                except:
                    pass
        if len(bars) > 0:
            if bars[-1][-1] != bar[-1]:
                bars.append(bar)
        else:
            bars.append(bar)
    bars = [bar for bar in bars if bar[0] != bar[-1]]
    # bars
    for i_, bar in enumerate(bars):
        lower = plot_values_[bar[0]][1] - .05
        upper = plot_values_[bar[-1]][1] + .05
        ax.plot([lower, upper], [-i_ * .07 + 0.55, -i_ * .07 + 0.55], color="k", lw=3)
    # annotations
    kw = dict(
        xycoords='data',
        textcoords="axes fraction",
        arrowprops=dict(arrowstyle="-", connectionstyle="angle,angleA=0,angleB=90"),
        bbox=dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72),
        va="center",
        fontsize='x-small'
    )
    left_ranks = plot_values_[:len(plot_values_) // 2]
    for i_, (name, rank) in enumerate(left_ranks):
        ax.annotate(name, xy=(rank, 0.6), xytext=(0, -i_ * 0.15), ha="right", **kw)
        if rank != limits[0] and rank != limits[:-1]:
            ax.text(x = rank - .15, y = .03 + -i_ * 0.15, s = '{:.2f}'.format(rank), fontsize='x-small')
    right_ranks = plot_values_[len(plot_values_) // 2:]
    for i_, (name, rank) in enumerate(reversed(right_ranks)):
        ax.annotate(name, xy=(rank, 0.6), xytext=(1, -i_ * 0.15), ha="left", **kw)
        if rank != k_:
            ax.text(x = rank + .05, y = .03 + -i_ * 0.15, s = '{:.2f}'.format(rank), fontsize='x-small')

    return fig

In [378]:
from scikit_posthocs import posthoc_nemenyi_friedman
from scipy.stats import friedmanchisquare, rankdata

R_STATES = ['1000', '100000', '101010', '10110', '101101', '1001', '10101010', '101', '110', '1337']
# STREAMS
# ENSEMBLES
def calc_mean_scores(df: pd.DataFrame, stream: str, metric: str, base:str):
    mean_scores = np.zeros((len(RST), len(ENSEMBLES)))
    for r_i, rst in enumerate(R_STATES):
        current_case = df[
            (df['stream'] == stream)
            & (df['rst'] == rst)
            & (df['base'] == base)
        ]
        for c_i, clf in enumerate(ENSEMBLES):
            clf_scores = current_case[current_case['ensemble'] == clf][metric].mean()
            mean_scores[r_i, c_i] = clf_scores
    return mean_scores

def calc_ranks(mean_scores: np.ndarray):
    ranks = np.array([rankdata(ms) for ms in mean_scores])
    return ranks, np.mean(ranks, axis=0)

means = calc_mean_scores(all_data, STREAMS[0], METRICS[0], BASE[0])
friedmanchisquare(*means)
Q_ALPHA = 2.728
means

array([[0.89220928, 0.81174488, 0.88598689, 0.85360009, 0.86741798],
       [0.81627   , 0.73662692, 0.80935364, 0.82217661, 0.85217711],
       [0.83323729, 0.80756067, 0.81278445, 0.88384407, 0.90719633],
       [0.75289463, 0.65862031, 0.70691293, 0.7975044 , 0.84974567],
       [0.78541939, 0.75900737, 0.75115331, 0.83383379, 0.85749442],
       [0.77502284, 0.73547966, 0.7866314 , 0.7848435 , 0.80995393],
       [0.84188503, 0.81373914, 0.84636307, 0.84348228, 0.85407718],
       [0.84919609, 0.88368621, 0.85152859, 0.94635035, 0.95498326],
       [0.8390158 , 0.87254705, 0.8497543 , 0.89439223, 0.90205274],
       [0.74943952, 0.68491093, 0.72861051, 0.77476096, 0.80508106]])

In [379]:
plt.ioff()
draw_cd(list(zip(ENSEMBLES_NAMES_SHORT, calc_ranks(means)[1])), Q_ALPHA * np.sqrt(len(ENSEMBLES) * (len(ENSEMBLES) + 1) / 6 / len(RST)), 'GMean / DISCO1')
plt.ion()
# draw_cd()

<contextlib.ExitStack at 0x28b7fe931d0>

In [380]:
def draw_all_cd(df: pd.DataFrame, ensembles: list[str]):
    # q_alpha taken from: https://plos.figshare.com/articles/dataset/Critical_values_for_the_two-tailed_Nemenyi_test_after_the_Friedman_test_/5434030/1
    _Q_ALPHA = 2.728
    _cd_value = Q_ALPHA * np.sqrt(len(ENSEMBLES) * (len(ENSEMBLES) + 1) / 6 / len(RST))
    _ALPHA = 0.05

    for stream in STREAMS:
        for metric in METRICS:
            for base in BASE:
                _means = calc_mean_scores(df, stream, metric, base)
                chi, _p = friedmanchisquare(*_means)
                if not np.isnan(_p) and (_p < _ALPHA or chi > Q_ALPHA):
                    _ranks, _mean_ranks = calc_ranks(_means)
                    plt.ioff()
                    fig = draw_cd(list(zip(ensembles, len(ensembles) + 1 - _mean_ranks)),
                                  _cd_value,
                                  f'{BASE_SHORT[base]} {STREAMS_NAMES[stream]}/{METRIC_SS[metric]}')
                    fig.savefig(f'./comparison_plots/cd/cd-{STREAMS_NAMES[stream]}-{METRIC_SS[metric]}-{BASE_SHORT[base]}.svg', bbox_inches=None)
                else:
                    print(f'no difference for {stream}, {metric} and {base}')

def calc_friedmann(df: pd.DataFrame):
    # q_alpha taken from: https://plos.figshare.com/articles/dataset/Critical_values_for_the_two-tailed_Nemenyi_test_after_the_Friedman_test_/5434030/1
    _Q_ALPHA = 2.728
    _cd_value = Q_ALPHA * np.sqrt(len(ENSEMBLES) * (len(ENSEMBLES) + 1) / 6 / len(RST))
    _ALPHA = 0.05
    acc = { stream: { metric: 0 for metric in METRIC_SS.values() } for stream in STREAMS_NAMES.values() }
    result = np.full((len(METRICS), len(STREAMS) * len(BASE)), dtype='U22', fill_value='')
    for i, stream in enumerate(STREAMS):
        for j, metric in enumerate(METRICS):
            for k, base in enumerate(BASE):
                _means = calc_mean_scores(df, stream, metric, base)
                pval = friedmanchisquare(*_means)[1]
                ct = 0
                if np.isnan(pval):
                    ct_ = '\multicolumn{1}{c}{-}'
                else:
                    while pval < 1:
                        pval *= 10
                        ct -= 1
                    ct_ = f'${pval:.3f} \\cdot 10^' + '{' + str(ct) + '}$'
                acc[STREAMS_NAMES[stream]][METRIC_SS[metric]] = ct_
                result[j, k * len(STREAMS) + i] = ct_
    return acc, result
p, ppp = calc_friedmann(all_data)
# pp = pd.DataFrame.from_dict(p).to_latex()
ptbl = np.concatenate([np.array([list(METRIC_SS.values())]).T, ppp], axis=1)
header = '\\toprule \n & ' + ' & '.join(['\\multicolumn{3}{c}{' + base + '}' for base in BASE])
sheader = ' & '.join(
    [*['\\multicolumn{1}{c}{' + STREAMS_NAMES[stream] + '}' for stream in STREAMS[:-1]],
]
)
sheader += ' & ' +\
           sheader +\
           '\\multicolumn{1}{c|}{' +\
           STREAMS_NAMES[STREAMS[-1]] + '}' +\
           sheader +\
           '\\\\ \n \\midrule'
table_content = ' \\\\ \n'.join([' & '.join(r) for r in ptbl]) + '\\\\ \n\\bottomrule'
print(header)
print(sheader)
print(table_content)
# ppp

\toprule 
 & \multicolumn{3}{c}{GaussianNB} & \multicolumn{3}{c}{SGDClassifier}
\multicolumn{1}{c}{CDI1} & \multicolumn{1}{c}{CDI2} & \multicolumn{1}{c}{CDI1} & \multicolumn{1}{c}{CDI2}\multicolumn{1}{c|}{DISCO1}\multicolumn{1}{c}{CDI1} & \multicolumn{1}{c}{CDI2}\\ 
 \midrule
TPR & $6.990 \cdot 10^{-6}$ & $1.762 \cdot 10^{-5}$ & $1.312 \cdot 10^{-4}$ & $3.012 \cdot 10^{-6}$ & $3.489 \cdot 10^{-6}$ & $8.763 \cdot 10^{-5}$ \\ 
TNR & $4.191 \cdot 10^{-2}$ & $6.902 \cdot 10^{-3}$ & $3.890 \cdot 10^{-3}$ & $6.740 \cdot 10^{-6}$ & $4.852 \cdot 10^{-6}$ & $1.610 \cdot 10^{-5}$ \\ 
PPV & \multicolumn{1}{c}{-} & $6.517 \cdot 10^{-4}$ & $4.716 \cdot 10^{-5}$ & \multicolumn{1}{c}{-} & $2.460 \cdot 10^{-6}$ & \multicolumn{1}{c}{-} \\ 
BAC & $6.618 \cdot 10^{-6}$ & $7.249 \cdot 10^{-6}$ & $4.551 \cdot 10^{-5}$ & $1.070 \cdot 10^{-6}$ & $9.938 \cdot 10^{-7}$ & $2.202 \cdot 10^{-6}$ \\ 
F1 & $8.696 \cdot 10^{-6}$ & $1.024 \cdot 10^{-5}$ & $1.121 \cdot 10^{-5}$ & $1.551 \cdot 10^{-6}$ & $1.153 \cdot 1

In [381]:
plt.ioff()
draw_all_cd(all_data, list(ENSEMBLES_NAMES_SHORT.values()))
plt.ion()

no difference for cdi__w2_4_0,9, precision and GaussianNB
no difference for cdi__w2_4_0,9, precision and SGDClassifier
no difference for disco__w2_5_0,9, precision and SGDClassifier


<contextlib.ExitStack at 0x28b0dc32350>

In [382]:
def prepare_section():
    header = '\\begin{figure}[h]\n' \
                '\t\\centering\n'
    i = 0
    body = ''
    for base in BASE:
        for s in STREAMS:
            for m in METRICS:
                current = '\t\\subfloat' + f'[{base} dla {METRICS_LATEX[m]} i {STREAMS_LATEX[s]}]' + '{\n' \
                '\t\t\\includesvg[width=.45\\textwidth]{imgs/cd-' + f'{STREAMS_NAMES[s]}-{METRIC_SS[m]}-{BASE_SHORT[base]}.svg' + '}\n' \
                '\t}'
                if i%2 == 1:
                    current += '\\hspace{0pt}'
                i+= 1
                current += '\n'
                body += current
                current = ''
    print(header)
    print(body)

prepare_section()

\begin{figure}[h]
	\centering

	\subfloat[GaussianNB dla TPR i CDI\textsubscript{1}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI1-TPR-GNB.svg}
	}
	\subfloat[GaussianNB dla TNR i CDI\textsubscript{1}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI1-TNR-GNB.svg}
	}\hspace{0pt}
	\subfloat[GaussianNB dla PPV i CDI\textsubscript{1}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI1-PPV-GNB.svg}
	}
	\subfloat[GaussianNB dla BAC i CDI\textsubscript{1}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI1-BAC-GNB.svg}
	}\hspace{0pt}
	\subfloat[GaussianNB dla F\textsubscript{1} i CDI\textsubscript{1}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI1-F1-GNB.svg}
	}
	\subfloat[GaussianNB dla G\textsubscipt{mean} i CDI\textsubscript{1}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI1-Gmean-GNB.svg}
	}\hspace{0pt}
	\subfloat[GaussianNB dla TPR i CDI\textsubscript{2}]{
		\includesvg[width=.45\textwidth]{imgs/cd-CDI2-TPR-GNB.svg}
	}
	\subfloat[GaussianNB dla TNR i CDI\textsubscript{2}]{
		\includesv

In [383]:
p = all_data.drop('rst', axis=1).set_index('stream').rename(columns=METRICS_LATEX, index=STREAMS_NAMES).reset_index()\
    .groupby(by=['base', 'stream', 'ensemble']).mean().to_latex(float_format="%.3f")
print(p)

\begin{tabular}{lllrrrrrr}
\toprule
 &  &  & TPR & TNR & PPV & BAC & F\textsubscript{1} & G\textsubscipt{mean} \\
base & stream & ensemble &  &  &  &  &  &  \\
\midrule
\multirow[t]{15}{*}{GaussianNB} & \multirow[t]{5}{*}{CDI1} & OB & 0.776 & 0.890 & 0.833 & 0.833 & 0.798 & 0.825 \\
 &  & ONSBoost & 0.813 & 0.836 & 0.852 & 0.825 & 0.824 & 0.804 \\
 &  & OOB & 0.843 & 0.853 & 0.777 & 0.848 & 0.794 & 0.846 \\
 &  & OnlineBoosting & 0.803 & 0.852 & 0.850 & 0.828 & 0.820 & 0.810 \\
 &  & UOB & 0.866 & 0.829 & 0.748 & 0.847 & 0.780 & 0.845 \\
\cline{2-9}
 & \multirow[t]{5}{*}{CDI2} & OB & 0.804 & 0.889 & 0.845 & 0.847 & 0.819 & 0.842 \\
 &  & ONSBoost & 0.837 & 0.850 & 0.860 & 0.844 & 0.845 & 0.833 \\
 &  & OOB & 0.856 & 0.853 & 0.797 & 0.855 & 0.815 & 0.853 \\
 &  & OnlineBoosting & 0.822 & 0.869 & 0.864 & 0.845 & 0.838 & 0.835 \\
 &  & UOB & 0.872 & 0.838 & 0.780 & 0.855 & 0.809 & 0.854 \\
\cline{2-9}
 & \multirow[t]{5}{*}{DISCO1} & OB & 0.722 & 0.867 & 0.802 & 0.795 & 0.750 & 0.782 \\
 &

In [394]:
def calc_table_comparison(df: pd.DataFrame, metric: str):
    # q_alpha taken from: https://plos.figshare.com/articles/dataset/Critical_values_for_the_two-tailed_Nemenyi_test_after_the_Friedman_test_/5434030/1
    _Q_ALPHA = 2.728
    _cd_value = Q_ALPHA * np.sqrt(len(ENSEMBLES) * (len(ENSEMBLES) + 1) / 6 / len(RST))
    _ALPHA = 0.05
    a = []
    for n, ens in enumerate(ENSEMBLES):
        for i, stream in enumerate(STREAMS):
            for k, base in enumerate(BASE):
                tmp = dict(stream=STREAMS_LATEX[stream],
                           base=base,
                           ensemble=ENAME_RANKS[ens],
                           metric=METRICS_LATEX[metric])
                _means = calc_mean_scores(df, stream, metric, base)
                ph = posthoc_nemenyi_friedman(_means).to_numpy()
                ph_sig = ph <= _ALPHA
                all_mean = _means.mean(axis=0)
                better_than = []
                for o in range(len(ENSEMBLES)):
                    if ph_sig[n, o] and all_mean[n] > all_mean[o]:
                        better_than.append(o+1)
                if better_than:
                    better_s = ', '.join([str(v) for v in np.unique(better_than)])
                else:
                    better_s = '-'
                val = '-' if np.isnan(all_mean[n]) else f'{all_mean[n]:.3f}'
                tmp['value'] = '$\\underset{' + better_s + '}{' + val + '}$'

                a.append(tmp)
    result = pd.DataFrame(a).set_index(keys=['metric', 'stream', 'base', 'ensemble'])
    return result

pdd = calc_table_comparison(all_data, METRICS[0])
for metric in METRICS[1:]:
    pdd = pd.concat([pdd, calc_table_comparison(all_data, metric)])


In [395]:
s = pdd.reset_index()
s

Unnamed: 0,metric,stream,base,ensemble,value
0,TPR,CDI\textsubscript{1},GaussianNB,ONS\textsubscript{1},$\underset{-}{0.813}$
1,TPR,CDI\textsubscript{1},SGDClassifier,ONS\textsubscript{1},$\underset{-}{0.790}$
2,TPR,CDI\textsubscript{2},GaussianNB,ONS\textsubscript{1},$\underset{-}{0.837}$
3,TPR,CDI\textsubscript{2},SGDClassifier,ONS\textsubscript{1},$\underset{-}{0.814}$
4,TPR,DISCO\textsubscript{1},GaussianNB,ONS\textsubscript{1},$\underset{2}{0.820}$
...,...,...,...,...,...
175,G\textsubscipt{mean},CDI\textsubscript{1},SGDClassifier,UOB\textsubscript{5},"$\underset{1, 2, 3}{0.847}$"
176,G\textsubscipt{mean},CDI\textsubscript{2},GaussianNB,UOB\textsubscript{5},$\underset{1}{0.854}$
177,G\textsubscipt{mean},CDI\textsubscript{2},SGDClassifier,UOB\textsubscript{5},"$\underset{1, 2, 3}{0.858}$"
178,G\textsubscipt{mean},DISCO\textsubscript{1},GaussianNB,UOB\textsubscript{5},"$\underset{1, 2, 3, 4}{0.828}$"


In [398]:
def method_name(_y):
    if _y in list(STREAMS_LATEX.values()):
        return list(STREAMS_LATEX.values()).index(_y)
    elif _y in list(METRICS_LATEX.values()):
        return list(METRICS_LATEX.values()).index(_y)
    else:
        return _y


v = s.reset_index()\
    .set_index('stream')\
    .sort_values(by=['base', 'ensemble'])\
    .reset_index()\
    .pivot(index=['metric', 'stream'], columns=['base', 'ensemble'], values='value')\
    .sort_index(key=lambda _x: _x.map(lambda _y: method_name(_y)))\
    .sort_index(axis=1, key=lambda _x: _x.map(lambda _y: BASE.index(_y) if _y in BASE else ENAME_RANKS_ORDER.index(_y)))\
    .to_latex(multicolumn_format='c', )
print(v)
v

\begin{tabular}{llllllllllll}
\toprule
 & base & \multicolumn{5}{c}{GaussianNB} & \multicolumn{5}{c}{SGDClassifier} \\
 & ensemble & ONS\textsubscript{1} & OBa\textsubscript{2} & OBo\textsubscript{3} & OOB\textsubscript{4} & UOB\textsubscript{5} & ONS\textsubscript{1} & OBa\textsubscript{2} & OBo\textsubscript{3} & OOB\textsubscript{4} & UOB\textsubscript{5} \\
metric & stream &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{3}{*}{TPR} & CDI\textsubscript{1} & $\underset{-}{0.813}$ & $\underset{-}{0.776}$ & $\underset{-}{0.803}$ & $\underset{2}{0.843}$ & $\underset{1, 2, 3}{0.866}$ & $\underset{-}{0.790}$ & $\underset{-}{0.801}$ & $\underset{-}{0.783}$ & $\underset{1, 3}{0.846}$ & $\underset{1, 2, 3}{0.851}$ \\
 & CDI\textsubscript{2} & $\underset{-}{0.837}$ & $\underset{-}{0.804}$ & $\underset{-}{0.822}$ & $\underset{-}{0.856}$ & $\underset{2, 3}{0.872}$ & $\underset{-}{0.814}$ & $\underset{-}{0.826}$ & $\underset{-}{0.804}$ & $\underset{3}{0.858}$ & $\underset{1, 2, 3}{0.861}$ 

'\\begin{tabular}{llllllllllll}\n\\toprule\n & base & \\multicolumn{5}{c}{GaussianNB} & \\multicolumn{5}{c}{SGDClassifier} \\\\\n & ensemble & ONS\\textsubscript{1} & OBa\\textsubscript{2} & OBo\\textsubscript{3} & OOB\\textsubscript{4} & UOB\\textsubscript{5} & ONS\\textsubscript{1} & OBa\\textsubscript{2} & OBo\\textsubscript{3} & OOB\\textsubscript{4} & UOB\\textsubscript{5} \\\\\nmetric & stream &  &  &  &  &  &  &  &  &  &  \\\\\n\\midrule\n\\multirow[t]{3}{*}{TPR} & CDI\\textsubscript{1} & $\\underset{-}{0.813}$ & $\\underset{-}{0.776}$ & $\\underset{-}{0.803}$ & $\\underset{2}{0.843}$ & $\\underset{1, 2, 3}{0.866}$ & $\\underset{-}{0.790}$ & $\\underset{-}{0.801}$ & $\\underset{-}{0.783}$ & $\\underset{1, 3}{0.846}$ & $\\underset{1, 2, 3}{0.851}$ \\\\\n & CDI\\textsubscript{2} & $\\underset{-}{0.837}$ & $\\underset{-}{0.804}$ & $\\underset{-}{0.822}$ & $\\underset{-}{0.856}$ & $\\underset{2, 3}{0.872}$ & $\\underset{-}{0.814}$ & $\\underset{-}{0.826}$ & $\\underset{-}{0.804}$ & 

In [387]:
pdd

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,value
metric,stream,base,ensemble,Unnamed: 4_level_1
TPR,CDI\textsubscript{1},GaussianNB,ONS\textsubscript{1},$\underset{-}{0.813}$
TPR,CDI\textsubscript{1},SGDClassifier,ONS\textsubscript{1},$\underset{-}{0.790}$
TPR,CDI\textsubscript{2},GaussianNB,ONS\textsubscript{1},$\underset{-}{0.837}$
TPR,CDI\textsubscript{2},SGDClassifier,ONS\textsubscript{1},$\underset{-}{0.814}$
TPR,DISCO\textsubscript{1},GaussianNB,ONS\textsubscript{1},$\underset{2}{0.820}$
...,...,...,...,...
G\textsubscipt{mean},CDI\textsubscript{1},SGDClassifier,UOB\textsubscript{5},"$\underset{1, 2, 3}{0.847}$"
G\textsubscipt{mean},CDI\textsubscript{2},GaussianNB,UOB\textsubscript{5},$\underset{1}{0.854}$
G\textsubscipt{mean},CDI\textsubscript{2},SGDClassifier,UOB\textsubscript{5},"$\underset{1, 2, 3}{0.858}$"
G\textsubscipt{mean},DISCO\textsubscript{1},GaussianNB,UOB\textsubscript{5},"$\underset{1, 2, 3, 4}{0.828}$"


In [388]:
x = np.array([
    [False, True, True],
    [True, False, False],
    [True, False, False]
])
sc = np.array([
    [0, .5, .3],
    []
])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.