In [67]:
import itertools
import os

import numpy as np
from scipy.stats import ttest_ind

In [68]:

# Presumably the directory holding the DDI score files.
# NOTE(review): not referenced by any cell visible here; process_data_for_clf
# uses a different hard-coded directory (D:\projects\...) - confirm which is current.
DDI_SCORES_PATH = "D:\\JavaTraining\\mgr\\ensemble_size\\ddi"
# NOTE(review): same values as ENSEMBLE_SIZE defined further down; not used in
# the visible cells - presumably a leftover of an earlier table layout.
columns = [3, 5, 10, 20, 30, 40, 50, 75, 100]

In [69]:
# Significance level: p-values <= ALPHA are flagged as significant by the
# t-test table helpers and coloured green by the LaTeX cell formatter.
ALPHA = .1


def calculate_tables(_data, _alpha):
    """Run an independent two-sample t-test between every ordered pair of samples.

    Parameters
    ----------
    _data : sequence of 1-D samples
        Either a 2-D array with one sample per row, or any sequence (list,
        tuple, ...) of 1-D array-likes.  Samples may differ in length.
    _alpha : float
        Significance threshold applied to the p-values.

    Returns
    -------
    _tt : numpy.ndarray, shape (n, n)
        ``_tt[i, j]`` is the t statistic of sample ``i`` against sample ``j``.
    _p_val : numpy.ndarray, shape (n, n)
        The matching two-sided p-values.
    _alpha_p : numpy.ndarray of bool, shape (n, n)
        ``True`` where ``_p_val <= _alpha``.
    """
    # len() instead of .shape[0] so plain lists of samples are accepted too.
    _size = len(_data)
    _table_shape = (_size, _size)
    _tt = np.zeros(_table_shape)
    _p_val = np.zeros(_table_shape)
    for _i in range(_size):
        for _j in range(_size):
            _tt[_i, _j], _p_val[_i, _j] = ttest_ind(_data[_i], _data[_j])
    # The significance mask is derived in one step; no pre-allocation needed.
    _alpha_p = _p_val <= _alpha
    return _tt, _p_val, _alpha_p


In [70]:
def join(_x, _delim=''):
    """Join the string form of every element of ``_x`` using ``_delim``.

    Works for any iterable (list, tuple, numpy array, ...); each element is
    converted with ``str`` before joining, so non-string items are fine.

    Parameters
    ----------
    _x : iterable
        Items to concatenate.
    _delim : str, optional
        Separator placed between consecutive items (default: empty string).

    Returns
    -------
    str
        The joined text; an empty string for an empty iterable.
    """
    # The previous ``_x is np.ndarray`` test compared against the class object
    # itself and was never true for an actual array, so arrays always took the
    # generic path.  That path is the only one kept.
    return _delim.join(str(_item) for _item in _x)


def print_formatted_tbl(tbl, row_headers=None, conditional_fmt=None, col_headers=None):
    """Build the rows of a LaTeX table from a 2-D array and return them as text.

    Despite the name nothing is printed; the caller decides what to do with
    the returned string.

    Parameters
    ----------
    tbl : 2-D array
        Values to render; ``tbl.shape`` gives the number of rows and columns.
    row_headers : sequence, optional
        One label per row, emitted as the first cell of that row.
    conditional_fmt : list of callables, optional
        Applied element-wise (via ``np.vectorize``) to every row, in order;
        each formatter receives the output of the previous one.
    col_headers : sequence, optional
        Labels for the header line.  When row headers are present as well,
        the header line starts with an empty multicolumn corner cell.

    Returns
    -------
    str
        The concatenated header line (if any) and one line per table row.
    """
    _n_rows, _n_cols = tbl.shape[0], tbl.shape[1]
    _formatters = [] if conditional_fmt is None else conditional_fmt
    _has_row_headers = row_headers is not None

    # Collect the pieces and concatenate once at the end.
    _chunks = []
    if col_headers is not None:
        if _has_row_headers:
            # Leave the top-left corner cell empty above the row labels.
            _chunks.append('\\cline{2-' + str(_n_cols + 1) + '} \n \\multicolumn{1}{c|}{ } & ')
        else:
            _chunks.append('\\hline \\\\ \n')
        _chunks.append(join(col_headers, ' & ') + ' \\\\ \\hline \n')

    for _r in range(_n_rows):
        if _has_row_headers:
            _chunks.append(str(row_headers[_r]) + ' & ')
        # Work on a copy so the caller's table is never modified.
        _cells = tbl[_r].copy()
        for _formatter in _formatters:
            _cells = np.vectorize(_formatter)(_cells)
        _chunks.append(join(_cells, ' & ') + ' \\\\ \\hline \n')

    return ''.join(_chunks)


def _highlight_p_value(x):
    """Wrap a value, shown with three decimals, in a LaTeX cell colour.

    Values ``<= ALPHA`` get a green background, everything else a red one.
    """
    if x <= ALPHA:
        _colour_prefix = '\\cellcolor{green!25}{'
    else:
        _colour_prefix = '\\cellcolor{red!25}{'
    return _colour_prefix + '{:.3f}'.format(x) + '}'


# Cell formatters handed to print_formatted_tbl via ``conditional_fmt``.
formatters = [_highlight_p_value]


In [71]:
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import os

# NOTE(review): not referenced in the visible cells; presumably the seeds of
# the runs that produced the score files - confirm against the generating code.
RANDOM_STATES = [1000, 100000, 101010,
                 10110, 101101, 1001,
                 10101010, 101, 110, 1337]
# Ensemble sizes under comparison: used as the keys of the per-size score
# dictionary and as the row/column order of the pairwise t-test tables.
ENSEMBLE_SIZE = [3, 5, 10, 20, 30, 40, 50, 75, 100]
# NOTE(review): not referenced in the visible cells; the calls below
# instantiate GaussianNB / MLPClassifier directly.
CLFS = [GaussianNB, MLPClassifier]

def process_data_for_clf(_clf, _dir='D:\\projects\\mgr\\ensemble_size\\ddi\\', _sizes=None):
    """Collect per-ensemble-size mean scores from the score files of one classifier.

    Every regular file in ``_dir`` whose name starts with ``str(_clf)`` is
    loaded with ``np.load``.  The second ``_``-separated token of the file
    name is read as the ensemble size, and the row means of the loaded array
    (``np.mean(..., axis=1)``) are appended to that size's sample.

    Parameters
    ----------
    _clf : object
        Only ``str(_clf)`` is used, as the required file-name prefix.
        NOTE(review): for scikit-learn estimators this is the estimator repr,
        which depends on the scikit-learn version and print settings - confirm
        it matches how the files were named.
    _dir : str, optional
        Directory to scan.  Defaults to the location originally hard-coded.
    _sizes : iterable of int, optional
        Ensemble sizes to collect.  Defaults to the module-level
        ``ENSEMBLE_SIZE``.

    Returns
    -------
    dict
        Maps each ensemble size to a 1-D float array of mean scores; sizes
        without matching files map to an empty array.

    Raises
    ------
    KeyError
        If a matching file names an ensemble size that is not in ``_sizes``.
    IndexError, ValueError
        If a matching file name has no integer second ``_``-separated token.
    """
    if _sizes is None:
        _sizes = ENSEMBLE_SIZE
    _prefix = str(_clf)
    _result = {n: np.array([]) for n in _sizes}
    # os.listdir gives no ordering guarantee; sorting keeps the sample order
    # (and therefore floating-point results downstream) reproducible.
    for file in sorted(os.listdir(_dir)):
        if not file.startswith(_prefix):
            continue
        filepath = os.path.join(_dir, file)
        if not os.path.isfile(filepath):
            continue
        _data = np.load(filepath)
        # A plain int is enough for a dictionary key.
        _n = int(file.split('_')[1])
        _result[_n] = np.append(_result[_n], np.mean(_data, axis=1))
    return _result

def calculate_tables_more_data(_data, _alpha, _keys=None):
    """Pairwise independent t-tests between the samples stored under ``_keys``.

    Parameters
    ----------
    _data : mapping
        Maps each key to a 1-D sample of scores.
    _alpha : float
        Significance threshold applied to the p-values.
    _keys : iterable, optional
        Keys to compare, in row/column order.  Defaults to the module-level
        ``ENSEMBLE_SIZE``.

    Returns
    -------
    _tt : numpy.ndarray, shape (n, n)
        ``_tt[i, j]`` is the t statistic of ``_data[keys[i]]`` against
        ``_data[keys[j]]``.
    _p_val : numpy.ndarray, shape (n, n)
        The matching two-sided p-values.
    _alpha_p : numpy.ndarray of bool, shape (n, n)
        ``True`` where ``_p_val <= _alpha``.
    """
    if _keys is None:
        _keys = ENSEMBLE_SIZE
    _keys = list(_keys)  # allow any iterable and iterate it twice safely
    _size = len(_keys)
    _table_shape = (_size, _size)
    _tt = np.zeros(_table_shape)
    _p_val = np.zeros(_table_shape)
    for _i, _row_key in enumerate(_keys):
        for _j, _col_key in enumerate(_keys):
            _tt[_i, _j], _p_val[_i, _j] = ttest_ind(_data[_row_key], _data[_col_key])
    # The significance mask is derived in one step; no pre-allocation needed.
    _alpha_p = _p_val <= _alpha
    return _tt, _p_val, _alpha_p

# Pairwise t-test tables (t statistic, p-value, significance mask) for each
# classifier. The estimator instances are only used for their str() form,
# which process_data_for_clf matches against file-name prefixes.
# Running this cell reads the score files from disk.
tt_gnb, p_val_gnb, alpha_p_gnb = calculate_tables_more_data(process_data_for_clf(GaussianNB()), ALPHA)
tt_mlp, p_val_mlp, alpha_p_mlp = calculate_tables_more_data(process_data_for_clf(MLPClassifier()), ALPHA)

In [72]:
# Render the GaussianNB p-value matrix as LaTeX table rows, labelled by
# ensemble size on both axes and colour-coded by the formatters, then print
# the text so it can be copied from the cell output.
formatted_tbl_gnb = print_formatted_tbl(
    p_val_gnb,
    row_headers=ENSEMBLE_SIZE,
    col_headers=ENSEMBLE_SIZE,
    conditional_fmt=formatters)
print(formatted_tbl_gnb)

\cline{2-10} 
 \multicolumn{1}{c|}{ } & 3 & 5 & 10 & 20 & 30 & 40 & 50 & 75 & 100 \\ \hline 
3 & \cellcolor{red!25}{1.000} & \cellcolor{red!25}{0.747} & \cellcolor{red!25}{0.951} & \cellcolor{red!25}{0.860} & \cellcolor{red!25}{0.518} & \cellcolor{red!25}{0.302} & \cellcolor{red!25}{0.223} & \cellcolor{red!25}{0.137} & \cellcolor{green!25}{0.092} \\ \hline 
5 & \cellcolor{red!25}{0.747} & \cellcolor{red!25}{1.000} & \cellcolor{red!25}{0.785} & \cellcolor{red!25}{0.592} & \cellcolor{red!25}{0.306} & \cellcolor{red!25}{0.163} & \cellcolor{red!25}{0.113} & \cellcolor{green!25}{0.063} & \cellcolor{green!25}{0.043} \\ \hline 
10 & \cellcolor{red!25}{0.951} & \cellcolor{red!25}{0.785} & \cellcolor{red!25}{1.000} & \cellcolor{red!25}{0.802} & \cellcolor{red!25}{0.460} & \cellcolor{red!25}{0.259} & \cellcolor{red!25}{0.186} & \cellcolor{red!25}{0.110} & \cellcolor{green!25}{0.074} \\ \hline 
20 & \cellcolor{red!25}{0.860} & \cellcolor{red!25}{0.592} & \cellcolor{red!25}{0.802} & \cellcolor{red

In [73]:
# Render the MLPClassifier p-value matrix as LaTeX table rows, labelled by
# ensemble size on both axes and colour-coded by the formatters, then print
# the text so it can be copied from the cell output.
formatted_tbl_mlp = print_formatted_tbl(
    p_val_mlp,
    row_headers=ENSEMBLE_SIZE,
    col_headers=ENSEMBLE_SIZE,
    conditional_fmt=formatters)
print(formatted_tbl_mlp)

\cline{2-10} 
 \multicolumn{1}{c|}{ } & 3 & 5 & 10 & 20 & 30 & 40 & 50 & 75 & 100 \\ \hline 
3 & \cellcolor{red!25}{1.000} & \cellcolor{red!25}{0.358} & \cellcolor{red!25}{0.184} & \cellcolor{red!25}{0.102} & \cellcolor{green!25}{0.063} & \cellcolor{green!25}{0.056} & \cellcolor{green!25}{0.053} & \cellcolor{green!25}{0.060} & \cellcolor{green!25}{0.044} \\ \hline 
5 & \cellcolor{red!25}{0.358} & \cellcolor{red!25}{1.000} & \cellcolor{red!25}{0.666} & \cellcolor{red!25}{0.433} & \cellcolor{red!25}{0.302} & \cellcolor{red!25}{0.285} & \cellcolor{red!25}{0.269} & \cellcolor{red!25}{0.296} & \cellcolor{red!25}{0.234} \\ \hline 
10 & \cellcolor{red!25}{0.184} & \cellcolor{red!25}{0.666} & \cellcolor{red!25}{1.000} & \cellcolor{red!25}{0.713} & \cellcolor{red!25}{0.533} & \cellcolor{red!25}{0.514} & \cellcolor{red!25}{0.490} & \cellcolor{red!25}{0.527} & \cellcolor{red!25}{0.437} \\ \hline 
20 & \cellcolor{red!25}{0.102} & \cellcolor{red!25}{0.433} & \cellcolor{red!25}{0.713} & \cellcolor{r