In [1]:
%load_ext autoreload
%autoreload 2
from datetime import datetime

from deltas.pipeline import data, classifier, evaluation
from deltas.model import base, downsample
import deltas.plotting.plots as plots
from tqdm import tqdm

import numpy as np
import pandas as pd
# np.random.seed(10)


In [2]:

N1 = 1000
N2 = 10
m = 1
v = 1
costs = (1, 1)  # change for (1, 10) to increase results
# Gaussian (not always seperable)
dfs = []
len_required = 10
for i in tqdm(range(len_required*10)):
    data_clf = data.get_data(
        m1=[-m, -m],
        m2=[m, m],
        cov1=[[v, 0], [0, v]],
        cov2=[[v, 0], [0, v]],
        N1=N1,
        N2=N2,
        scale=False,
        test_nums=[10000, 10000],
    )

    model = 'SVM-linear'
    model = 'SVM-rbf'
    # model = 'Linear'
    model = 'MLP'
    # model = 'MLP-Gaussian'

    classifiers_dict = classifier.get_classifier(
        data_clf=data_clf,
        model=model,
        _plot=False)
    data_clf['clf'] = classifiers_dict['Baseline']
    X = data_clf['data']['X']
    y = data_clf['data']['y']
    clf = data_clf['clf']
    # deltas_model = downsample.downsample_deltas(
    #     clf).fit(X, y, _print=True, _plot=True, max_trials=10000)
    # deltas_model = base.base_deltas(
    #     clf).fit(X, y, grid_search=True, _print=True, _plot=True)
    deltas_model = downsample.downsample_deltas(clf).fit(X, y,
                                                         alpha=1,
                                                         _print=False,
                                                         _plot=False,
                                                         method='supports-prop-update_mean',
                                                         max_trials=10000,
                                                         parallel=True)

    if deltas_model.is_fit == True:
        classifiers_dict['Our Method'] = deltas_model
        scores_df = evaluation.eval_test(classifiers_dict,
                                         data_clf['data_test'], _print=False, _plot=False)
        dfs.append(scores_df)
    else:
        print('not fit deltas')
    if len(dfs) == len_required:
        break

  9%|▉         | 9/100 [02:35<26:14, 17.30s/it]  


In [3]:
print(len(dfs))

10


In [4]:
df = pd.concat(dfs, axis=0)
mean = {}
std = {}
index = df.index.unique().to_list()
cols = df.columns.to_list()
for method in index:
    mean[method] = df.loc[method].mean().to_list()
    std[method] = df.loc[method].std().to_list()

mean_df = pd.DataFrame.from_dict(mean, orient='index', columns=cols)
print(mean_df)
std_df = pd.DataFrame.from_dict(std, orient='index', columns=cols)

m = mean_df.to_dict('list')
s = std_df.to_dict('list')
metrics = mean_df.columns.to_list()
methods = mean_df.index.to_list()
sf = 5
for metric in metrics:
    means = m[metric]
    sts = s[metric]
    mx = np.argmax(means)
    for i in range(len(means)):
        m_str = str(means[i])[1:sf]
        if i == mx:
            m_str = f"\\textbf{{{m_str}}}"
        s_str = str(sts[i])[1:sf-1]
        m[metric][i] = f'${m_str} \\pm {s_str}$'

method_map = {
    'Baseline': 'Baseline',
    'SMOTE': "SMOTE \cite{Chawla_2002_JAIR}",
    'Balanced Weights': 'BW',
    'BMR': 'BMR \cite{Bahnsen_2014_SIAM}',
    'Threshold': 'Thresh \cite{Sheng_2006_AAAI}',
    'Our Method': 'Our Method',
}
meths_new = []
for me in methods:
    meths_new.append(method_map[me])
m['Methods'] = meths_new
df = pd.DataFrame(m)  # .set_index('Methods')
meths = df.pop('Methods')
df.insert(0, 'Methods', meths)
latex_str = df.to_latex(index=False)

print(f'\n% Gaussian - {len_required}')
print('\\begin{tabular}{@{}lrrrr@{}}'+latex_str[22:])

                  Accuracy    G-Mean   ROC-AUC        F1
Baseline          0.501850  0.032655  0.974174  0.007300
SMOTE             0.905435  0.903718  0.975846  0.900115
Balanced Weights  0.626035  0.400038  0.943593  0.324821
BMR               0.879305  0.872430  0.974174  0.865805
Threshold         0.879305  0.872430  0.974174  0.865805
Our Method        0.909740  0.908789  0.974174  0.907456

% Gaussian - 10
\begin{tabular}{@{}lrrrr@{}}
\toprule
Methods & Accuracy & G-Mean & ROC-AUC & F1 \\
\midrule
Baseline & $.501 \pm .00$ & $.032 \pm .05$ & $.974 \pm .00$ & $.007 \pm .01$ \\
SMOTE \cite{Chawla_2002_JAIR} & $.905 \pm .01$ & $.903 \pm .01$ & $\textbf{.975} \pm .00$ & $.900 \pm .01$ \\
BW & $.626 \pm .15$ & $.400 \pm .32$ & $.943 \pm .05$ & $.324 \pm .35$ \\
BMR \cite{Bahnsen_2014_SIAM} & $.879 \pm .04$ & $.872 \pm .05$ & $.974 \pm .00$ & $.865 \pm .06$ \\
Thresh \cite{Sheng_2006_AAAI} & $.879 \pm .04$ & $.872 \pm .05$ & $.974 \pm .00$ & $.865 \pm .06$ \\
Our Method & $\textbf{.909

In [6]:
now = datetime.now()
with open(f"Results-Gaussian.txt", "w") as text_file:
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    text_file.write(dt_string)
    text_file.write(f'\n% Gaussian - {len_required}')
    text_file.write('\\begin{tabular}{@{}lrrrr@{}}'+latex_str[22:])