In [2]:
%load_ext autoreload
%autoreload 2
from datetime import datetime
import os

from deltas.pipeline import data, classifier, evaluation
from deltas.model import base, downsample
import deltas.plotting.plots as plots
from tqdm import tqdm

import numpy as np
import pandas as pd


In [3]:

# --- Experiment configuration ------------------------------------------------
N1 = 1000  # forwarded to data.get_data as N1
N2 = 10    # forwarded to data.get_data as N2
m = 1  # class means are placed at (-m, -m) and (m, m)
v = 1  # diagonal entry of both (isotropic) covariance matrices
costs = (1, 1)  # change for (1, 10) to increase results
len_required = 10  # number of successfully fitted runs to collect
max_attempts = len_required * 10  # upper bound on the seeds tried

# Model name handed to classifier.get_classifier. It is constant across runs,
# so it is set once here instead of being reassigned inside the loop.
# Other values tried previously: 'MLP', 'MLP-Gaussian'.
model = 'Linear'

# Gaussian (not always separable)
dfs = []  # one scores DataFrame per successful run
for i in tqdm(range(max_attempts)):
    # The loop index doubles as the seed, so every attempt is reproducible.
    data_clf = data.get_data(
        m1=[-m, -m],
        m2=[m, m],
        cov1=[[v, 0], [0, v]],
        cov2=[[v, 0], [0, v]],
        N1=N1,
        N2=N2,
        scale=False,
        test_nums=[1000, 1000],
        seed=i
    )

    classifiers_dict = classifier.get_classifier(
        data_clf=data_clf,
        model=model,
        _plot=False)
    data_clf['clf'] = classifiers_dict['Baseline']
    X = data_clf['data']['X']
    y = data_clf['data']['y']
    clf = data_clf['clf']

    # Alternatives explored earlier:
    #   downsample.downsample_deltas(clf).fit(X, y, _print=True, _plot=True, max_trials=10000)
    #   base.base_deltas(clf).fit(X, y, grid_search=True, _print=True, _plot=True)
    deltas_model = downsample.downsample_deltas(clf).fit(
        X, y,
        alpha=1,
        _print=False,
        _plot=False,
        method='supports-prop-update_mean',
        max_trials=10000,
        parallel=True)

    if deltas_model.is_fit:
        # Only runs where the deltas model was fit contribute to the results.
        classifiers_dict['Our Method'] = deltas_model
        scores_df = evaluation.eval_test(classifiers_dict,
                                         data_clf['data_test'], _print=False, _plot=False)
        dfs.append(scores_df)
    else:
        print('not fit deltas')
    if len(dfs) == len_required:
        break

if len(dfs) < len_required:
    # Make an incomplete experiment visible instead of silently averaging fewer runs.
    print(f'only {len(dfs)} of {len_required} runs were fit after {max_attempts} attempts')

  0%|          | 0/100 [00:00<?, ?it/s]

  9%|▉         | 9/100 [00:54<09:12,  6.07s/it]


In [4]:
# Sanity check: how many per-run score frames were collected by the loop above.
runs_collected = len(dfs)
print(runs_collected)

10


In [5]:
def _format_score(value, decimals):
    """Return ``value`` in fixed-point notation without a leading zero.

    For example ``0.66735`` with ``decimals=3`` becomes ``'.667'``. Values are
    rounded (not truncated), and values outside (-1, 1) keep their integer part.
    """
    text = f'{value:.{decimals}f}'
    if text.startswith('0.'):
        return text[1:]
    if text.startswith('-0.'):
        return '-' + text[2:]
    return text


# Stack the per-run score frames; the index holds the method name, so each
# method appears once per run.
scores_all = pd.concat(dfs, axis=0)

# Per-method mean and sample standard deviation across runs. sort=False keeps
# the methods in first-appearance order; a method with a single row yields a
# NaN standard deviation instead of raising.
per_method = scores_all.groupby(level=0, sort=False)
mean_df = per_method.mean().rename_axis(None)
std_df = per_method.std().rename_axis(None)

metrics = mean_df.columns.to_list()
methods = mean_df.index.to_list()
sf = 5
mean_decimals = sf - 2  # '.667': three decimals for the mean
std_decimals = sf - 3   # '.01': two decimals for the standard deviation

# Display names for the LaTeX table. Raw strings keep the backslash of \cite
# literal without relying on an invalid escape sequence.
method_map = {
    'Baseline': 'Baseline',
    'SMOTE': r"SMOTE \cite{Chawla_2002_JAIR}",
    'Balanced Weights': 'BW',
    'BMR': r'BMR \cite{Bahnsen_2014_SIAM}',
    'Threshold': r'Thresh \cite{Sheng_2006_AAAI}',
    'Our Method': 'Our Method',
}

# Build the table in a fresh dict. (Reusing the name `m` here would overwrite
# the class-mean offset `m` used by the experiment cell and break its re-run.)
# Methods without a display name fall back to their own name.
table = {'Methods': [method_map.get(name, name) for name in methods]}
for metric in metrics:
    best_row = int(np.argmax(mean_df[metric].to_numpy()))  # highest mean is set in bold
    cells = []
    for row, (mu, sigma) in enumerate(zip(mean_df[metric], std_df[metric])):
        mean_str = _format_score(mu, mean_decimals)
        if row == best_row:
            mean_str = f"\\textbf{{{mean_str}}}"
        std_str = _format_score(sigma, std_decimals)
        cells.append(f'${mean_str} \\pm {std_str}$')
    table[metric] = cells

df = pd.DataFrame(table)
# escape=False keeps the math-mode cells and \cite commands intact on pandas
# versions whose default is to escape LaTeX special characters.
latex_str = df.to_latex(index=False, escape=False)

# print(f'\n% Gaussian - {len_required}')
# print('\\begin{tabular}{@{}lrrrr@{}}'+latex_str[22:])
print(mean_df)

                  Accuracy    G-Mean        F1
Baseline           0.66735  0.577502  0.500215
SMOTE              0.89630  0.892781  0.888119
Balanced Weights   0.90225  0.899872  0.896667
BMR                0.89255  0.887387  0.882261
Threshold          0.89275  0.887642  0.882569
Our Method         0.91125  0.909699  0.914652


In [6]:
# Write the LaTeX results table, preceded by '%' comment lines describing the
# run, to ../notebooks-ECAI/results/ relative to the current working directory.
now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
save_file = os.path.join(os.path.dirname(
    os.path.abspath('')), 'notebooks-ECAI', 'results', 'Results-0-Gaussian.txt')
# Create the results directory if this is the first run on a fresh checkout.
os.makedirs(os.path.dirname(save_file), exist_ok=True)

# Class counts of the train/test split. `data_clf` is whatever the experiment
# loop left behind, i.e. the data of the last attempted run.
train = np.unique(data_clf['data']['y'], return_counts=True)[1]
test = np.unique(data_clf['data_test']['y'], return_counts=True)[1]

# Replace the first line emitted by to_latex (the \begin{tabular}{...} header)
# with a booktabs-style column spec: one left-aligned method column followed by
# one centred column per metric. Cutting at the first newline avoids depending
# on the exact character length of the generated header.
tabular_header = '\\begin{tabular}{@{}l' + 'c'*len(mean_df.columns) + '@{}}'
table_body = latex_str[latex_str.index('\n'):]

# Everything is computed above so that a failure cannot leave a truncated file.
with open(save_file, "w") as text_file:
    text_file.write(f'% {dt_string}\n')
    # Report the number of runs actually collected, which can be lower than
    # len_required if the deltas model failed to fit too often.
    text_file.write(f'% Gaussian - {len(dfs)} runs {model} model\n')
    text_file.write(f'% train:{train}, test:{test}\n')
    text_file.write(tabular_header + table_body)