In [1]:
%load_ext autoreload
%autoreload 2
from datetime import datetime
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

import deltas
from deltas.pipeline import data, classifier, evaluation
from deltas.model import downsample
# np.random.seed(10)


In [2]:
# Dataset names selectable by index; the chosen name is passed to
# data.get_real_dataset below. Index 1 is intentionally absent from this table.
datasets = {0: 'Breast Cancer', 2: 'Iris', 3: 'Wine', 4: 'Pima Indian Diabetes',
            5: 'Sonar Rocks vs Mines', 6: 'Banknote Authentication',
            7: 'Abalone Gender', 8: 'Ionosphere', 9: 'Wheat Seeds',
            10: 'Credit Scoring 1', 11: 'Credit Scoring 2',
            12: 'Direct Marketing', 13: 'Habermans breast cancer',
            14: 'Wisconsin Breast Cancer', 15: 'Hepatitis',
            16: 'Heart Disease'}

dataset = datasets[4]  # change the index to select the dataset to use
model = 'SVM-rbf'  # alternatives used in this notebook: 'MLP', 'Linear'

dfs = []           # one score DataFrame per successful run
len_required = 10  # number of successful runs to collect

# Try up to len_required*10 - 1 different seeds and stop as soon as enough
# runs have produced a fitted model; seeds whose fit fails are skipped.
for seed in tqdm(range(1, len_required*10)):
    data_clf = data.get_real_dataset(dataset, _print=False, seed=seed, scale=True)

    classifiers_dict = classifier.get_classifier(
        data_clf=data_clf,
        model=model,
        _plot=False,
        _print=False)
    data_clf['clf'] = classifiers_dict['Baseline']
    X = data_clf['data']['X']
    y = data_clf['data']['y']
    clf = data_clf['clf']
    deltas_model = downsample.downsample_deltas(clf).fit(X, y,
                                                        _print=False,
                                                        _plot=False,
                                                        method='supports-prop-update_mean',
                                                        max_trials=10000,
                                                        parallel=True)

    # Only runs where the downsampling model reports a successful fit are
    # evaluated and kept.
    if deltas_model.is_fit:
        classifiers_dict['Our Method'] = deltas_model
        scores_df = evaluation.eval_test(classifiers_dict,
                                         data_clf['data_test'], _print=False, _plot=False)
        dfs.append(scores_df)
    if len(dfs) == len_required:
        break

# Later cells label the results with len_required runs, so make a shortfall
# visible instead of silently reporting statistics over fewer runs.
if len(dfs) < len_required:
    warnings.warn(
        f'Only {len(dfs)} of the {len_required} required runs produced a fitted '
        f'model for {dataset!r}; downstream statistics use fewer runs than reported.')

  0%|          | 0/99 [00:00<?, ?it/s]

 34%|███▍      | 34/99 [02:22<04:31,  4.18s/it]


In [3]:
# Number of runs whose downsampling model fit successfully and whose scores
# were therefore collected in dfs.
print(len(dfs))

10


In [4]:
def _latex_num(value, decimals):
    """Format ``value`` with a fixed number of decimals for the LaTeX table.

    The value is rounded (not truncated) and a leading zero is dropped so that
    0.5409 renders as ``.541``; values outside (-1, 1) keep their integer part.
    """
    text = f'{value:.{decimals}f}'
    if text.startswith('0.'):
        return text[1:]
    if text.startswith('-0.'):
        return '-' + text[2:]
    return text


# Stack the per-run score tables; the index holds the method name and each
# column is a metric, so grouping on the index aggregates over runs.
df = pd.concat(dfs, axis=0)
per_method = df.groupby(level=0, sort=False)  # sort=False keeps first-seen method order
mean_df = per_method.mean().rename_axis(index=None, columns=None)
std_df = per_method.std().rename_axis(index=None, columns=None)
print(f'% {dataset} - {len_required}')

mean_decimals = 3  # decimals shown for the mean
std_decimals = 2   # decimals shown for the standard deviation

# Display names (with citations) used in the paper table. Raw strings keep the
# backslash of \cite literal without relying on an invalid escape sequence.
method_map = {
    'Baseline': 'Baseline',
    'SMOTE': r'SMOTE \cite{Chawla_2002_JAIR}',
    'Balanced Weights': 'BW',
    'BMR': r'BMR \cite{Bahnsen_2014_SIAM}',
    'Threshold': r'Thresh \cite{Sheng_2006_AAAI}',
    'Our Method': 'Our Method',
}

# Build the table column by column: "Methods" first, then one column per
# metric holding "$mean \pm std$", with the best mean of each metric in bold.
table = {'Methods': [method_map.get(name, name) for name in mean_df.index]}
for metric in mean_df.columns:
    metric_means = mean_df[metric].to_list()
    metric_stds = std_df[metric].to_list()
    best_row = int(np.argmax(metric_means))
    cells = []
    for row, (mu, sigma) in enumerate(zip(metric_means, metric_stds)):
        mu_str = _latex_num(mu, mean_decimals)
        if row == best_row:
            mu_str = f"\\textbf{{{mu_str}}}"
        cells.append(f'${mu_str} \\pm {_latex_num(sigma, std_decimals)}$')
    table[metric] = cells

latex_df = pd.DataFrame(table)
# The cells already contain LaTeX markup, so it must not be escaped.
latex_str = latex_df.to_latex(index=False, escape=False)

print(mean_df)

% Pima Indian Diabetes - 10
                  Accuracy    G-Mean        F1
Baseline          0.540974  0.181501  0.137286
SMOTE             0.595132  0.495203  0.396270
Balanced Weights  0.585598  0.476282  0.373921
BMR               0.661460  0.628755  0.575362
Threshold         0.661460  0.628755  0.575362
Our Method        0.673022  0.651837  0.611909


In [5]:
# Write the LaTeX results table, preceded by '%' comment lines describing the
# run, to <parent of cwd>/notebooks-ECAI/results/Results-1-<dataset>.txt.
now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
results_dir = os.path.join(os.path.dirname(
    os.path.abspath('')), 'notebooks-ECAI', 'results')
os.makedirs(results_dir, exist_ok=True)  # do not fail on a fresh checkout
save_file = os.path.join(results_dir, f'Results-1-{dataset}.txt')

# Class counts of the train/test split. NOTE(review): data_clf is whatever the
# last iteration of the experiment loop left behind, so these counts describe
# that final split only - confirm this is what should be reported.
train = np.unique(data_clf['data']['y'], return_counts=True)[1]
test = np.unique(data_clf['data_test']['y'], return_counts=True)[1]

# Replace the \begin{tabular}{...} line produced by pandas with a booktabs
# style header: one left-aligned method column plus one centred column per metric.
tabular_header = '\\begin{tabular}{@{}l' + 'c'*len(mean_df.columns) + '@{}}'
_, _, table_body = latex_str.partition('\n')  # everything after the first line

with open(save_file, "w") as text_file:
    text_file.write(f'% {dt_string}\n')
    text_file.write(f'% {dataset} - {len_required} runs {model} model\n')
    text_file.write(f'% train:{train}, test:{test}\n')
    text_file.write(tabular_header + '\n' + table_body)