In [10]:
%load_ext autoreload
%autoreload 2
from datetime import datetime
import os

import deltas
from deltas.pipeline import data, classifier, evaluation
from deltas.model import downsample
from tqdm import tqdm

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
# np.random.seed(10)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:

# --- Experiment configuration -------------------------------------------
costs = (1, 1)  # change for (1, 10) to increase results
dataset = 'MIMIC-III-mortality'
# Alternative model names tried previously: 'SVM-rbf', 'MLP-deep'
model = 'MIMIC'

len_required = 10                 # number of successful runs to collect
max_attempts = len_required*10    # upper bound on seeds tried before giving up
use_grid_search = False           # True: pick the downsample method via GridSearchCV

# --- Repeated runs ------------------------------------------------------
# Each attempt loads the dataset with a different seed, trains the baseline
# classifiers, fits the deltas model on the training split and, if that fit
# succeeded, evaluates every classifier on the test split. One score table
# per successful attempt is collected in `dfs`.
dfs = []
for i in tqdm(range(max_attempts)):
    data_clf = data.get_real_dataset(dataset, _print=False, seed=i, scale=True)
    classifiers_dict = classifier.get_classifier(
        data_clf=data_clf,
        model=model,
        _plot=False,
        _print=False)
    data_clf['clf'] = classifiers_dict['Baseline']
    X = data_clf['data']['X']
    y = data_clf['data']['y']
    clf = data_clf['clf']

    if use_grid_search:
        # Cross-validated choice between the two candidate methods.
        param_grid = {
            'method': ['supports-prop-update_mean',
                       'supports-prop-update_mean-margin_only']}
        grid_original = GridSearchCV(
            downsample.downsample_deltas(), param_grid, refit=True)
        grid_original.fit(X, y,
                          clf=clf,
                          _print=False,
                          _plot=False,
                          max_trials=10000,
                          parallel=True)
        deltas_model = grid_original.best_estimator_
        print(f'Best params: {grid_original.best_params_}')
    else:
        deltas_model = downsample.downsample_deltas(clf).fit(
            X, y,
            alpha=1,
            _print=False,
            _plot=False,
            max_trials=10000,
            parallel=True)

    # Only runs whose deltas model reports a successful fit are scored;
    # failed attempts are skipped and the next seed is tried.
    if deltas_model.is_fit:
        classifiers_dict['Our Method'] = deltas_model
        scores_df = evaluation.eval_test(classifiers_dict,
                                         data_clf['data_test'],
                                         _print=False, _plot=False)
        dfs.append(scores_df)
    print(len(dfs))
    if len(dfs) == len_required:
        break

# Make a shortfall visible: downstream cells aggregate whatever is in `dfs`.
if len(dfs) < len_required:
    print(f'Warning: only {len(dfs)} of {len_required} required runs were '
          f'fit successfully after {max_attempts} attempts')

  1%|          | 1/100 [37:29<61:51:14, 2249.24s/it]

1


  2%|▏         | 2/100 [1:14:45<61:01:10, 2241.54s/it]

2


  3%|▎         | 3/100 [1:52:13<60:28:44, 2244.58s/it]

3


In [None]:
# Number of score tables collected so far (one is appended to `dfs` for each
# run whose deltas model reported a successful fit).
print(len(dfs))

In [None]:
def _format_score(value, decimals):
    """Format `value` with `decimals` decimal places for the LaTeX table.

    The leading zero is dropped for magnitudes below one ('0.823' -> '.823').
    Values that do not start with '0.' (e.g. 1.0, negatives, nan) are left
    intact instead of being cut by position.
    """
    text = f'{value:.{decimals}f}'
    return text[1:] if text.startswith('0.') else text


# Stack the per-run score tables; rows are indexed by method name, so each
# method appears once per successful run.
all_scores = pd.concat(dfs, axis=0)

# Mean / std per method and metric. groupby also works when only one run is
# available (std is then NaN); sort=False keeps the order of first appearance
# and rename_axis(None) keeps the printed frame free of an index title.
per_method = all_scores.groupby(level=0, sort=False)
mean_df = per_method.mean().rename_axis(None)
std_df = per_method.std().rename_axis(None)
# Report the number of runs that were actually aggregated.
print(f'% {dataset} - {len(dfs)}')

metrics = mean_df.columns.to_list()
methods = mean_df.index.to_list()
sf = 5                   # width of the '.ddd' mean string used in the table
mean_decimals = sf - 2   # -> 3 decimals for the mean
std_decimals = sf - 3    # -> 2 decimals for the std

# Display names (with citations) used in the paper table. Raw strings keep
# the LaTeX backslashes without relying on invalid escape sequences.
method_map = {
    'Baseline': 'Baseline',
    'SMOTE': r'SMOTE \cite{Chawla_2002_JAIR}',
    'Balanced Weights': 'BW',
    'BMR': r'BMR \cite{Bahnsen_2014_SIAM}',
    'Threshold': r'Thresh \cite{Sheng_2006_AAAI}',
    'Our Method': 'Our Method',
}

# Build the table column by column: 'Methods' first, then one column of
# '$mean \pm std$' cells per metric with the largest mean set in bold.
# Methods without an entry in method_map keep their original name.
table = {'Methods': [method_map.get(name, name) for name in methods]}
for metric in metrics:
    means = mean_df[metric].to_list()
    stds = std_df[metric].to_list()
    best_row = int(np.argmax(means))
    cells = []
    for row, (mu, sigma) in enumerate(zip(means, stds)):
        mu_str = _format_score(mu, mean_decimals)
        if row == best_row:
            mu_str = f"\\textbf{{{mu_str}}}"
        sigma_str = _format_score(sigma, std_decimals)
        cells.append(f'${mu_str} \\pm {sigma_str}$')
    table[metric] = cells

df = pd.DataFrame(table)
latex_str = df.to_latex(index=False)

print(mean_df)

In [None]:
# Write the LaTeX results table, preceded by '%'-commented provenance lines,
# to <parent of cwd>/notebooks-ECAI/results/Results-5-<dataset>.txt.
now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
results_dir = os.path.join(os.path.dirname(
    os.path.abspath('')), 'notebooks-ECAI', 'results')
os.makedirs(results_dir, exist_ok=True)  # a fresh checkout may lack the folder
save_file = os.path.join(results_dir, f'Results-5-{dataset}.txt')

# Compute everything before opening the file: mode "w" truncates any earlier
# results, so a failure here must not leave an empty file behind.
# Class counts are taken from the most recently loaded dataset (`data_clf`).
train = np.unique(data_clf['data']['y'], return_counts=True)[1]
test = np.unique(data_clf['data_test']['y'], return_counts=True)[1]

# Swap pandas' default '\begin{tabular}{...}' first line for the booktabs
# style column spec; splitting on the first newline does not depend on the
# length of the generated column spec.
tabular_header = '\\begin{tabular}{@{}l' + 'c'*len(mean_df.columns) + '@{}}'
_, first_newline, table_body = latex_str.partition('\n')

with open(save_file, "w", encoding='utf-8') as text_file:
    text_file.write(f'% {dt_string}\n')
    # Report the number of runs that were actually aggregated.
    text_file.write(f'% {dataset} - {len(dfs)} runs {model} model\n')
    text_file.write(f'% train:{train}, test:{test}\n')
    text_file.write(tabular_header + first_newline + table_body)