In [4]:
! pip install tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import copy
import itertools

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [6]:
# Load the per-observation diagnoses and keep only the rows that belong neither
# to the "mfeat-morphological" dataset nor to the "lpf" attack.
d = pd.read_csv("results/attacks_diagnoses.csv").loc[
    lambda frame: (frame["dataset"] != "mfeat-morphological") & (frame["attack"] != "lpf")
]

In [7]:
def q0(x: pd.Series) -> float:
    """Return the 0-quantile (i.e. the minimum) of ``x``.

    Defined as a named function so that ``.agg`` labels the result ``q0``.
    """
    return x.quantile(0)


def q25(x: pd.Series) -> float:
    """Return the first quartile (0.25-quantile) of ``x``.

    Defined as a named function so that ``.agg`` labels the result ``q25``.
    """
    return x.quantile(0.25)


def q50(x: pd.Series) -> float:
    """Return the median (0.5-quantile) of ``x``.

    Defined as a named function so that ``.agg`` labels the result ``q50``.
    """
    return x.quantile(0.5)


def q75(x: pd.Series) -> float:
    """Return the third quartile (0.75-quantile) of ``x``.

    Defined as a named function so that ``.agg`` labels the result ``q75``.
    """
    return x.quantile(0.75)


def q1(x: pd.Series) -> float:
    """Return the 1-quantile (i.e. the maximum) of ``x``.

    Defined as a named function so that ``.agg`` labels the result ``q1``.
    """
    return x.quantile(1)


def minmax(x: pd.Series) -> float:
    """Return the range of ``x``, i.e. its maximum minus its minimum.

    Defined as a named function so that ``.agg`` labels the result ``minmax``.
    """
    return x.max() - x.min()

attrs_con = d

# Columns removed before the remaining attributes are aggregated.
cols_to_drop = [
    "approx",
    "target",
    "pred",
    "error",
    "name",
    "overall_mean_target",
    "scores",
    "mean_target_in_neighborhood",
    "mean_approx_in_neighborhood",
    "neighborhood_size_div_model_avg",
    "neighborhood_size_pct",
    "r_centered_entropy",
    "entropy",
    "logk_r_centered_entropy",
]

# One row per (dataset, model, attack, bacc_test, n_test, n_classes) group;
# every remaining column is summarised by its mean, five quantiles and range.
attrs_cols = attrs_con.drop(columns=cols_to_drop)
attrs_agg = (
    attrs_cols
    .groupby(["dataset", "model", "attack", "bacc_test", "n_test", "n_classes"])
    .agg(["mean", q0, q25, q50, q75, q1, minmax])
)

# Work on a deep copy and flatten the (column, statistic) MultiIndex into
# "<column>_<statistic>" names, then turn the group keys back into columns.
attrs_agg_correct_cols = attrs_agg.copy(deep=True)
attrs_agg_correct_cols.columns = ["_".join(pair) for pair in attrs_agg_correct_cols.columns]
attrs_agg_correct_cols = attrs_agg_correct_cols.reset_index()

In [8]:
# Same aggregation as for the other models, applied to the neural-network
# diagnoses file.
d_nn = pd.read_csv("results/attacks_diagnoses_nn.csv")

attrs_cols = d_nn.drop(columns=cols_to_drop)
attrs_agg = (
    attrs_cols
    .groupby(["dataset", "model", "attack", "bacc_test", "n_test", "n_classes"])
    .agg(["mean", q0, q25, q50, q75, q1, minmax])
)

# Deep copy, flatten the (column, statistic) MultiIndex into
# "<column>_<statistic>" names, and restore the group keys as columns.
attrs_agg_correct_cols_nn = attrs_agg.copy(deep=True)
attrs_agg_correct_cols_nn.columns = ["_".join(pair) for pair in attrs_agg_correct_cols_nn.columns]
attrs_agg_correct_cols_nn = attrs_agg_correct_cols_nn.reset_index()

In [9]:
# Append the neural-network rows to the aggregated table. Both inputs come out
# of reset_index() with their own 0..n-1 index, so ignore_index=True is used to
# give the combined frame unique row labels.
# NOTE: the assigned name is also an input, so re-running this cell on its own
# appends the NN rows a second time; re-run the aggregation cell first.
attrs_agg_correct_cols = pd.concat([attrs_agg_correct_cols, attrs_agg_correct_cols_nn], ignore_index=True)

In [10]:
# Class balance check: distinct attack labels and the number of rows for each.
np.unique(attrs_agg_correct_cols["attack"], return_counts=True)

(array(['bim', 'fgm', 'hsj', 'noise', 'org', 'per', 'pgd', 'zoo'],
       dtype=object),
 array([22, 22, 64, 22, 88, 66, 22, 66]))

In [11]:
# Number of aggregated rows for every (dataset, model) pair.
data_counts = (
    attrs_agg_correct_cols[["dataset", "model", "attack"]]
    .groupby(["dataset", "model"])
    .size()
    .reset_index(name="counts")
)
data_counts

Unnamed: 0,dataset,model,counts
0,Bioresponse,lin,4
1,Bioresponse,nn,5
2,Bioresponse,svm,4
3,Bioresponse,xgb,4
4,churn,lin,4
...,...,...,...
83,wdbc,xgb,4
84,wilt,lin,4
85,wilt,nn,5
86,wilt,svm,4


In [12]:
# Number of aggregated rows for every (dataset, attack) pair.
data_counts_model_type = (
    attrs_agg_correct_cols[["dataset", "model", "attack"]]
    .groupby(["dataset", "attack"])
    .size()
    .reset_index(name="counts")
)
data_counts_model_type

Unnamed: 0,dataset,attack,counts
0,Bioresponse,bim,1
1,Bioresponse,fgm,1
2,Bioresponse,hsj,3
3,Bioresponse,noise,1
4,Bioresponse,org,4
...,...,...,...
171,wilt,noise,1
172,wilt,org,4
173,wilt,per,3
174,wilt,pgd,1


In [13]:
# Fit a random forest on the whole aggregated table (no hold-out): the attack
# name is the target, every column except the three identifiers is a feature.
y_train = attrs_agg_correct_cols["attack"]
x_train = attrs_agg_correct_cols.drop(columns=["dataset", "model", "attack"])

le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)

model = RandomForestClassifier(random_state=123).fit(x_train, y_train_enc)
model

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Feature importances of the model fitted above; rank 1 = most important.
fi_dict = {"var": x_train.columns, "fi": model.feature_importances_}
fi_df = pd.DataFrame(fi_dict).assign(
    fi_rank=lambda table: table["fi"].rank(ascending=False)
)

In [15]:
# Persist the combined aggregated table (row index not written).
attrs_agg_correct_cols.to_csv("results/attr_attacks_type_agr_nn.csv", index=False)

In [16]:
def create_hyperparams_grid(model, param_grid, exp_function, crit="kappa"):
    """Evaluate every combination of a hyperparameter grid and return the best run.

    Each combination is applied to ``model`` with ``set_params`` and evaluated
    by calling ``exp_function(model=model)``. The winner is the combination
    with the highest mean of ``summary[crit]``; on ties the combination
    evaluated first wins.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params(**params)``. It is modified in place and
        ends up configured with the last combination tried, not the best one.
    param_grid : dict
        Maps a parameter name to an iterable of candidate values. Any number
        of parameters is accepted; combinations are visited with the last key
        varying fastest.
    exp_function : callable
        Called as ``exp_function(model=model)``; must return a pair
        ``(summary, fi)`` where ``summary[crit]`` can be averaged by ``np.mean``.
    crit : str, default "kappa"
        Key/column of ``summary`` used as the selection criterion.

    Returns
    -------
    tuple
        The ``(summary, fi)`` pair produced by the best combination.

    Raises
    ------
    ValueError
        If the grid yields no combination (some parameter has no candidates).
    """
    param_names = list(param_grid)

    quality_measure = []
    params_list = []
    results_table = []
    results_fi = []

    for values in itertools.product(*(param_grid[name] for name in param_names)):
        params = dict(zip(param_names, values))

        model.set_params(**params)
        summary, fi = exp_function(model=model)
        results_table.append(summary)
        results_fi.append(fi)
        params_list.append(params)
        # Score on the requested criterion rather than a hard-coded column.
        crit_value = np.mean(summary[crit])
        quality_measure.append(crit_value)
        print(f"{crit} = {crit_value}, params: {params}")

    if not quality_measure:
        raise ValueError("param_grid yields no parameter combination to evaluate")

    best_params_idx = int(np.argmax(quality_measure))
    best_params = params_list[best_params_idx]

    print(f"Params: {best_params} give best {crit} equal {quality_measure[best_params_idx]}")

    return results_table[best_params_idx], results_fi[best_params_idx]

# Leave-one-data-set-out

In [21]:
from sklearn.metrics import confusion_matrix

In [None]:
def leave_dataset_out(model=None):
    """Leave-one-dataset-out evaluation of an attack-type classifier.

    For every distinct value of ``attrs_agg_correct_cols["dataset"]`` the model
    is fitted on the rows of all other datasets and evaluated on the rows of
    the held-out one. The pooled confusion matrix and pooled balanced accuracy
    are printed as a side effect.

    Parameters
    ----------
    model : estimator, optional
        Classifier exposing ``fit``, ``predict`` and ``feature_importances_``.
        It is refitted in place for every fold. When omitted, a fresh
        ``xgb.XGBClassifier(random_state=123)`` is created for this call.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per held-out dataset with columns ``dataset``, ``bacc``, ``kappa``.
    fi_all : pandas.DataFrame
        Feature importances of every fold (columns ``var``, ``fi``,
        ``fi_rank``, ``dataset``).

    Raises
    ------
    ValueError
        If the aggregated table contains no dataset.
    """
    # Build the default per call; a default created in the signature would be
    # one shared instance that every call refits.
    if model is None:
        model = xgb.XGBClassifier(random_state=123)

    datasets = np.unique(attrs_agg_correct_cols["dataset"])
    if len(datasets) == 0:
        raise ValueError("attrs_agg_correct_cols contains no dataset to hold out")

    # Label order for the pooled confusion matrix, taken from the full table so
    # it does not depend on which classes the last training fold contained.
    all_attacks = np.unique(attrs_agg_correct_cols["attack"])
    id_cols = ["dataset", "model", "attack"]

    kappa_list = []
    bacc_list = []
    fi_frames = []
    test_all = []
    preds_all = []

    for dataset in tqdm(datasets):

        train = attrs_agg_correct_cols[attrs_agg_correct_cols["dataset"] != dataset]
        test = attrs_agg_correct_cols[attrs_agg_correct_cols["dataset"] == dataset]

        x_train = train.drop(columns=id_cols)
        y_train = train["attack"]

        x_test = test.drop(columns=id_cols)
        y_test = test["attack"].to_numpy()

        # The encoder only knows the training labels. Scoring is done on the
        # decoded string labels so that an attack seen only in the held-out
        # dataset counts as an error instead of making le.transform raise.
        le = LabelEncoder()
        y_train_enc = le.fit_transform(y_train)

        model.fit(x_train, y_train_enc)

        fi_df = pd.DataFrame({'var': x_train.columns, 'fi': model.feature_importances_})
        fi_df["fi_rank"] = fi_df["fi"].rank(ascending=False)
        fi_df["dataset"] = dataset
        fi_frames.append(fi_df)

        preds = le.inverse_transform(model.predict(x_test))
        test_all.append(y_test)
        preds_all.append(preds)
        bacc_list.append(balanced_accuracy_score(y_test, preds))
        kappa_list.append(cohen_kappa_score(y_test, preds))

    test_preds = pd.DataFrame({'true': np.concatenate(test_all), 'pred': np.concatenate(preds_all)})
    print(confusion_matrix(test_preds['true'], test_preds['pred'], labels=all_attacks))
    print(balanced_accuracy_score(test_preds['true'], test_preds['pred']))

    results_df = pd.DataFrame({'dataset': datasets, 'bacc': bacc_list, 'kappa': kappa_list})
    # Concatenate once instead of growing a frame inside the loop.
    fi_all = pd.concat(fi_frames)
    return results_df, fi_all

## XGBoost

### Default parameters

In [27]:
# Leave-one-dataset-out evaluation with the function's default XGBoost model;
# the per-dataset scores are displayed as the cell output.
one_dataset_out_xgb, one_dataset_out_fi_xgb = leave_dataset_out()
one_dataset_out_xgb

100%|██████████| 22/22 [00:11<00:00,  1.97it/s]

[[10  3  0  0  0  0  8  1]
 [ 0 15  0  4  0  0  3  0]
 [ 0  0 47  1  0  7  0  9]
 [ 1  1  0 17  2  1  0  0]
 [ 0  0  2  0 83  3  0  0]
 [ 0  0  7  1  4 49  0  5]
 [ 3  3  0  0  0  0 16  0]
 [ 0  0  6  0  4  8  0 48]]





Unnamed: 0,dataset,bacc,kappa
0,Bioresponse,0.833333,0.857143
1,churn,0.708333,0.789256
2,cmc,0.875,0.929752
3,cnae-9,0.708333,0.647303
4,dna,0.5,0.576763
5,har,0.75,0.576763
6,madelon,0.5,0.342466
7,mfeat-factors,0.875,0.929752
8,mfeat-fourier,0.791667,0.789256
9,mfeat-karhunen,1.0,1.0


### Hyperparameters tuning

In [15]:
# Grid search over three XGBoost hyperparameters, scored by leave-one-dataset-out.
# NOTE: the results are stored under the same names as the default-parameter
# run above, so that run's tables are overwritten here.
model=xgb.XGBClassifier(random_state=123)

param_grid = {
    'max_depth': [6, 9, 12],
    'learning_rate': [0.1, 0.3, 0.5],
    'n_estimators': [100, 200, 500]
}

one_dataset_out_xgb, one_dataset_out_fi_xgb = create_hyperparams_grid(model=model, param_grid=param_grid, exp_function=leave_dataset_out, crit="kappa")
one_dataset_out_xgb

100%|██████████| 22/22 [00:46<00:00,  2.13s/it]


kappa = 0.7193746614812835, params: {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 100}


100%|██████████| 22/22 [01:09<00:00,  3.16s/it]


kappa = 0.7161335577553092, params: {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 200}


100%|██████████| 22/22 [01:49<00:00,  5.00s/it]


kappa = 0.7226527788922988, params: {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 500}


100%|██████████| 22/22 [00:28<00:00,  1.31s/it]


kappa = 0.7194205405218793, params: {'max_depth': 6, 'learning_rate': 0.3, 'n_estimators': 100}


100%|██████████| 22/22 [00:42<00:00,  1.94s/it]


kappa = 0.7195350833238546, params: {'max_depth': 6, 'learning_rate': 0.3, 'n_estimators': 200}


100%|██████████| 22/22 [01:21<00:00,  3.69s/it]


kappa = 0.7201531321148997, params: {'max_depth': 6, 'learning_rate': 0.3, 'n_estimators': 500}


100%|██████████| 22/22 [00:25<00:00,  1.16s/it]


kappa = 0.7035192646644147, params: {'max_depth': 6, 'learning_rate': 0.5, 'n_estimators': 100}


100%|██████████| 22/22 [00:39<00:00,  1.78s/it]


kappa = 0.6945774315291675, params: {'max_depth': 6, 'learning_rate': 0.5, 'n_estimators': 200}


100%|██████████| 22/22 [01:22<00:00,  3.74s/it]


kappa = 0.70122661976182, params: {'max_depth': 6, 'learning_rate': 0.5, 'n_estimators': 500}


100%|██████████| 22/22 [00:53<00:00,  2.43s/it]


kappa = 0.7164377390812426, params: {'max_depth': 9, 'learning_rate': 0.1, 'n_estimators': 100}


100%|██████████| 22/22 [01:13<00:00,  3.35s/it]


kappa = 0.7199443604476735, params: {'max_depth': 9, 'learning_rate': 0.1, 'n_estimators': 200}


100%|██████████| 22/22 [02:03<00:00,  5.60s/it]


kappa = 0.722833514240635, params: {'max_depth': 9, 'learning_rate': 0.1, 'n_estimators': 500}


100%|██████████| 22/22 [00:31<00:00,  1.43s/it]


kappa = 0.716631848925691, params: {'max_depth': 9, 'learning_rate': 0.3, 'n_estimators': 100}


100%|██████████| 22/22 [00:46<00:00,  2.11s/it]


kappa = 0.7166056760740203, params: {'max_depth': 9, 'learning_rate': 0.3, 'n_estimators': 200}


100%|██████████| 22/22 [01:22<00:00,  3.76s/it]


kappa = 0.7016378240996715, params: {'max_depth': 9, 'learning_rate': 0.3, 'n_estimators': 500}


100%|██████████| 22/22 [00:25<00:00,  1.16s/it]


kappa = 0.6949172802485738, params: {'max_depth': 9, 'learning_rate': 0.5, 'n_estimators': 100}


100%|██████████| 22/22 [00:38<00:00,  1.76s/it]


kappa = 0.7071990908615706, params: {'max_depth': 9, 'learning_rate': 0.5, 'n_estimators': 200}


100%|██████████| 22/22 [01:12<00:00,  3.32s/it]


kappa = 0.7171343872022177, params: {'max_depth': 9, 'learning_rate': 0.5, 'n_estimators': 500}


100%|██████████| 22/22 [00:49<00:00,  2.27s/it]


kappa = 0.7264255984723959, params: {'max_depth': 12, 'learning_rate': 0.1, 'n_estimators': 100}


100%|██████████| 22/22 [01:14<00:00,  3.39s/it]


kappa = 0.7165611756693075, params: {'max_depth': 12, 'learning_rate': 0.1, 'n_estimators': 200}


100%|██████████| 22/22 [02:06<00:00,  5.75s/it]


kappa = 0.7165792864671788, params: {'max_depth': 12, 'learning_rate': 0.1, 'n_estimators': 500}


100%|██████████| 22/22 [00:31<00:00,  1.45s/it]


kappa = 0.7165924267466184, params: {'max_depth': 12, 'learning_rate': 0.3, 'n_estimators': 100}


100%|██████████| 22/22 [00:47<00:00,  2.16s/it]


kappa = 0.7167740369484411, params: {'max_depth': 12, 'learning_rate': 0.3, 'n_estimators': 200}


100%|██████████| 22/22 [01:25<00:00,  3.88s/it]


kappa = 0.7019663022444587, params: {'max_depth': 12, 'learning_rate': 0.3, 'n_estimators': 500}


100%|██████████| 22/22 [00:29<00:00,  1.33s/it]


kappa = 0.707335192160227, params: {'max_depth': 12, 'learning_rate': 0.5, 'n_estimators': 100}


100%|██████████| 22/22 [00:40<00:00,  1.84s/it]


kappa = 0.7169065778068658, params: {'max_depth': 12, 'learning_rate': 0.5, 'n_estimators': 200}


100%|██████████| 22/22 [01:20<00:00,  3.67s/it]

kappa = 0.7237723842660898, params: {'max_depth': 12, 'learning_rate': 0.5, 'n_estimators': 500}
Params: {'max_depth': 12, 'learning_rate': 0.1, 'n_estimators': 100} give best kappa equal 0.7264255984723959





Unnamed: 0,dataset,bacc,kappa
0,Bioresponse,0.833333,0.857143
1,churn,0.708333,0.789256
2,cmc,1.0,1.0
3,cnae-9,0.75,0.717842
4,dna,0.541667,0.650206
5,har,0.791667,0.647303
6,madelon,0.5,0.345455
7,mfeat-factors,0.875,0.929752
8,mfeat-fourier,0.791667,0.789256
9,mfeat-karhunen,1.0,1.0


In [16]:
# Mean and standard deviation of both metrics across held-out datasets (XGBoost).
one_dataset_out_xgb[["bacc", "kappa"]].agg(['mean', 'std'])

Unnamed: 0,bacc,kappa
mean,0.723958,0.726426
std,0.203509,0.204257


In [17]:
# Average importance rank of every feature over the leave-one-dataset-out
# folds (XGBoost); the 20 features with the smallest mean rank are shown.
fi_agg = one_dataset_out_fi_xgb[["var", "fi_rank"]].groupby(["var"]).agg(["mean"])
fi_agg.columns = ["_".join(name_parts) for name_parts in fi_agg.columns.to_flat_index()]
fi_agg.nsmallest(n=20, columns=["fi_rank_mean"])

Unnamed: 0_level_0,fi_rank_mean
var,Unnamed: 1_level_1
bacc_test,1.045455
uncertainty_q25,3.454545
uncertainty_mean,3.681818
uncertainty_q50,5.590909
target_targets_consistency_in_neighborhood_q75,6.272727
uncertainty_q0,7.272727
target_approx_consistency_in_neighborhood_q25,8.681818
uncertainty_q75,8.727273
uncertainty_minmax,8.772727
uncertainty_q1,12.227273


## Random forest

### Default parameters

In [18]:
# Leave-one-dataset-out evaluation with a default random forest; the
# per-dataset scores are displayed as the cell output.
one_dataset_out_rf, one_dataset_out_fi_rf = leave_dataset_out(model=RandomForestClassifier(random_state=123))
one_dataset_out_rf

100%|██████████| 22/22 [00:08<00:00,  2.62it/s]


Unnamed: 0,dataset,bacc,kappa
0,Bioresponse,0.541667,0.5
1,churn,0.708333,0.789256
2,cmc,0.875,0.929752
3,cnae-9,0.75,0.720165
4,dna,0.625,0.647303
5,har,0.625,0.504167
6,madelon,0.5,0.231441
7,mfeat-factors,0.875,0.929752
8,mfeat-fourier,0.75,0.859504
9,mfeat-karhunen,1.0,1.0


### Hyperparameters tuning

In [19]:
# Grid search over three random-forest hyperparameters, scored by
# leave-one-dataset-out.
# NOTE: the results are stored under the same names as the default-parameter
# run above, so that run's tables are overwritten here.
model=RandomForestClassifier(random_state=123)

param_grid = {
    'max_depth': [50, 80, 110],
    'min_samples_split': [2, 5, 8],
    'n_estimators': [100, 200, 500]
}

one_dataset_out_rf, one_dataset_out_fi_rf = create_hyperparams_grid(model=model, param_grid=param_grid, exp_function=leave_dataset_out, crit="kappa")

100%|██████████| 22/22 [00:06<00:00,  3.56it/s]


kappa = 0.6728284307163721, params: {'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 100}


100%|██████████| 22/22 [00:12<00:00,  1.82it/s]


kappa = 0.6764363508772088, params: {'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 200}


100%|██████████| 22/22 [00:29<00:00,  1.34s/it]


kappa = 0.6794966734511179, params: {'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 500}


100%|██████████| 22/22 [00:06<00:00,  3.52it/s]


kappa = 0.6667110785535114, params: {'max_depth': 50, 'min_samples_split': 5, 'n_estimators': 100}


100%|██████████| 22/22 [00:11<00:00,  1.92it/s]


kappa = 0.705193072684382, params: {'max_depth': 50, 'min_samples_split': 5, 'n_estimators': 200}


100%|██████████| 22/22 [00:28<00:00,  1.28s/it]


kappa = 0.6864914105327841, params: {'max_depth': 50, 'min_samples_split': 5, 'n_estimators': 500}


100%|██████████| 22/22 [00:07<00:00,  3.08it/s]


kappa = 0.6763163521189756, params: {'max_depth': 50, 'min_samples_split': 8, 'n_estimators': 100}


100%|██████████| 22/22 [00:12<00:00,  1.75it/s]


kappa = 0.6797261511582688, params: {'max_depth': 50, 'min_samples_split': 8, 'n_estimators': 200}


100%|██████████| 22/22 [00:31<00:00,  1.43s/it]


kappa = 0.6799531690147439, params: {'max_depth': 50, 'min_samples_split': 8, 'n_estimators': 500}


100%|██████████| 22/22 [00:06<00:00,  3.48it/s]


kappa = 0.6728284307163721, params: {'max_depth': 80, 'min_samples_split': 2, 'n_estimators': 100}


100%|██████████| 22/22 [00:12<00:00,  1.74it/s]


kappa = 0.6764363508772088, params: {'max_depth': 80, 'min_samples_split': 2, 'n_estimators': 200}


100%|██████████| 22/22 [00:32<00:00,  1.47s/it]


kappa = 0.6794966734511179, params: {'max_depth': 80, 'min_samples_split': 2, 'n_estimators': 500}


100%|██████████| 22/22 [00:06<00:00,  3.56it/s]


kappa = 0.6667110785535114, params: {'max_depth': 80, 'min_samples_split': 5, 'n_estimators': 100}


100%|██████████| 22/22 [00:11<00:00,  1.84it/s]


kappa = 0.705193072684382, params: {'max_depth': 80, 'min_samples_split': 5, 'n_estimators': 200}


100%|██████████| 22/22 [00:29<00:00,  1.34s/it]


kappa = 0.6864914105327841, params: {'max_depth': 80, 'min_samples_split': 5, 'n_estimators': 500}


100%|██████████| 22/22 [00:05<00:00,  3.85it/s]


kappa = 0.6763163521189756, params: {'max_depth': 80, 'min_samples_split': 8, 'n_estimators': 100}


100%|██████████| 22/22 [00:11<00:00,  1.94it/s]


kappa = 0.6797261511582688, params: {'max_depth': 80, 'min_samples_split': 8, 'n_estimators': 200}


100%|██████████| 22/22 [00:27<00:00,  1.25s/it]


kappa = 0.6799531690147439, params: {'max_depth': 80, 'min_samples_split': 8, 'n_estimators': 500}


100%|██████████| 22/22 [00:05<00:00,  3.72it/s]


kappa = 0.6728284307163721, params: {'max_depth': 110, 'min_samples_split': 2, 'n_estimators': 100}


100%|██████████| 22/22 [00:11<00:00,  1.88it/s]


kappa = 0.6764363508772088, params: {'max_depth': 110, 'min_samples_split': 2, 'n_estimators': 200}


100%|██████████| 22/22 [00:29<00:00,  1.35s/it]


kappa = 0.6794966734511179, params: {'max_depth': 110, 'min_samples_split': 2, 'n_estimators': 500}


100%|██████████| 22/22 [00:05<00:00,  3.77it/s]


kappa = 0.6667110785535114, params: {'max_depth': 110, 'min_samples_split': 5, 'n_estimators': 100}


100%|██████████| 22/22 [00:11<00:00,  1.88it/s]


kappa = 0.705193072684382, params: {'max_depth': 110, 'min_samples_split': 5, 'n_estimators': 200}


100%|██████████| 22/22 [00:29<00:00,  1.35s/it]


kappa = 0.6864914105327841, params: {'max_depth': 110, 'min_samples_split': 5, 'n_estimators': 500}


100%|██████████| 22/22 [00:06<00:00,  3.65it/s]


kappa = 0.6763163521189756, params: {'max_depth': 110, 'min_samples_split': 8, 'n_estimators': 100}


100%|██████████| 22/22 [00:11<00:00,  1.92it/s]


kappa = 0.6797261511582688, params: {'max_depth': 110, 'min_samples_split': 8, 'n_estimators': 200}


100%|██████████| 22/22 [00:28<00:00,  1.29s/it]

kappa = 0.6799531690147439, params: {'max_depth': 110, 'min_samples_split': 8, 'n_estimators': 500}
Params: {'max_depth': 50, 'min_samples_split': 5, 'n_estimators': 200} give best kappa equal 0.705193072684382





In [20]:
# Mean and standard deviation of both metrics across held-out datasets (random forest).
one_dataset_out_rf[["bacc", "kappa"]].agg(['mean', 'std'])

Unnamed: 0,bacc,kappa
mean,0.727746,0.705193
std,0.184059,0.22602


In [21]:
# Average importance rank of every feature over the leave-one-dataset-out
# folds (random forest); the 20 features with the smallest mean rank are shown.
fi_agg = one_dataset_out_fi_rf[["var", "fi_rank"]].groupby(["var"]).agg(["mean"])
fi_agg.columns = ["_".join(name_parts) for name_parts in fi_agg.columns.to_flat_index()]
fi_agg.nsmallest(n=20, columns=["fi_rank_mean"])

Unnamed: 0_level_0,fi_rank_mean
var,Unnamed: 1_level_1
bacc_test,1.0
uncertainty_mean,2.0
uncertainty_q25,4.136364
uncertainty_q75,4.636364
uncertainty_minmax,5.318182
uncertainty_q50,5.454545
uncertainty_q1,6.863636
target_approx_consistency_in_neighborhood_mean,7.5
target_approx_consistency_in_neighborhood_q25,8.909091
target_approx_consistency_in_neighborhood_q50,9.272727


# Leave-one-model-out

In [38]:
def leave_model_out(model=None):
    """Leave-one-model-out evaluation of an attack-type classifier.

    For every distinct value of ``attrs_agg_correct_cols["model"]`` the
    classifier is fitted on the rows of all other models and evaluated on the
    rows of the held-out one.

    Parameters
    ----------
    model : estimator, optional
        Classifier exposing ``fit``, ``predict`` and ``feature_importances_``.
        It is refitted in place for every fold. When omitted, a fresh
        ``xgb.XGBClassifier(random_state=123)`` is created for this call.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per held-out model with columns ``model``, ``bacc``, ``kappa``.
    fi_all : pandas.DataFrame
        Feature importances of every fold (columns ``var``, ``fi``,
        ``fi_rank``, ``model``); empty when there is no model to hold out.
    """
    # Build the default per call; a default created in the signature would be
    # one shared instance that every call refits.
    if model is None:
        model = xgb.XGBClassifier(random_state=123)

    models = np.unique(attrs_agg_correct_cols["model"])
    id_cols = ["dataset", "model", "attack"]

    kappa_list = []
    bacc_list = []
    fi_frames = []

    for selected_model in models:

        train = attrs_agg_correct_cols[attrs_agg_correct_cols["model"] != selected_model]
        test = attrs_agg_correct_cols[attrs_agg_correct_cols["model"] == selected_model]

        x_train = train.drop(columns=id_cols)
        y_train = train["attack"]

        x_test = test.drop(columns=id_cols)
        y_test = test["attack"]

        # The encoder is fitted on the training labels only. The held-out model
        # may carry attack labels the training rows lack, so predictions are
        # decoded and scored against the original string labels instead of
        # encoding y_test.
        le = LabelEncoder()
        y_train_enc = le.fit_transform(y_train)

        model.fit(x_train, y_train_enc)

        fi_df = pd.DataFrame({'var': x_train.columns, 'fi': model.feature_importances_})
        fi_df["fi_rank"] = fi_df["fi"].rank(ascending=False)
        fi_df["model"] = selected_model
        fi_frames.append(fi_df)

        preds = le.inverse_transform(model.predict(x_test))
        bacc_list.append(balanced_accuracy_score(y_test, preds))
        kappa_list.append(cohen_kappa_score(y_test, preds))

    results_df = pd.DataFrame({'model': models, 'bacc': bacc_list, 'kappa': kappa_list})
    # Concatenate once instead of growing a frame inside the loop.
    fi_all = pd.concat(fi_frames) if fi_frames else pd.DataFrame()
    return results_df, fi_all

## XGBoost

In [39]:
# Leave-one-model-out evaluation with the function's default XGBoost model.
one_model_out_xgb, one_model_out_fi_xgb = leave_model_out()
one_model_out_xgb



Unnamed: 0,model,bacc,kappa
0,lin,0.840909,0.787879
1,nn,0.181818,0.14611
2,svm,0.727273,0.636364
3,xgb,0.561364,0.425839


In [40]:
# Mean and standard deviation of both metrics across held-out models (XGBoost).
one_model_out_xgb[["bacc", "kappa"]].agg(['mean', 'std'])

Unnamed: 0,bacc,kappa
mean,0.577841,0.499048
std,0.287889,0.278211


## Random forest

In [41]:
# Leave-one-model-out evaluation with a default random forest.
one_model_out_rf, one_model_out_fi_rf = leave_model_out(model=RandomForestClassifier(random_state=123))
one_model_out_rf



Unnamed: 0,model,bacc,kappa
0,lin,0.818182,0.757576
1,nn,0.2,0.166667
2,svm,0.806818,0.744361
3,xgb,0.55,0.412019


In [42]:
# Mean and standard deviation of both metrics across held-out models (random forest).
one_model_out_rf[["bacc", "kappa"]].agg(['mean', 'std'])

Unnamed: 0,bacc,kappa
mean,0.59375,0.520156
std,0.290242,0.284771


# 10-fold cross validation

In [43]:
def cross_validation(model=None, n_splits=10, shuffle=False, random_state=None):
    """Stratified k-fold cross-validation of an attack-type classifier.

    Uses every row of ``attrs_agg_correct_cols``; the attack name is the
    target and all columns except ``dataset``, ``model`` and ``attack`` are
    features.

    Parameters
    ----------
    model : estimator, optional
        Classifier exposing ``fit``, ``predict`` and ``feature_importances_``.
        It is refitted in place for every fold. When omitted, a fresh
        ``xgb.XGBClassifier(random_state=123)`` is created for this call.
    n_splits : int, default 10
        Number of folds.
    shuffle : bool, default False
        Whether rows are shuffled within each class before splitting. With the
        default the folds follow the row order of the table.
    random_state : int, optional
        Seed for the shuffling; only used when ``shuffle`` is true.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per fold with columns ``cv``, ``bacc``, ``kappa``.
    fi_all : pandas.DataFrame
        Feature importances of every fold (columns ``var``, ``fi``,
        ``fi_rank``, ``cv``).
    """
    # Build the default per call; a default created in the signature would be
    # one shared instance that every call refits.
    if model is None:
        model = xgb.XGBClassifier(random_state=123)

    kappa_list = []
    bacc_list = []
    iter_cv = []
    fi_frames = []

    le = LabelEncoder()
    x = attrs_agg_correct_cols.drop(columns=["dataset", "model", "attack"])
    y = attrs_agg_correct_cols["attack"]
    y_enc = le.fit_transform(y)

    # StratifiedKFold rejects a random_state when shuffle is off, so the seed
    # is only forwarded together with shuffle=True.
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    for cv, (train, test) in enumerate(skf.split(x, y_enc)):

        # The splitter yields positional indices, hence iloc.
        x_train = x.iloc[train]
        y_train = y_enc[train]

        x_test = x.iloc[test]
        y_test = y_enc[test]

        model.fit(x_train, y_train)

        fi_df = pd.DataFrame({'var': x_train.columns, 'fi': model.feature_importances_})
        fi_df["fi_rank"] = fi_df["fi"].rank(ascending=False)
        fi_df["cv"] = cv
        fi_frames.append(fi_df)

        preds = model.predict(x_test)
        bacc_list.append(balanced_accuracy_score(y_test, preds))
        kappa_list.append(cohen_kappa_score(y_test, preds))
        iter_cv.append(cv)

    results_df = pd.DataFrame({'cv': iter_cv, 'bacc': bacc_list, 'kappa': kappa_list})
    # Concatenate once instead of growing a frame inside the loop.
    fi_all = pd.concat(fi_frames) if fi_frames else pd.DataFrame()
    return results_df, fi_all

## XGBoost

In [44]:
# 10-fold cross-validation with the function's default XGBoost model.
cv_xgb, cv_fi_xgb = cross_validation()
cv_xgb

Unnamed: 0,cv,bacc,kappa
0,0,0.779762,0.810316
1,1,0.733135,0.686727
2,2,0.553571,0.513158
3,3,0.758929,0.804749
4,4,0.895833,0.935484
5,5,0.58631,0.675439
6,6,0.643849,0.582103
7,7,0.644841,0.611208
8,8,0.780754,0.773601
9,9,0.300595,0.381162


In [45]:
# Mean and standard deviation of both metrics across folds (XGBoost).
cv_xgb[["bacc", "kappa"]].agg(['mean', 'std'])

Unnamed: 0,bacc,kappa
mean,0.667758,0.677395
std,0.165158,0.162629


## Random forest

In [46]:
# 10-fold cross-validation with a default random forest.
cv_rf, cv_fi_rf = cross_validation(model=RandomForestClassifier(random_state=123))
cv_rf

Unnamed: 0,cv,bacc,kappa
0,0,0.60119,0.620948
1,1,0.685516,0.687243
2,2,0.488839,0.383874
3,3,0.616071,0.706866
4,4,0.833333,0.903141
5,5,0.678571,0.678819
6,6,0.625992,0.518229
7,7,0.525794,0.482065
8,8,0.718254,0.675723
9,9,0.35119,0.312389


In [47]:
# Mean and standard deviation of both metrics across folds (random forest).
cv_rf[["bacc", "kappa"]].agg(['mean', 'std'])

Unnamed: 0,bacc,kappa
mean,0.612475,0.59693
std,0.133776,0.174242


## Save data

In [48]:
def preprocess_output(df, scenario, mod):
    """Tag a result table with its evaluation scenario and model class.

    The columns ``scenario`` and ``model_class`` are written onto ``df``
    itself (no copy is made) and that same object is returned.
    """
    for column, value in (("scenario", scenario), ("model_class", mod)):
        df[column] = value
    return df

# Tag every score table with its evaluation scenario and model class, then
# stack them into one frame. The tables have different key columns
# (dataset / model / cv), so each row only fills the key of its own scenario.
one_dataset_out_rf = preprocess_output(one_dataset_out_rf, "one-data-set-out", "RF")
one_dataset_out_xgb = preprocess_output(one_dataset_out_xgb, "one-data-set-out", "XGB")
one_model_out_rf = preprocess_output(one_model_out_rf, "one-model-out", "RF")
one_model_out_xgb = preprocess_output(one_model_out_xgb, "one-model-out", "XGB")
cv_rf = preprocess_output(cv_rf, "10-fold cross-validation", "RF")
cv_xgb = preprocess_output(cv_xgb, "10-fold cross-validation", "XGB")

all_bacc = pd.concat([one_dataset_out_rf, one_dataset_out_xgb,
                      one_model_out_rf, one_model_out_xgb,
                      cv_rf, cv_xgb])

In [49]:
# Persist the stacked score tables (row index not written).
all_bacc.to_csv("results/isolation_bacc_nn.csv", index=False)

In [50]:
# Tag every feature-importance table with its evaluation scenario and model
# class, stack them into one frame and write it to disk (row index not written).
one_dataset_out_fi_rf = preprocess_output(one_dataset_out_fi_rf, "one-data-set-out", "RF")
one_dataset_out_fi_xgb = preprocess_output(one_dataset_out_fi_xgb, "one-data-set-out", "XGB")
one_model_out_fi_rf = preprocess_output(one_model_out_fi_rf, "one-model-out", "RF")
one_model_out_fi_xgb = preprocess_output(one_model_out_fi_xgb, "one-model-out", "XGB")
cv_fi_rf = preprocess_output(cv_fi_rf, "10-fold cross-validation", "RF")
cv_fi_xgb = preprocess_output(cv_fi_xgb, "10-fold cross-validation", "XGB")

all_fi = pd.concat([one_dataset_out_fi_rf, one_dataset_out_fi_xgb,
                      one_model_out_fi_rf, one_model_out_fi_xgb,
                      cv_fi_rf, cv_fi_xgb])

all_fi.to_csv("results/isolation_fi_nn.csv", index=False)