In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import *
from utils import *

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"
SEED = 42
FOLD_NUMBER = 3

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60
FILTER_BY_ZERO_THRESHOLD = 0.97

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max
data_object = Data(data_filepath, true_results_filepath)
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [6]:
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

In [7]:
filtered_by_zero_values_train_data, filtered_by_zero_values_test_data, filtered_by_zero_healthy_train_data, filtered_by_zero_healthy_test_data, cancer_train_data, cancer_test_data = data_object.get_filtered_by_zero_data(train_data, test_data, FILTER_BY_ZERO_THRESHOLD)

Number of deleted columns: 714


### Defined classifiers

In [3]:
# Logistic Regression
log_clf = LogisticRegression(random_state=SEED,
                             class_weight='balanced',
                             penalty='l1',
                             C=0.0005,
                             solver='liblinear')

# XGBoost
xgb_clf = XGBClassifier(random_state=SEED,
                        booster='gbtree',
                        eta=0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5)

# K-Nearest Neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5,
                               algorithm='kd_tree',
                               weights='distance')

# Isolation Forest
if_clf = IsolationForest(max_features=1,
                         n_estimators=1000,
                         bootstrap=True,
                         max_samples=1000,
                         n_jobs=12,
                         random_state=SEED)

# SVM
svm_clf = svm.SVC(kernel='linear',
                  class_weight='balanced',
                  probability=True,
                  random_state=SEED)

# Light GBM
lgbm_clf = LGBMClassifier(boosting_type='gbdt',
                          min_child_weight=0.001,
                          max_depth=10,
                          random_state=SEED,)

# Random Forest
rf_clf = RandomForestClassifier(random_state=SEED,
                                n_estimators=2000,
                                criterion="log_loss")

# Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(random_state=SEED,
                                         n_estimators=2000)

In [4]:
names = ["Logistic Regression", "XGBoost", "K-Nearest Neighbors", "Isolated Forest",
         "SVM", "Light GBM", "Random Forest", "Balanced Random Forest"]
    
classifiers = [log_clf, xgb_clf, knn_clf, if_clf, svm_clf, lgbm_clf, rf_clf, brf_clf]

### Regular data

In [6]:
run_all_models(classifiers, names, train_data, train_true_results, test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.922222,1.0,1.0,0.844444,0.911681
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.806447,0.0,0.0,0.0
Light GBM,0.944444,1.0,1.0,0.888889,0.939153
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Scaled data

In [7]:
run_all_models(classifiers, names, scaled_train_data, train_true_results, scaled_test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Logistic Regression, XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.966667,0.933419,1.0,0.933333,0.965517
XGBoost,0.922222,1.0,1.0,0.844444,0.911681
K-Nearest Neighbors,0.5,0.499488,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.610599,0.775799,0.652381,0.222222,0.321034
Light GBM,0.966667,1.0,1.0,0.933333,0.964696
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Cut by max data

In [8]:
run_all_models(classifiers, names, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results)

Features number: 66
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.977778,1.0,1.0,0.955556,0.977011
K-Nearest Neighbors,0.522222,0.742637,0.666667,0.044444,0.083333
Isolated Forest,0.980941,0.999966,1.0,0.961883,0.980525
SVM,0.695344,0.709798,0.064954,0.466667,0.113955
Light GBM,0.988889,1.0,1.0,0.977778,0.988506
Random Forest,0.944444,1.0,1.0,0.888889,0.940887
Balanced Random Forest,0.998209,0.999983,0.768208,1.0,0.867418


### Pca reduced + cut by max data

In [9]:
run_all_models(classifiers, names, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results)

Features number: 60
Best balanced accuracy: XGBoost, Light GBM
Best ROC AUC: XGBoost, Light GBM, Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: XGBoost, Light GBM, Balanced Random Forest
Best F1 score: XGBoost, Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.555556,0.809363,1.0,0.111111,0.194444
Isolated Forest,0.955615,0.999881,1.0,0.91123,0.953489
SVM,0.669745,0.472516,0.038587,0.488889,0.071315
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,0.955556,1.0,1.0,0.911111,0.952381
Balanced Random Forest,0.997058,0.999983,0.666667,1.0,0.798535


### P-values regular data

In [10]:
statistics = calculate_statistics(train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_regular_train_data = train_data[statistics.index]
p_values_regular_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_regular_train_data, train_true_results, p_values_regular_test_data, test_true_results)

Features number: 1348
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.511111,0.787345,0.333333,0.022222,0.041667
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values scaled data

In [11]:
p_values_scaled_train_data = scaled_train_data[statistics.index]
p_values_scaled_test_data = scaled_test_data[statistics.index]

run_all_models(classifiers, names, p_values_scaled_train_data, train_true_results, p_values_scaled_test_data, test_true_results)

Features number: 1348
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Logistic Regression, XGBoost, Isolated Forest, SVM, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.966667,0.933419,1.0,0.933333,0.965517
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.655556,0.756383,1.0,0.311111,0.457219
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values cut by max

In [12]:
statistics = calculate_statistics(cut_by_max_train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_cut_by_max_train_data = train_data[statistics.index]
p_values_cut_by_max_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_cut_by_max_train_data, train_true_results, p_values_cut_by_max_test_data, test_true_results)

Features number: 14
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Isolated Forest, Light GBM, Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Isolated Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,0.999966,1.0,0.933333,0.964696
K-Nearest Neighbors,0.577778,0.817566,1.0,0.155556,0.267974
Isolated Forest,0.992965,1.0,1.0,0.98593,0.992914
SVM,0.703863,0.739217,0.047373,0.533333,0.086919
Light GBM,0.988889,1.0,1.0,0.977778,0.988506
Random Forest,0.955556,1.0,1.0,0.911111,0.953202
Balanced Random Forest,0.997698,0.999881,0.717391,1.0,0.834586


### P-values pca reduced

In [13]:
statistics = calculate_statistics(pca_reduced_train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_pca_reduced_train_data = train_data[statistics.index]
p_values_pca_reduced_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_pca_reduced_train_data, train_true_results, p_values_pca_reduced_test_data, test_true_results)

Features number: 12
Best balanced accuracy: XGBoost, Light GBM
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: XGBoost, Light GBM, Balanced Random Forest
Best F1 score: XGBoost, Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.6,0.773565,1.0,0.2,0.329893
Isolated Forest,0.979023,1.0,1.0,0.958046,0.978564
SVM,0.658745,0.677872,0.031755,0.488889,0.059628
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,0.922222,1.0,1.0,0.844444,0.914432
Balanced Random Forest,0.996291,0.999505,0.612458,1.0,0.758366


### Logistic regression coefficients

In [14]:
logistic_regression_features = pd.read_csv("features/logistic_regression.csv", index_col=0)
logistic_regression_features = logistic_regression_features[(logistic_regression_features != 0).all(1)]
logistic_regression_features = logistic_regression_features.sort_values(by="feature_importance_vals", ascending=False)

lg_train_data = train_data[logistic_regression_features.index[:25]]
lg_test_data = test_data[logistic_regression_features.index[:25]]
run_all_models(classifiers, names, lg_train_data, train_true_results, lg_test_data, test_true_results)

Features number: 25
Best balanced accuracy: Light GBM, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Light GBM, Balanced Random Forest
Best F1 score: Light GBM, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,1.0,1.0,0.933333,0.964696
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,0.999616,1.0,1.0,0.999233,0.999616
SVM,0.698081,0.628106,0.546154,0.4,0.46
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,0.988889,1.0,1.0,0.977778,0.988506
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### First PCA component coefficients

In [15]:
pca_features = pd.read_csv("features/pca.csv", index_col=0)
pca_features = pca_features[(pca_features != 0).all(1)]
pca_features.sort_values(by="feature_importance_vals", ascending=False)

pca_train_data = train_data[pca_features.index[:100]]
pca_test_data = test_data[pca_features.index[:100]]
run_all_models(classifiers, names, pca_train_data, train_true_results, pca_test_data, test_true_results)

Features number: 100
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.9,1.0,1.0,0.8,0.885714
K-Nearest Neighbors,0.644444,0.806788,1.0,0.288889,0.447368
Isolated Forest,0.919417,1.0,1.0,0.838833,0.912277
SVM,0.570964,0.507018,0.042197,0.2,0.068555
Light GBM,0.977522,1.0,0.960784,0.955556,0.955357
Random Forest,0.933333,1.0,1.0,0.866667,0.927659
Balanced Random Forest,0.998465,1.0,0.790936,1.0,0.882862


### XGBoost feature importance

In [16]:
pca_reduced_xgboost_features = pd.read_csv("features/pca_reduced_xgboost.csv", index_col=0)
pca_reduced_xgboost_features = pca_reduced_xgboost_features[(pca_reduced_xgboost_features != 0).all(1)]
pca_reduced_xgboost_features.sort_values(by="feature_importance_vals", ascending=False)

xgb_train_data = train_data[pca_reduced_xgboost_features.index]
xgb_test_data = test_data[pca_reduced_xgboost_features.index]
run_all_models(classifiers, names, xgb_train_data, train_true_results, xgb_test_data, test_true_results)

Features number: 11
Best balanced accuracy: XGBoost, Light GBM
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest
Best recall: XGBoost, Light GBM, Balanced Random Forest
Best F1 score: XGBoost, Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.67765,0.887576,0.952381,0.355556,0.515152
Isolated Forest,0.975953,1.0,1.0,0.951906,0.975345
SVM,0.639797,0.456076,0.024518,0.511111,0.046467
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,0.988889,1.0,1.0,0.977778,0.988506
Balanced Random Forest,0.99693,0.999642,0.655487,1.0,0.790936


### LightGBM feature importance

In [17]:
pca_reduced_lightgbm_features = pd.read_csv("features/pca_reduced_lightgbm.csv", index_col=0)
pca_reduced_lightgbm_features = pca_reduced_lightgbm_features[(pca_reduced_lightgbm_features != 0).all(1)]
pca_reduced_lightgbm_features.sort_values(by="feature_importance_vals", ascending=False)

lgbm_train_data = train_data[pca_reduced_lightgbm_features.index]
lgbm_test_data = test_data[pca_reduced_lightgbm_features.index]
run_all_models(classifiers, names, lgbm_train_data, train_true_results, lgbm_test_data, test_true_results)

Features number: 55
Best balanced accuracy: XGBoost, Light GBM
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest
Best recall: XGBoost, Light GBM, Balanced Random Forest
Best F1 score: XGBoost, Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.622094,0.796504,0.933333,0.244444,0.378431
Isolated Forest,0.949348,0.999505,1.0,0.898695,0.946567
SVM,0.675467,0.483346,0.034343,0.533333,0.064348
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,0.977778,1.0,1.0,0.955556,0.977011
Balanced Random Forest,0.997058,1.0,0.667391,1.0,0.798872


### Random forest feature importance

In [18]:
random_forest_regular_features = pd.read_csv("features/regular_random_forest.csv", index_col=0)
random_forest_regular_features = random_forest_regular_features[(random_forest_regular_features != 0).all(1)]
random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

rf_train_data = train_data[random_forest_regular_features.index[:50]]
rf_test_data = test_data[random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, rf_train_data, train_true_results, rf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.955556,1.0,1.0,0.911111,0.951469
K-Nearest Neighbors,0.5,0.511111,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.732054,0.51222,0.704365,0.466667,0.553473
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Balanced random forest feature importance

In [19]:
balanced_random_forest_regular_features = pd.read_csv("features/regular_balanced_random_forest.csv", index_col=0)
balanced_random_forest_regular_features = balanced_random_forest_regular_features[(balanced_random_forest_regular_features != 0).all(1)]
balanced_random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

brf_train_data = train_data[balanced_random_forest_regular_features.index[:50]]
brf_test_data = test_data[balanced_random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, brf_train_data, train_true_results, brf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: XGBoost, Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: XGBoost, Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: XGBoost, Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.511111,0.698184,0.333333,0.022222,0.041667
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.830519,0.770785,0.60303,0.666667,0.628829
Light GBM,0.966667,1.0,1.0,0.933333,0.964696
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with biggest error for autoencoder 

In [20]:
autoencoder_features = pd.read_csv("features/autoencoder_new.csv", index_col=0)
autoencoder_features.sort_values(by="feature_importance_vals", ascending=False)

autoencoder_train_data = train_data[autoencoder_features.index[:5]]
autoencoder_test_data = test_data[autoencoder_features.index[:5]]
run_all_models(classifiers, names, autoencoder_train_data, train_true_results, autoencoder_test_data, test_true_results)

Features number: 5
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.977778,1.0,1.0,0.955556,0.97619
K-Nearest Neighbors,0.93295,0.988565,0.928327,0.866667,0.895676
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.60996,0.555692,0.559524,0.222222,0.312114
Light GBM,0.5,0.5,0.0,0.0,0.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with the highest variance between

In [5]:
variance_between_features = pd.read_csv("features/regular_variance_between.csv", index_col=0)
variance_between_features.sort_values(by="feature_importance_vals", ascending=False)

variance_between_train_data = train_data[variance_between_features.index[:50]]
variance_between_test_data = test_data[variance_between_features.index[:50]]
run_all_models(classifiers, names, variance_between_train_data, train_true_results, variance_between_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.933333,1.0,1.0,0.866667,0.928571
K-Nearest Neighbors,0.5,0.588889,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.708681,0.717626,0.482143,0.422222,0.442908
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Filtered by zero values for regular data

In [8]:
run_all_models(classifiers, names, filtered_by_zero_values_train_data, train_true_results, filtered_by_zero_values_test_data, test_true_results)

Features number: 1286
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.911111,1.0,1.0,0.822222,0.897436
K-Nearest Neighbors,0.5,0.583636,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.782638,0.0,0.0,0.0
Light GBM,0.933333,1.0,1.0,0.866667,0.925926
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0
