In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import *
from utils import *

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_filepath = "data/by_type/t1/counts_ctc_simulated_123_5k_t1.tsv"
true_results_filepath = "data/by_type/t1/ids_ctc_simulated_123_5k_t1.tsv"
train_indices_filepath = "data/by_type/t1/train_indices.npy"
test_indices_filepath = "data/by_type/t1/test_indices.npy"
SEED = 42
FOLD_NUMBER = 3

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60
FILTER_BY_ZERO_THRESHOLD = 0.97

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max
data_object = Data(data_filepath, true_results_filepath)
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

In [7]:
filtered_by_zero_values_train_data, filtered_by_zero_values_test_data, filtered_by_zero_healthy_train_data, filtered_by_zero_healthy_test_data, cancer_train_data, cancer_test_data = data_object.get_filtered_by_zero_data(train_data, test_data, FILTER_BY_ZERO_THRESHOLD)

Number of deleted columns: 714


### Defined classifiers

In [4]:
# Logistic Regression
log_clf = LogisticRegression(random_state=SEED,
                             class_weight='balanced',
                             penalty='l1',
                             C=0.0005,
                             solver='liblinear')

# XGBoost
xgb_clf = XGBClassifier(random_state=SEED,
                        booster='gbtree',
                        eta=0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5)

# K-Nearest Neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5,
                               algorithm='kd_tree',
                               weights='distance')

# Isolation Forest
if_clf = IsolationForest(max_features=1,
                         n_estimators=1000,
                         bootstrap=True,
                         max_samples=1000,
                         n_jobs=12,
                         random_state=SEED)

# SVM
svm_clf = svm.SVC(kernel='linear',
                  class_weight='balanced',
                  probability=True,
                  random_state=SEED)

# Light GBM
lgbm_clf = LGBMClassifier(boosting_type='gbdt',
                          min_child_weight=0.001,
                          max_depth=10,
                          random_state=SEED,)

# Random Forest
rf_clf = RandomForestClassifier(random_state=SEED,
                                n_estimators=2000,
                                criterion="log_loss")

# Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(random_state=SEED,
                                         n_estimators=2000)

In [5]:
names = ["Logistic Regression", "XGBoost", "K-Nearest Neighbors", "Isolated Forest",
         "SVM", "Light GBM", "Random Forest", "Balanced Random Forest"]
    
classifiers = [log_clf, xgb_clf, knn_clf, if_clf, svm_clf, lgbm_clf, rf_clf, brf_clf]

### Regular data

In [6]:
run_all_models(classifiers, names, train_data, train_true_results, test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.892112,0.0,0.0,0.0
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Scaled data

In [7]:
run_all_models(classifiers, names, scaled_train_data, train_true_results, scaled_test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Logistic Regression, XGBoost, Isolated Forest, SVM, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.889145,1.0,0.866667,0.927659
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.688889,0.772022,1.0,0.377778,0.547619
Light GBM,0.944444,1.0,1.0,0.888889,0.939974
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Cut by max data

In [8]:
run_all_models(classifiers, names, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results)

Features number: 64
Best balanced accuracy: Random Forest
Best ROC AUC: XGBoost, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,1.0,1.0,0.933333,0.962963
K-Nearest Neighbors,0.544444,0.776823,1.0,0.088889,0.161765
Isolated Forest,0.968278,0.999045,1.0,0.936557,0.96722
SVM,0.577616,0.690577,0.050572,0.2,0.079531
Light GBM,0.999616,0.999983,0.9375,1.0,0.967742
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.99936,1.0,0.900735,1.0,0.947581


### Pca reduced + cut by max data

In [9]:
run_all_models(classifiers, names, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results)

Features number: 60
Best balanced accuracy: Random Forest
Best ROC AUC: XGBoost, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.877778,1.0,1.0,0.755556,0.859259
K-Nearest Neighbors,0.588889,0.832267,1.0,0.177778,0.293129
Isolated Forest,0.94922,0.999011,1.0,0.898439,0.946462
SVM,0.665976,0.717626,0.065266,0.4,0.108758
Light GBM,0.988889,0.999983,1.0,0.977778,0.988506
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.999744,1.0,0.958333,1.0,0.978495


### P-values regular data

In [10]:
statistics = calculate_statistics(train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_regular_train_data = train_data[statistics.index]
p_values_regular_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_regular_train_data, train_true_results, p_values_regular_test_data, test_true_results)

Features number: 1540
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.922222,1.0,1.0,0.844444,0.915344
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.511111,0.895608,0.333333,0.022222,0.041667
Light GBM,0.933333,1.0,1.0,0.866667,0.928571
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values scaled data

In [11]:
p_values_scaled_train_data = scaled_train_data[statistics.index]
p_values_scaled_test_data = scaled_test_data[statistics.index]

run_all_models(classifiers, names, p_values_scaled_train_data, train_true_results, p_values_scaled_test_data, test_true_results)

Features number: 1540
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Logistic Regression, XGBoost, Isolated Forest, SVM, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.889145,1.0,0.866667,0.927659
XGBoost,0.922222,1.0,1.0,0.844444,0.915344
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.677778,0.762412,1.0,0.355556,0.521303
Light GBM,0.9,1.0,1.0,0.8,0.888889
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values cut by max

In [12]:
statistics = calculate_statistics(cut_by_max_train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_cut_by_max_train_data = train_data[statistics.index]
p_values_cut_by_max_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_cut_by_max_train_data, train_true_results, p_values_cut_by_max_test_data, test_true_results)

Features number: 39
Best balanced accuracy: Random Forest
Best ROC AUC: Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.922222,0.999949,1.0,0.844444,0.915344
K-Nearest Neighbors,0.6,0.876823,1.0,0.2,0.323529
Isolated Forest,0.978127,0.999284,1.0,0.956255,0.97763
SVM,0.694653,0.723066,0.042102,0.533333,0.077783
Light GBM,0.999616,0.999966,0.9375,1.0,0.967742
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.998721,1.0,0.820433,1.0,0.900735


### P-values pca reduced

In [13]:
statistics = calculate_statistics(pca_reduced_train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_pca_reduced_train_data = train_data[statistics.index]
p_values_pca_reduced_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_pca_reduced_train_data, train_true_results, p_values_pca_reduced_test_data, test_true_results)

Features number: 37
Best balanced accuracy: Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.922222,1.0,1.0,0.844444,0.909524
K-Nearest Neighbors,0.6,0.876533,1.0,0.2,0.323529
Isolated Forest,0.968406,0.998567,1.0,0.936812,0.967343
SVM,0.705509,0.725744,0.043741,0.555556,0.080865
Light GBM,0.988889,1.0,1.0,0.977778,0.988506
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.998081,1.0,0.751253,1.0,0.85761


### Logistic regression coefficients

In [14]:
logistic_regression_features = pd.read_csv("features/logistic_regression.csv", index_col=0)
logistic_regression_features = logistic_regression_features[(logistic_regression_features != 0).all(1)]
logistic_regression_features = logistic_regression_features.sort_values(by="feature_importance_vals", ascending=False)

lg_train_data = train_data[logistic_regression_features.index[:25]]
lg_test_data = test_data[logistic_regression_features.index[:25]]
run_all_models(classifiers, names, lg_train_data, train_true_results, lg_test_data, test_true_results)

Features number: 25
Best balanced accuracy: Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,0.999949,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.555445,0.0,0.0,0.0
Isolated Forest,0.999616,1.0,1.0,0.999233,0.999616
SVM,0.749544,0.698832,0.489478,0.511111,0.446008
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### First PCA component coefficients

In [15]:
pca_features = pd.read_csv("features/pca.csv", index_col=0)
pca_features = pca_features[(pca_features != 0).all(1)]
pca_features.sort_values(by="feature_importance_vals", ascending=False)

pca_train_data = train_data[pca_features.index[:100]]
pca_test_data = test_data[pca_features.index[:100]]
run_all_models(classifiers, names, pca_train_data, train_true_results, pca_test_data, test_true_results)

Features number: 100
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.911111,1.0,1.0,0.822222,0.902116
K-Nearest Neighbors,0.733333,0.92212,1.0,0.466667,0.634481
Isolated Forest,0.896265,1.0,1.0,0.79253,0.884218
SVM,0.638288,0.759359,0.092576,0.311111,0.142007
Light GBM,0.977778,1.0,1.0,0.955556,0.97619
Random Forest,0.966667,1.0,1.0,0.933333,0.964696
Balanced Random Forest,0.999233,1.0,0.882353,1.0,0.9375


### XGBoost feature importance

In [16]:
pca_reduced_xgboost_features = pd.read_csv("features/pca_reduced_xgboost.csv", index_col=0)
pca_reduced_xgboost_features = pca_reduced_xgboost_features[(pca_reduced_xgboost_features != 0).all(1)]
pca_reduced_xgboost_features.sort_values(by="feature_importance_vals", ascending=False)

xgb_train_data = train_data[pca_reduced_xgboost_features.index]
xgb_test_data = test_data[pca_reduced_xgboost_features.index]
run_all_models(classifiers, names, xgb_train_data, train_true_results, xgb_test_data, test_true_results)

Features number: 11
Best balanced accuracy: XGBoost, Light GBM, Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: XGBoost, Light GBM, Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.722222,0.944086,1.0,0.444444,0.612836
Isolated Forest,0.966232,1.0,1.0,0.932464,0.965047
SVM,0.584941,0.559546,0.018324,0.488889,0.035205
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.998721,1.0,0.818713,1.0,0.900178


### LightGBM feature importance

In [17]:
pca_reduced_lightgbm_features = pd.read_csv("features/pca_reduced_lightgbm.csv", index_col=0)
pca_reduced_lightgbm_features = pca_reduced_lightgbm_features[(pca_reduced_lightgbm_features != 0).all(1)]
pca_reduced_lightgbm_features.sort_values(by="feature_importance_vals", ascending=False)

lgbm_train_data = train_data[pca_reduced_lightgbm_features.index]
lgbm_test_data = test_data[pca_reduced_lightgbm_features.index]
run_all_models(classifiers, names, lgbm_train_data, train_true_results, lgbm_test_data, test_true_results)

Features number: 55
Best balanced accuracy: Light GBM, Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Light GBM, Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.988889,1.0,1.0,0.977778,0.988506
K-Nearest Neighbors,0.655556,0.876772,1.0,0.311111,0.473684
Isolated Forest,0.932847,0.998499,1.0,0.865695,0.92796
SVM,0.697135,0.736233,0.069211,0.466667,0.119565
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.999744,1.0,0.958333,1.0,0.978495


### Random forest feature importance

In [18]:
random_forest_regular_features = pd.read_csv("features/regular_random_forest.csv", index_col=0)
random_forest_regular_features = random_forest_regular_features[(random_forest_regular_features != 0).all(1)]
random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

rf_train_data = train_data[random_forest_regular_features.index[:50]]
rf_test_data = test_data[random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, rf_train_data, train_true_results, rf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.955556,1.0,1.0,0.911111,0.953202
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.70932,0.730349,0.652039,0.422222,0.498737
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Balanced random forest feature importance

In [19]:
balanced_random_forest_regular_features = pd.read_csv("features/regular_balanced_random_forest.csv", index_col=0)
balanced_random_forest_regular_features = balanced_random_forest_regular_features[(balanced_random_forest_regular_features != 0).all(1)]
balanced_random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

brf_train_data = train_data[balanced_random_forest_regular_features.index[:50]]
brf_test_data = test_data[balanced_random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, brf_train_data, train_true_results, brf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.977778,1.0,1.0,0.955556,0.977011
K-Nearest Neighbors,0.611111,0.866581,1.0,0.222222,0.362573
Isolated Forest,0.999872,1.0,1.0,0.999744,0.999872
SVM,0.724252,0.590296,0.245022,0.466667,0.317477
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with biggest error for autoencoder 

In [20]:
autoencoder_features = pd.read_csv("features/autoencoder_new.csv", index_col=0)
autoencoder_features.sort_values(by="feature_importance_vals", ascending=False)

autoencoder_train_data = train_data[autoencoder_features.index[:5]]
autoencoder_test_data = test_data[autoencoder_features.index[:5]]
run_all_models(classifiers, names, autoencoder_train_data, train_true_results, autoencoder_test_data, test_true_results)

Features number: 5
Best balanced accuracy: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: XGBoost, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.954532,0.999233,0.835294,0.911111,0.870833
Isolated Forest,0.999744,1.0,1.0,0.999488,0.999744
SVM,0.676882,0.799488,0.742424,0.355556,0.461421
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with the highest variance between

In [6]:
variance_between_features = pd.read_csv("features/regular_variance_between.csv", index_col=0)
variance_between_features.sort_values(by="feature_importance_vals", ascending=False)

variance_between_train_data = train_data[variance_between_features.index[:50]]
variance_between_test_data = test_data[variance_between_features.index[:50]]
run_all_models(classifiers, names, variance_between_train_data, train_true_results, variance_between_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.944444,1.0,1.0,0.888889,0.937224
K-Nearest Neighbors,0.5,0.633333,0.0,0.0,0.0
Isolated Forest,0.999744,1.0,1.0,0.999488,0.999744
SVM,0.782621,0.664381,0.347606,0.577778,0.432169
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Filtered by zero values for regular data

In [8]:
run_all_models(classifiers, names, filtered_by_zero_values_train_data, train_true_results, filtered_by_zero_values_test_data, test_true_results)

Features number: 1286
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.499872,0.885393,0.0,0.0,0.0
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0
