In [20]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import *
from utils import *

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [21]:
data_filepath = "data/by_type/t2/counts_ctc_simulated_123_5k_t2.tsv"
true_results_filepath = "data/by_type/t2/ids_ctc_simulated_123_5k_t2.tsv"
train_indices_filepath = "data/by_type/t2/train_indices.npy"
test_indices_filepath = "data/by_type/t2/test_indices.npy"
SEED = 42
FOLD_NUMBER = 3

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60
FILTER_BY_ZERO_THRESHOLD = 0.97

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max
data_object = Data(data_filepath, true_results_filepath)
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

In [4]:
filtered_by_zero_values_train_data, filtered_by_zero_values_test_data, filtered_by_zero_healthy_train_data, filtered_by_zero_healthy_test_data, cancer_train_data, cancer_test_data = data_object.get_filtered_by_zero_data(train_data, test_data, FILTER_BY_ZERO_THRESHOLD)

Number of deleted columns: 714


### Defined classifiers

In [5]:
# Logistic Regression
log_clf = LogisticRegression(random_state=SEED,
                             class_weight='balanced',
                             penalty='l1',
                             C=0.0005,
                             solver='liblinear')

# XGBoost
xgb_clf = XGBClassifier(random_state=SEED,
                        booster='gbtree',
                        eta=0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5)

# K-Nearest Neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5,
                               algorithm='kd_tree',
                               weights='distance')

# Isolation Forest
if_clf = IsolationForest(max_features=1,
                         n_estimators=1000,
                         bootstrap=True,
                         max_samples=1000,
                         n_jobs=12,
                         random_state=SEED)

# SVM
svm_clf = svm.SVC(kernel='linear',
                  class_weight='balanced',
                  probability=True,
                  random_state=SEED)

# Light GBM
lgbm_clf = LGBMClassifier(boosting_type='gbdt',
                          min_child_weight=0.001,
                          max_depth=10,
                          random_state=SEED,)

# Random Forest
rf_clf = RandomForestClassifier(random_state=SEED,
                                n_estimators=2000,
                                criterion="log_loss")

# Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(random_state=SEED,
                                         n_estimators=2000)

In [6]:
names = ["Logistic Regression", "XGBoost", "K-Nearest Neighbors", "Isolated Forest",
         "SVM", "Light GBM", "Random Forest", "Balanced Random Forest"]
    
classifiers = [log_clf, xgb_clf, knn_clf, if_clf, svm_clf, lgbm_clf, rf_clf, brf_clf]

### Regular data

In [6]:
run_all_models(classifiers, names, train_data, train_true_results, test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.977778,1.0,1.0,0.955556,0.977011
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.948103,0.0,0.0,0.0
Light GBM,0.977778,1.0,1.0,0.955556,0.977011
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Scaled data

In [7]:
run_all_models(classifiers, names, scaled_train_data, train_true_results, scaled_test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Logistic Regression, XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.866803,1.0,0.866667,0.927659
XGBoost,0.977778,1.0,1.0,0.955556,0.977011
K-Nearest Neighbors,0.499233,0.499233,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.710344,0.749271,0.748148,0.422222,0.534762
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Cut by max data

In [8]:
run_all_models(classifiers, names, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results)

Features number: 64
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.922222,1.0,1.0,0.844444,0.909524
K-Nearest Neighbors,0.588889,0.764833,1.0,0.177778,0.297214
Isolated Forest,0.975697,0.998277,1.0,0.951394,0.975075
SVM,0.660749,0.713601,0.071626,0.377778,0.119164
Light GBM,0.977778,0.999983,1.0,0.955556,0.977011
Random Forest,0.933333,1.0,1.0,0.866667,0.927659
Balanced Random Forest,0.999488,1.0,0.919118,1.0,0.957661


### Pca reduced + cut by max data

In [9]:
run_all_models(classifiers, names, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results)

Features number: 60
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Isolated Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.877778,0.999906,1.0,0.755556,0.860399
K-Nearest Neighbors,0.588889,0.76485,1.0,0.177778,0.297214
Isolated Forest,0.961371,0.998482,1.0,0.922742,0.959781
SVM,0.652307,0.72965,0.057583,0.377778,0.099295
Light GBM,0.9,1.0,1.0,0.8,0.885714
Random Forest,0.922222,1.0,1.0,0.844444,0.915344
Balanced Random Forest,0.999488,1.0,0.919118,1.0,0.957661


### P-values regular data

In [10]:
statistics = calculate_statistics(train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_regular_train_data = train_data[statistics.index]
p_values_regular_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_regular_train_data, train_true_results, p_values_regular_test_data, test_true_results)

Features number: 1516
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.511111,0.941673,0.333333,0.022222,0.041667
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values scaled data

In [11]:
p_values_scaled_train_data = scaled_train_data[statistics.index]
p_values_scaled_test_data = scaled_test_data[statistics.index]

run_all_models(classifiers, names, p_values_scaled_train_data, train_true_results, p_values_scaled_test_data, test_true_results)

Features number: 1516
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Logistic Regression, XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.866803,1.0,0.866667,0.927659
XGBoost,0.966667,1.0,1.0,0.933333,0.965517
K-Nearest Neighbors,0.499233,0.499233,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.721455,0.798806,0.764021,0.444444,0.559293
Light GBM,0.966667,1.0,1.0,0.933333,0.965517
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values cut by max

In [12]:
statistics = calculate_statistics(cut_by_max_train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_cut_by_max_train_data = train_data[statistics.index]
p_values_cut_by_max_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_cut_by_max_train_data, train_true_results, p_values_cut_by_max_test_data, test_true_results)

Features number: 25
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Isolated Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,0.999983,1.0,0.933333,0.965517
K-Nearest Neighbors,0.687994,0.872977,0.702742,0.377778,0.483627
Isolated Forest,0.971476,0.999915,1.0,0.942952,0.970625
SVM,0.578486,0.662932,0.018553,0.4,0.035456
Light GBM,0.977522,0.999949,0.956944,0.955556,0.955531
Random Forest,0.955556,1.0,1.0,0.911111,0.951469
Balanced Random Forest,0.997953,1.0,0.749269,1.0,0.853558


### P-values pca reduced

In [13]:
statistics = calculate_statistics(pca_reduced_train_data, train_true_results)
statistics = statistics[statistics["p-values"] < 0.05]
p_values_pca_reduced_train_data = train_data[statistics.index]
p_values_pca_reduced_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_pca_reduced_train_data, train_true_results, p_values_pca_reduced_test_data, test_true_results)

Features number: 24
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: XGBoost, Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.966667,0.999889,1.0,0.933333,0.965517
K-Nearest Neighbors,0.688121,0.872985,0.742424,0.377778,0.490842
Isolated Forest,0.960732,0.999847,1.0,0.921463,0.959104
SVM,0.578486,0.662983,0.018549,0.4,0.035448
Light GBM,0.966667,0.999983,1.0,0.933333,0.965517
Random Forest,0.888889,1.0,1.0,0.777778,0.874644
Balanced Random Forest,0.998209,0.999932,0.772947,1.0,0.869219


### Logistic regression coefficients

In [14]:
logistic_regression_features = pd.read_csv("features/logistic_regression.csv", index_col=0)
logistic_regression_features = logistic_regression_features[(logistic_regression_features != 0).all(1)]
logistic_regression_features = logistic_regression_features.sort_values(by="feature_importance_vals", ascending=False)

lg_train_data = train_data[logistic_regression_features.index[:25]]
lg_test_data = test_data[logistic_regression_features.index[:25]]
run_all_models(classifiers, names, lg_train_data, train_true_results, lg_test_data, test_true_results)

Features number: 25
Best balanced accuracy: Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.911111,0.999983,1.0,0.822222,0.900187
K-Nearest Neighbors,0.5,0.622222,0.0,0.0,0.0
Isolated Forest,0.999488,1.0,1.0,0.998977,0.999488
SVM,0.812245,0.782212,0.285394,0.644444,0.39211
Light GBM,0.922222,1.0,1.0,0.844444,0.909524
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### First PCA component coefficients

In [15]:
pca_features = pd.read_csv("features/pca.csv", index_col=0)
pca_features = pca_features[(pca_features != 0).all(1)]
pca_features.sort_values(by="feature_importance_vals", ascending=False)

pca_train_data = train_data[pca_features.index[:100]]
pca_test_data = test_data[pca_features.index[:100]]
run_all_models(classifiers, names, pca_train_data, train_true_results, pca_test_data, test_true_results)

Features number: 100
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Balanced Random Forest
Best F1 score: Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.9,1.0,1.0,0.8,0.885942
K-Nearest Neighbors,0.5,0.587831,0.0,0.0,0.0
Isolated Forest,0.918138,1.0,1.0,0.836275,0.910814
SVM,0.537537,0.650294,0.066834,0.088889,0.076253
Light GBM,0.955556,1.0,1.0,0.911111,0.952381
Random Forest,0.988889,1.0,1.0,0.977778,0.988506
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### XGBoost feature importance

In [16]:
pca_reduced_xgboost_features = pd.read_csv("features/pca_reduced_xgboost.csv", index_col=0)
pca_reduced_xgboost_features = pca_reduced_xgboost_features[(pca_reduced_xgboost_features != 0).all(1)]
pca_reduced_xgboost_features.sort_values(by="feature_importance_vals", ascending=False)

xgb_train_data = train_data[pca_reduced_xgboost_features.index]
xgb_test_data = test_data[pca_reduced_xgboost_features.index]
run_all_models(classifiers, names, xgb_train_data, train_true_results, xgb_test_data, test_true_results)

Features number: 11
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Isolated Forest, Random Forest
Best precision: XGBoost, K-Nearest Neighbors, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Isolated Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.855556,0.999983,1.0,0.711111,0.830769
K-Nearest Neighbors,0.611111,0.888198,1.0,0.222222,0.356209
Isolated Forest,0.974546,1.0,1.0,0.949092,0.973879
SVM,0.581555,0.474682,0.049281,0.4,0.077058
Light GBM,0.911111,0.999983,1.0,0.822222,0.897436
Random Forest,0.944444,1.0,1.0,0.888889,0.940887
Balanced Random Forest,0.996291,0.999778,0.615588,1.0,0.759775


### LightGBM feature importance

In [17]:
pca_reduced_lightgbm_features = pd.read_csv("features/pca_reduced_lightgbm.csv", index_col=0)
pca_reduced_lightgbm_features = pca_reduced_lightgbm_features[(pca_reduced_lightgbm_features != 0).all(1)]
pca_reduced_lightgbm_features.sort_values(by="feature_importance_vals", ascending=False)

lgbm_train_data = train_data[pca_reduced_lightgbm_features.index]
lgbm_test_data = test_data[pca_reduced_lightgbm_features.index]
run_all_models(classifiers, names, lgbm_train_data, train_true_results, lgbm_test_data, test_true_results)

Features number: 55
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest
Best recall: Balanced Random Forest
Best F1 score: Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.855556,1.0,1.0,0.711111,0.828348
K-Nearest Neighbors,0.566539,0.753014,0.888889,0.133333,0.230937
Isolated Forest,0.949987,0.998243,1.0,0.899974,0.947321
SVM,0.631747,0.614155,0.050854,0.333333,0.0881
Light GBM,0.922222,1.0,1.0,0.844444,0.915344
Random Forest,0.888889,1.0,1.0,0.777778,0.874644
Balanced Random Forest,0.999488,1.0,0.919118,1.0,0.957661


### Random forest feature importance

In [18]:
random_forest_regular_features = pd.read_csv("features/regular_random_forest.csv", index_col=0)
random_forest_regular_features = random_forest_regular_features[(random_forest_regular_features != 0).all(1)]
random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

rf_train_data = train_data[random_forest_regular_features.index[:50]]
rf_test_data = test_data[random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, rf_train_data, train_true_results, rf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.922222,1.0,1.0,0.844444,0.914432
K-Nearest Neighbors,0.5,0.522222,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.710472,0.628856,0.7886,0.422222,0.541015
Light GBM,0.977778,1.0,1.0,0.955556,0.977011
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Balanced random forest feature importance

In [19]:
balanced_random_forest_regular_features = pd.read_csv("features/regular_balanced_random_forest.csv", index_col=0)
balanced_random_forest_regular_features = balanced_random_forest_regular_features[(balanced_random_forest_regular_features != 0).all(1)]
balanced_random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

brf_train_data = train_data[balanced_random_forest_regular_features.index[:50]]
brf_test_data = test_data[balanced_random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, brf_train_data, train_true_results, brf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.955556,1.0,1.0,0.911111,0.953202
K-Nearest Neighbors,0.5,0.6,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.695907,0.630408,0.378968,0.4,0.381046
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with biggest error for autoencoder 

In [19]:
autoencoder_features = pd.read_csv("features/autoencoder_new.csv", index_col=0)
autoencoder_features.sort_values(by="feature_importance_vals", ascending=False)

autoencoder_train_data = train_data[autoencoder_features.index[:50]]
autoencoder_test_data = test_data[autoencoder_features.index[:50]]
run_all_models(classifiers, names, autoencoder_train_data, train_true_results, autoencoder_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.933333,0.933333,1.0,0.866667,0.925729
K-Nearest Neighbors,0.5,0.644444,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.644317,0.335414,0.888889,0.288889,0.431217
Light GBM,0.933333,0.933333,1.0,0.866667,0.925729
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with the highest variance between

In [22]:
variance_between_features = pd.read_csv("features/regular_variance_between.csv", index_col=0)
variance_between_features.sort_values(by="feature_importance_vals", ascending=False)

variance_between_train_data = train_data[variance_between_features.index[:50]]
variance_between_test_data = test_data[variance_between_features.index[:50]]
run_all_models(classifiers, names, variance_between_train_data, train_true_results, variance_between_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.911111,1.0,1.0,0.822222,0.901099
K-Nearest Neighbors,0.5,0.588889,0.0,0.0,0.0
Isolated Forest,0.999744,1.0,1.0,0.999488,0.999744
SVM,0.754532,0.69097,0.748077,0.511111,0.600994
Light GBM,0.966667,1.0,1.0,0.933333,0.964696
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Filtered by zero values for regular data

In [8]:
run_all_models(classifiers, names, filtered_by_zero_values_train_data, train_true_results, filtered_by_zero_values_test_data, test_true_results)

Features number: 1286
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.0,0.0,0.0
XGBoost,0.988889,1.0,1.0,0.977778,0.988506
K-Nearest Neighbors,0.5,0.5,0.0,0.0,0.0
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.835064,0.0,0.0,0.0
Light GBM,0.988889,1.0,1.0,0.977778,0.988506
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0
