In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import *
from utils import *

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_filepath = "data/by_type/t1/counts_ctc_simulated_123_5k_t1.tsv"
true_results_filepath = "data/by_type/t1/ids_ctc_simulated_123_5k_t1.tsv"
train_indices_filepath = "data/by_type/t1/train_indices.npy"
test_indices_filepath = "data/by_type/t1/test_indices.npy"
SEED = 42
FOLD_NUMBER = 3

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max
data_object = Data(data_filepath, true_results_filepath)
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

### Defined classifiers

In [4]:
# Logistic Regression
log_clf = LogisticRegression(random_state=SEED,
                             class_weight='balanced',
                             penalty='l1',
                             C=0.0005,
                             solver='liblinear')

# XGBoost
xgb_clf = XGBClassifier(random_state=SEED,
                        booster='gbtree',
                        eta=0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5)

# K-Nearest Neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5,
                               algorithm='kd_tree',
                               weights='distance')

# Isolation Forest
if_clf = IsolationForest(max_features=1,
                         n_estimators=1000,
                         bootstrap=True,
                         max_samples=1000,
                         n_jobs=12,
                         random_state=SEED)

# SVM
svm_clf = svm.SVC(kernel='linear',
                  class_weight='balanced',
                  probability=True,
                  random_state=SEED)

# Light GBM
lgbm_clf = LGBMClassifier(boosting_type='gbdt',
                          min_child_weight=0.001,
                          max_depth=10,
                          random_state=SEED,)

# Random Forest
rf_clf = RandomForestClassifier(random_state=SEED,
                                n_estimators=2000,
                                criterion="log_loss")

# Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(random_state=SEED,
                                         n_estimators=2000)

In [5]:
names = ["Logistic Regression", "XGBoost", "K-Nearest Neighbors", "Isolated Forest",
         "SVM", "Light GBM", "Random Forest", "Balanced Random Forest"]
    
classifiers = [log_clf, xgb_clf, knn_clf, if_clf, svm_clf, lgbm_clf, rf_clf, brf_clf]

### Regular data

In [14]:
run_all_models(classifiers, names, train_data, train_true_results, test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.966667,1.0,0.999242,0.999241,0.999228
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.892112,0.977368,0.988619,0.982961
Light GBM,0.966667,1.0,0.999242,0.999241,0.999228
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Scaled data

In [15]:
run_all_models(classifiers, names, scaled_train_data, train_true_results, scaled_test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.889145,0.998485,0.998483,0.998419
XGBoost,0.966667,1.0,0.999242,0.999241,0.999228
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.688889,0.772022,0.992969,0.992919,0.991323
Light GBM,0.944444,1.0,0.998738,0.998735,0.998685
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Cut by max data

In [16]:
run_all_models(classifiers, names, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results)

Features number: 64
Best balanced accuracy: Random Forest
Best ROC AUC: XGBoost, Random Forest, Balanced Random Forest
Best precision: Random Forest
Best recall: Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.966667,1.0,0.999243,0.999241,0.9992
K-Nearest Neighbors,0.544444,0.776823,0.989739,0.989631,0.985303
Isolated Forest,0.968278,0.999045,0.990388,0.937279,0.95927
SVM,0.577616,0.690577,0.979758,0.946636,0.962312
Light GBM,0.999616,0.999983,0.999289,0.999241,0.999253
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.99936,1.0,0.99887,0.998735,0.998771


### Pca reduced + cut by max data

In [17]:
run_all_models(classifiers, names, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results)

Features number: 60
Best balanced accuracy: Random Forest
Best ROC AUC: XGBoost, Random Forest, Balanced Random Forest
Best precision: Random Forest
Best recall: Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.877778,1.0,0.997226,0.997218,0.997009
K-Nearest Neighbors,0.588889,0.832267,0.990731,0.990642,0.987299
Isolated Forest,0.94922,0.999011,0.989791,0.899595,0.937813
SVM,0.665976,0.717626,0.982136,0.925898,0.951426
Light GBM,0.988889,0.999983,0.999747,0.999747,0.999743
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.999744,1.0,0.999526,0.999494,0.999502


### P-values regular data

In [18]:
statistics = calculate_statistics(train_data, train_true_results.values.ravel())
statistics = statistics[statistics["p_values"] < 0.05]
p_values_regular_train_data = train_data[statistics.index]
p_values_regular_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_regular_train_data, train_true_results, p_values_regular_test_data, test_true_results)

Features number: 681
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.955556,1.0,0.99899,0.998988,0.998962
K-Nearest Neighbors,0.533333,0.766667,0.985698,0.989378,0.984703
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.633205,0.889963,0.99109,0.991401,0.989006
Light GBM,0.955556,1.0,0.99899,0.998988,0.998962
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values scaled data

In [19]:
p_values_scaled_train_data = scaled_train_data[statistics.index]
p_values_scaled_test_data = scaled_test_data[statistics.index]

run_all_models(classifiers, names, p_values_scaled_train_data, train_true_results, p_values_scaled_test_data, test_true_results)

Features number: 681
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.889145,0.998485,0.998483,0.998419
XGBoost,0.955556,1.0,0.99899,0.998988,0.998962
K-Nearest Neighbors,0.644444,0.744308,0.991973,0.991907,0.989681
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.766667,0.801143,0.994717,0.994689,0.993888
Light GBM,0.922222,1.0,0.998233,0.99823,0.998142
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values cut by max

In [20]:
statistics = calculate_statistics(cut_by_max_train_data, train_true_results.values.ravel())
statistics = statistics[statistics["p_values"] < 0.05]
p_values_cut_by_max_train_data = train_data[statistics.index]
p_values_cut_by_max_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_cut_by_max_train_data, train_true_results, p_values_cut_by_max_test_data, test_true_results)

Features number: 36
Best balanced accuracy: Light GBM
Best ROC AUC: Random Forest
Best precision: Random Forest
Best recall: Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.955428,0.999966,0.998736,0.998735,0.998713
K-Nearest Neighbors,0.588889,0.787576,0.99073,0.990642,0.987384
Isolated Forest,0.935661,0.989716,0.989556,0.872787,0.922354
SVM,0.702328,0.687559,0.983085,0.867476,0.918901
Light GBM,0.999616,0.999966,0.999289,0.999241,0.999253
Random Forest,0.988889,1.0,0.999747,0.999747,0.999743
Balanced Random Forest,0.99936,0.999983,0.99887,0.998735,0.998771


### P-values pca reduced

In [21]:
statistics = calculate_statistics(pca_reduced_train_data, train_true_results.values.ravel())
statistics = statistics[statistics["p_values"] < 0.05]
p_values_pca_reduced_train_data = train_data[statistics.index]
p_values_pca_reduced_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_pca_reduced_train_data, train_true_results, p_values_pca_reduced_test_data, test_true_results)

Features number: 35
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Random Forest, Balanced Random Forest
Best precision: Light GBM, Random Forest
Best recall: Light GBM, Random Forest
Best F1 score: Light GBM, Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.9,0.999881,0.997729,0.997724,0.997599
K-Nearest Neighbors,0.588889,0.78761,0.99073,0.990642,0.987384
Isolated Forest,0.921335,0.983713,0.989396,0.844461,0.905654
SVM,0.702328,0.687814,0.983085,0.867476,0.918896
Light GBM,0.911111,0.999949,0.997981,0.997977,0.997875
Random Forest,0.911111,1.0,0.997981,0.997977,0.997875
Balanced Random Forest,0.996546,1.0,0.99574,0.993171,0.993951


### Logistic regression coefficients

In [22]:
logistic_regression_features = pd.read_csv("features/logistic_regression.csv", index_col=0)
logistic_regression_features = logistic_regression_features[(logistic_regression_features != 0).all(1)]
logistic_regression_features = logistic_regression_features.sort_values(by="feature_importance_vals", ascending=False)

lg_train_data = train_data[logistic_regression_features.index[:25]]
lg_test_data = test_data[logistic_regression_features.index[:25]]
run_all_models(classifiers, names, lg_train_data, train_true_results, lg_test_data, test_true_results)

Features number: 25
Best balanced accuracy: Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Random Forest, Balanced Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.966667,0.999949,0.999242,0.999241,0.999228
K-Nearest Neighbors,0.5,0.555445,0.977368,0.988619,0.982961
Isolated Forest,0.999616,1.0,0.999289,0.999241,0.999253
SVM,0.749544,0.698832,0.988597,0.982549,0.984912
Light GBM,0.966667,1.0,0.999242,0.999241,0.999228
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### First PCA component coefficients

In [23]:
pca_features = pd.read_csv("features/pca.csv", index_col=0)
pca_features = pca_features[(pca_features != 0).all(1)]
pca_features.sort_values(by="feature_importance_vals", ascending=False)

pca_train_data = train_data[pca_features.index[:100]]
pca_test_data = test_data[pca_features.index[:100]]
run_all_models(classifiers, names, pca_train_data, train_true_results, pca_test_data, test_true_results)

Features number: 100
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Light GBM
Best recall: Light GBM
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.911111,1.0,0.997981,0.997977,0.997875
K-Nearest Neighbors,0.733333,0.92212,0.993968,0.99393,0.992815
Isolated Forest,0.896265,1.0,0.989219,0.794891,0.875294
SVM,0.638288,0.759359,0.981624,0.958017,0.968952
Light GBM,0.977778,1.0,0.999495,0.999494,0.999476
Random Forest,0.966667,1.0,0.999242,0.999241,0.999219
Balanced Random Forest,0.999233,1.0,0.998661,0.998483,0.998529


### XGBoost feature importance

In [24]:
pca_reduced_xgboost_features = pd.read_csv("features/pca_reduced_xgboost.csv", index_col=0)
pca_reduced_xgboost_features = pca_reduced_xgboost_features[(pca_reduced_xgboost_features != 0).all(1)]
pca_reduced_xgboost_features.sort_values(by="feature_importance_vals", ascending=False)

xgb_train_data = train_data[pca_reduced_xgboost_features.index]
xgb_test_data = test_data[pca_reduced_xgboost_features.index]
run_all_models(classifiers, names, xgb_train_data, train_true_results, xgb_test_data, test_true_results)

Features number: 11
Best balanced accuracy: XGBoost, Light GBM, Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Light GBM, Random Forest
Best recall: XGBoost, Light GBM, Random Forest
Best F1 score: XGBoost, Light GBM, Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.722222,0.944086,0.993718,0.993677,0.992443
Isolated Forest,0.966232,1.0,0.990281,0.933232,0.956963
SVM,0.584941,0.559546,0.980361,0.678806,0.795826
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.998721,1.0,0.997937,0.997471,0.997598


### LightGBM feature importance

In [25]:
pca_reduced_lightgbm_features = pd.read_csv("features/pca_reduced_lightgbm.csv", index_col=0)
pca_reduced_lightgbm_features = pca_reduced_lightgbm_features[(pca_reduced_lightgbm_features != 0).all(1)]
pca_reduced_lightgbm_features.sort_values(by="feature_importance_vals", ascending=False)

lgbm_train_data = train_data[pca_reduced_lightgbm_features.index]
lgbm_test_data = test_data[pca_reduced_lightgbm_features.index]
run_all_models(classifiers, names, lgbm_train_data, train_true_results, lgbm_test_data, test_true_results)

Features number: 55
Best balanced accuracy: Light GBM, Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: Light GBM, Random Forest
Best recall: Light GBM, Random Forest
Best F1 score: Light GBM, Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.988889,1.0,0.999747,0.999747,0.999743
K-Nearest Neighbors,0.655556,0.876772,0.992222,0.99216,0.990105
Isolated Forest,0.932847,0.998499,0.989525,0.867223,0.919077
SVM,0.697135,0.736233,0.982933,0.922357,0.949714
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,0.999744,1.0,0.999526,0.999494,0.999502


### Random forest feature importance

In [26]:
random_forest_regular_features = pd.read_csv("features/regular_random_forest.csv", index_col=0)
random_forest_regular_features = random_forest_regular_features[(random_forest_regular_features != 0).all(1)]
random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

rf_train_data = train_data[random_forest_regular_features.index[:50]]
rf_test_data = test_data[random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, rf_train_data, train_true_results, rf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.955556,1.0,0.99899,0.998988,0.998962
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.70932,0.730349,0.989484,0.989884,0.989242
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Balanced random forest feature importance

In [27]:
balanced_random_forest_regular_features = pd.read_csv("features/regular_balanced_random_forest.csv", index_col=0)
balanced_random_forest_regular_features = balanced_random_forest_regular_features[(balanced_random_forest_regular_features != 0).all(1)]
balanced_random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

brf_train_data = train_data[balanced_random_forest_regular_features.index[:50]]
brf_test_data = test_data[balanced_random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, brf_train_data, train_true_results, brf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Light GBM, Random Forest, Balanced Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.977778,1.0,0.999495,0.999494,0.999486
K-Nearest Neighbors,0.611111,0.866581,0.991227,0.991148,0.988339
Isolated Forest,0.999872,1.0,0.999763,0.999747,0.999751
SVM,0.724252,0.590296,0.985261,0.975974,0.98014
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with biggest error for autoencoder 

In [28]:
autoencoder_features = pd.read_csv("features/autoencoder_new.csv", index_col=0)
autoencoder_features.sort_values(by="feature_importance_vals", ascending=False)

autoencoder_train_data = train_data[autoencoder_features.index[:5]]
autoencoder_test_data = test_data[autoencoder_features.index[:5]]
run_all_models(classifiers, names, autoencoder_train_data, train_true_results, autoencoder_test_data, test_true_results)

Features number: 5
Best balanced accuracy: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best recall: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: XGBoost, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.954532,0.999233,0.997113,0.996965,0.997012
Isolated Forest,0.999744,1.0,0.999526,0.999494,0.999502
SVM,0.676882,0.799488,0.989778,0.990895,0.989331
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with the highest variance between

In [6]:
variance_between_features = pd.read_csv("features/regular_variance_between.csv", index_col=0)
variance_between_features.sort_values(by="feature_importance_vals", ascending=False)

variance_between_train_data = train_data[variance_between_features.index[:50]]
variance_between_test_data = test_data[variance_between_features.index[:50]]
run_all_models(classifiers, names, variance_between_train_data, train_true_results, variance_between_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Light GBM, Random Forest, Balanced Random Forest
Best recall: Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.944444,1.0,0.998739,0.998735,0.998654
K-Nearest Neighbors,0.5,0.633333,0.977368,0.988619,0.982961
Isolated Forest,0.999744,1.0,0.999554,0.999494,0.99951
SVM,0.782621,0.664381,0.987735,0.982802,0.984905
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Filtered by zero values

In [7]:
healthy_cells_train_indices = np.where(train_true_results == 0)[0]
healthy_train_data = train_data.iloc[healthy_cells_train_indices]

filtered_by_zero_values_train_data = train_data
filtered_by_zero_values_test_data = test_data
number_of_deleted_columns = 0

for (columnName, columnData) in train_data.iteritems():
    num_of_zero_values = (train_data.iloc[healthy_cells_train_indices][columnName]==0).sum()
    if num_of_zero_values > 0.97*len(train_data[columnName]):
        filtered_by_zero_values_train_data.drop(columns=[columnName], axis=1, inplace=True)
        filtered_by_zero_values_test_data.drop(columns=[columnName], axis=1, inplace=True)
        number_of_deleted_columns += 1

print(f'Number of deleted columns: {number_of_deleted_columns}')

Number of deleted columns: 714


In [8]:
run_all_models(classifiers, names, filtered_by_zero_values_train_data, train_true_results, filtered_by_zero_values_test_data, test_true_results)

Features number: 1286
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.966667,1.0,0.999242,0.999241,0.999228
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.499872,0.885393,0.977365,0.988366,0.982835
Light GBM,0.966667,1.0,0.999242,0.999241,0.999228
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0
