In [7]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import *
from utils import *

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [8]:
data_filepath = "data/by_type/t2/counts_ctc_simulated_123_5k_t2.tsv"
true_results_filepath = "data/by_type/t2/ids_ctc_simulated_123_5k_t2.tsv"
train_indices_filepath = "data/by_type/t2/train_indices.npy"
test_indices_filepath = "data/by_type/t2/test_indices.npy"
SEED = 42
FOLD_NUMBER = 3

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max
data_object = Data(data_filepath, true_results_filepath)
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [9]:
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

### Defined classifiers

In [10]:
# Logistic Regression
log_clf = LogisticRegression(random_state=SEED,
                             class_weight='balanced',
                             penalty='l1',
                             C=0.0005,
                             solver='liblinear')

# XGBoost
xgb_clf = XGBClassifier(random_state=SEED,
                        booster='gbtree',
                        eta=0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5)

# K-Nearest Neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5,
                               algorithm='kd_tree',
                               weights='distance')

# Isolation Forest
if_clf = IsolationForest(max_features=1,
                         n_estimators=1000,
                         bootstrap=True,
                         max_samples=1000,
                         n_jobs=12,
                         random_state=SEED)

# SVM
svm_clf = svm.SVC(kernel='linear',
                  class_weight='balanced',
                  probability=True,
                  random_state=SEED)

# Light GBM
lgbm_clf = LGBMClassifier(boosting_type='gbdt',
                          min_child_weight=0.001,
                          max_depth=10,
                          random_state=SEED,)

# Random Forest
rf_clf = RandomForestClassifier(random_state=SEED,
                                n_estimators=2000,
                                criterion="log_loss")

# Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(random_state=SEED,
                                         n_estimators=2000)

In [11]:
names = ["Logistic Regression", "XGBoost", "K-Nearest Neighbors", "Isolated Forest",
         "SVM", "Light GBM", "Random Forest", "Balanced Random Forest"]
    
classifiers = [log_clf, xgb_clf, knn_clf, if_clf, svm_clf, lgbm_clf, rf_clf, brf_clf]

### Regular data

In [14]:
run_all_models(classifiers, names, train_data, train_true_results, test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.977778,1.0,0.999495,0.999494,0.999486
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.948103,0.977368,0.988619,0.982961
Light GBM,0.977778,1.0,0.999495,0.999494,0.999486
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Scaled data

In [15]:
run_all_models(classifiers, names, scaled_train_data, train_true_results, scaled_test_data, test_true_results)

Features number: 2000
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.866803,0.998485,0.998483,0.998419
XGBoost,0.977778,1.0,0.999495,0.999494,0.999486
K-Nearest Neighbors,0.499233,0.499233,0.977351,0.987102,0.982202
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.710344,0.749271,0.990593,0.991907,0.990669
Light GBM,0.966667,1.0,0.999242,0.999241,0.999228
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Cut by max data

In [16]:
run_all_models(classifiers, names, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results)

Features number: 64
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Random Forest, Balanced Random Forest
Best precision: Light GBM
Best recall: Light GBM
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.922222,1.0,0.998235,0.99823,0.998087
K-Nearest Neighbors,0.588889,0.764833,0.990731,0.990642,0.987345
Isolated Forest,0.975697,0.998277,0.990834,0.951947,0.967678
SVM,0.660749,0.713601,0.982006,0.937279,0.957771
Light GBM,0.977778,0.999983,0.999495,0.999494,0.999486
Random Forest,0.933333,1.0,0.998485,0.998483,0.998419
Balanced Random Forest,0.999488,1.0,0.999079,0.998988,0.999012


### Pca reduced + cut by max data

In [17]:
run_all_models(classifiers, names, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results)

Features number: 60
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Light GBM, Random Forest, Balanced Random Forest
Best precision: Balanced Random Forest
Best recall: Balanced Random Forest
Best F1 score: Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.877778,0.999906,0.997226,0.997218,0.997022
K-Nearest Neighbors,0.588889,0.76485,0.990731,0.990642,0.987345
Isolated Forest,0.961371,0.998482,0.990122,0.923622,0.951508
SVM,0.652307,0.72965,0.981699,0.920587,0.948621
Light GBM,0.9,1.0,0.99773,0.997724,0.997563
Random Forest,0.922222,1.0,0.998233,0.99823,0.998152
Balanced Random Forest,0.999488,1.0,0.999079,0.998988,0.999012


### P-values regular data

In [18]:
statistics = calculate_statistics(train_data, train_true_results.values.ravel())
statistics = statistics[statistics["p_values"] < 0.05]
p_values_regular_train_data = train_data[statistics.index]
p_values_regular_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_regular_train_data, train_true_results, p_values_regular_test_data, test_true_results)

Features number: 637
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.911111,1.0,0.997983,0.997977,0.997829
K-Nearest Neighbors,0.5,0.577778,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.577778,0.891481,0.990483,0.990389,0.986841
Light GBM,0.944444,1.0,0.998738,0.998735,0.998685
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values scaled data

In [19]:
p_values_scaled_train_data = scaled_train_data[statistics.index]
p_values_scaled_test_data = scaled_test_data[statistics.index]

run_all_models(classifiers, names, p_values_scaled_train_data, train_true_results, p_values_scaled_test_data, test_true_results)

Features number: 637
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.933333,0.866803,0.998485,0.998483,0.998419
XGBoost,0.911111,1.0,0.997983,0.997977,0.997829
K-Nearest Neighbors,0.699233,0.843319,0.990333,0.991654,0.990374
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.821455,0.799864,0.99401,0.994436,0.994087
Light GBM,0.944444,1.0,0.998738,0.998735,0.998685
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### P-values cut by max

In [20]:
statistics = calculate_statistics(cut_by_max_train_data, train_true_results.values.ravel())
statistics = statistics[statistics["p_values"] < 0.05]
p_values_cut_by_max_train_data = train_data[statistics.index]
p_values_cut_by_max_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_cut_by_max_train_data, train_true_results, p_values_cut_by_max_test_data, test_true_results)

Features number: 30
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: Light GBM
Best recall: Light GBM
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.966667,1.0,0.999242,0.999241,0.999228
K-Nearest Neighbors,0.633205,0.919596,0.991251,0.991401,0.988804
Isolated Forest,0.977872,0.998891,0.991009,0.956247,0.970174
SVM,0.637546,0.69745,0.98152,0.826252,0.892685
Light GBM,0.988889,1.0,0.999747,0.999747,0.999743
Random Forest,0.966667,1.0,0.999242,0.999241,0.999228
Balanced Random Forest,0.998849,1.0,0.998123,0.997724,0.997832


### P-values pca reduced

In [21]:
statistics = calculate_statistics(pca_reduced_train_data, train_true_results.values.ravel())
statistics = statistics[statistics["p_values"] < 0.05]
p_values_pca_reduced_train_data = train_data[statistics.index]
p_values_pca_reduced_test_data = test_data[statistics.index]

run_all_models(classifiers, names, p_values_pca_reduced_train_data, train_true_results, p_values_pca_reduced_test_data, test_true_results)

Features number: 27
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Random Forest, Balanced Random Forest
Best precision: Random Forest
Best recall: Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.844444,0.999403,0.996474,0.996459,0.996066
K-Nearest Neighbors,0.644061,0.929564,0.990178,0.991148,0.988885
Isolated Forest,0.928754,0.993587,0.989475,0.85913,0.91433
SVM,0.65246,0.778255,0.982012,0.790592,0.872309
Light GBM,0.922222,0.999181,0.998233,0.99823,0.998152
Random Forest,0.955556,1.0,0.99899,0.998988,0.998953
Balanced Random Forest,0.998593,1.0,0.997838,0.997218,0.997392


### Logistic regression coefficients

In [22]:
logistic_regression_features = pd.read_csv("features/logistic_regression.csv", index_col=0)
logistic_regression_features = logistic_regression_features[(logistic_regression_features != 0).all(1)]
logistic_regression_features = logistic_regression_features.sort_values(by="feature_importance_vals", ascending=False)

lg_train_data = train_data[logistic_regression_features.index[:25]]
lg_test_data = test_data[logistic_regression_features.index[:25]]
run_all_models(classifiers, names, lg_train_data, train_true_results, lg_test_data, test_true_results)

Features number: 25
Best balanced accuracy: Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Random Forest, Balanced Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.911111,0.999983,0.997982,0.997977,0.997854
K-Nearest Neighbors,0.5,0.622222,0.977368,0.988619,0.982961
Isolated Forest,0.999488,1.0,0.999079,0.998988,0.999012
SVM,0.812245,0.782212,0.987753,0.976227,0.981092
Light GBM,0.922222,1.0,0.998235,0.99823,0.998087
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### First PCA component coefficients

In [23]:
pca_features = pd.read_csv("features/pca.csv", index_col=0)
pca_features = pca_features[(pca_features != 0).all(1)]
pca_features.sort_values(by="feature_importance_vals", ascending=False)

pca_train_data = train_data[pca_features.index[:100]]
pca_test_data = test_data[pca_features.index[:100]]
run_all_models(classifiers, names, pca_train_data, train_true_results, pca_test_data, test_true_results)

Features number: 100
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Balanced Random Forest
Best recall: Balanced Random Forest
Best F1 score: Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.9,1.0,0.99773,0.997724,0.997565
K-Nearest Neighbors,0.5,0.587831,0.977368,0.988619,0.982961
Isolated Forest,0.918138,1.0,0.989369,0.838139,0.901855
SVM,0.537537,0.650294,0.978976,0.975974,0.977454
Light GBM,0.955556,1.0,0.99899,0.998988,0.998953
Random Forest,0.988889,1.0,0.999747,0.999747,0.999743
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### XGBoost feature importance

In [24]:
pca_reduced_xgboost_features = pd.read_csv("features/pca_reduced_xgboost.csv", index_col=0)
pca_reduced_xgboost_features = pca_reduced_xgboost_features[(pca_reduced_xgboost_features != 0).all(1)]
pca_reduced_xgboost_features.sort_values(by="feature_importance_vals", ascending=False)

xgb_train_data = train_data[pca_reduced_xgboost_features.index]
xgb_test_data = test_data[pca_reduced_xgboost_features.index]
run_all_models(classifiers, names, xgb_train_data, train_true_results, xgb_test_data, test_true_results)

Features number: 11
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Isolated Forest, Random Forest
Best precision: Random Forest
Best recall: Random Forest
Best F1 score: Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.855556,0.999983,0.996723,0.996712,0.996433
K-Nearest Neighbors,0.611111,0.888198,0.991228,0.991148,0.988267
Isolated Forest,0.974546,1.0,0.990722,0.949671,0.966344
SVM,0.581555,0.474682,0.980139,0.758978,0.845894
Light GBM,0.911111,0.999983,0.997983,0.997977,0.997823
Random Forest,0.944444,1.0,0.998737,0.998735,0.998695
Balanced Random Forest,0.996291,0.999778,0.995625,0.992666,0.993584


### LightGBM feature importance

In [25]:
pca_reduced_lightgbm_features = pd.read_csv("features/pca_reduced_lightgbm.csv", index_col=0)
pca_reduced_lightgbm_features = pca_reduced_lightgbm_features[(pca_reduced_lightgbm_features != 0).all(1)]
pca_reduced_lightgbm_features.sort_values(by="feature_importance_vals", ascending=False)

lgbm_train_data = train_data[pca_reduced_lightgbm_features.index]
lgbm_test_data = test_data[pca_reduced_lightgbm_features.index]
run_all_models(classifiers, names, lgbm_train_data, train_true_results, lgbm_test_data, test_true_results)

Features number: 55
Best balanced accuracy: Balanced Random Forest
Best ROC AUC: XGBoost, Light GBM, Random Forest, Balanced Random Forest
Best precision: Balanced Random Forest
Best recall: Balanced Random Forest
Best F1 score: Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.855556,1.0,0.996724,0.996712,0.996405
K-Nearest Neighbors,0.566539,0.753014,0.988967,0.989884,0.986214
Isolated Forest,0.949987,0.998243,0.989805,0.901113,0.938686
SVM,0.631747,0.614155,0.981119,0.923369,0.95006
Light GBM,0.922222,1.0,0.998233,0.99823,0.998152
Random Forest,0.888889,1.0,0.997477,0.997471,0.99731
Balanced Random Forest,0.999488,1.0,0.999079,0.998988,0.999012


### Random forest feature importance

In [26]:
random_forest_regular_features = pd.read_csv("features/regular_random_forest.csv", index_col=0)
random_forest_regular_features = random_forest_regular_features[(random_forest_regular_features != 0).all(1)]
random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

rf_train_data = train_data[random_forest_regular_features.index[:50]]
rf_test_data = test_data[random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, rf_train_data, train_true_results, rf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.922222,1.0,0.998233,0.99823,0.998142
K-Nearest Neighbors,0.5,0.522222,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.710472,0.628856,0.991056,0.99216,0.990867
Light GBM,0.977778,1.0,0.999495,0.999494,0.999486
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Balanced random forest feature importance

In [27]:
balanced_random_forest_regular_features = pd.read_csv("features/regular_balanced_random_forest.csv", index_col=0)
balanced_random_forest_regular_features = balanced_random_forest_regular_features[(balanced_random_forest_regular_features != 0).all(1)]
balanced_random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

brf_train_data = train_data[balanced_random_forest_regular_features.index[:50]]
brf_test_data = test_data[balanced_random_forest_regular_features.index[:50]]
run_all_models(classifiers, names, brf_train_data, train_true_results, brf_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Light GBM, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.955556,1.0,0.99899,0.998988,0.998962
K-Nearest Neighbors,0.5,0.6,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.695907,0.630408,0.986097,0.985078,0.985487
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with biggest error for autoencoder 

In [28]:
autoencoder_features = pd.read_csv("features/autoencoder_new.csv", index_col=0)
autoencoder_features.sort_values(by="feature_importance_vals", ascending=False)

autoencoder_train_data = train_data[autoencoder_features.index[:5]]
autoencoder_test_data = test_data[autoencoder_features.index[:5]]
run_all_models(classifiers, names, autoencoder_train_data, train_true_results, autoencoder_test_data, test_true_results)

Features number: 5
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.911111,1.0,0.997984,0.997977,0.99778
K-Nearest Neighbors,0.866667,0.911077,0.996974,0.996965,0.996734
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.788633,0.800153,0.994457,0.994689,0.994071
Light GBM,0.966667,0.977778,0.999242,0.999241,0.999228
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Features with the highest variance between

In [7]:
variance_between_features = pd.read_csv("features/regular_variance_between.csv", index_col=0)
variance_between_features.sort_values(by="feature_importance_vals", ascending=False)

variance_between_train_data = train_data[variance_between_features.index[:50]]
variance_between_test_data = test_data[variance_between_features.index[:50]]
run_all_models(classifiers, names, variance_between_train_data, train_true_results, variance_between_test_data, test_true_results)

Features number: 50
Best balanced accuracy: Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Random Forest, Balanced Random Forest
Best recall: Random Forest, Balanced Random Forest
Best F1 score: Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.911111,1.0,0.997981,0.997977,0.997864
K-Nearest Neighbors,0.5,0.588889,0.977368,0.988619,0.982961
Isolated Forest,0.999744,1.0,0.999526,0.999494,0.999502
SVM,0.754532,0.69097,0.99159,0.992413,0.991672
Light GBM,0.966667,1.0,0.999242,0.999241,0.999219
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Filtered by zero values

In [12]:
healthy_cells_train_indices = np.where(train_true_results == 0)[0]
healthy_train_data = train_data.iloc[healthy_cells_train_indices]

filtered_by_zero_values_train_data = train_data
filtered_by_zero_values_test_data = test_data
number_of_deleted_columns = 0

for (columnName, columnData) in train_data.iteritems():
    num_of_zero_values = (train_data.iloc[healthy_cells_train_indices][columnName]==0).sum()
    if num_of_zero_values > 0.97*len(train_data[columnName]):
        filtered_by_zero_values_train_data.drop(columns=[columnName], axis=1, inplace=True)
        filtered_by_zero_values_test_data.drop(columns=[columnName], axis=1, inplace=True)
        number_of_deleted_columns += 1

print(f'Number of deleted columns: {number_of_deleted_columns}')

Number of deleted columns: 714


In [13]:
run_all_models(classifiers, names, filtered_by_zero_values_train_data, train_true_results, filtered_by_zero_values_test_data, test_true_results)

Features number: 1286
Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: XGBoost, Isolated Forest, Light GBM, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.5,0.5,0.977368,0.988619,0.982961
XGBoost,0.988889,1.0,0.999747,0.999747,0.999743
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.835064,0.977368,0.988619,0.982961
Light GBM,0.988889,1.0,0.999747,0.999747,0.999743
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0
