In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import *

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# --- Run-wide settings ------------------------------------------------------
SEED = 42
FOLD_NUMBER = 3  # NOTE(review): not referenced in the cells shown here

# Preprocessing knobs: tune these if a model benefits from different values.
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# --- Input files ------------------------------------------------------------
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# --- Data loading -----------------------------------------------------------
# Four variants are compared below: regular, scaled, cut by max,
# and PCA-reduced + cut by max.
data_object = Data(data_filepath, true_results_filepath)

# To (re)generate the train/test index files in the main folder, run once:
# data_object.generate_train_test_split()
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)

# Scaled variant; requested only after the split has been loaded.
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
# "Cut by max" variant of the train/test data, controlled by CUT_BY_MAX_THRESHOLD.
cut_by_max_train_data, cut_by_max_test_data = (
    data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)
)

# The PCA helper is built from training data only; the variables it ranks
# highest on PC1 are then used to subset both the train and the test frame.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)

# Same column selection for both splits so their features stay aligned.
selected_variables = pca_variables.index
pca_reduced_train_data = cut_by_max_train_data[selected_variables]
pca_reduced_test_data = cut_by_max_test_data[selected_variables]

### Defined classifiers

In [4]:
# Model zoo compared in this notebook. Every estimator that accepts a seed is
# tied to the notebook-wide SEED so a "Restart & Run All" is governed by a
# single, explicit source of randomness.

# Logistic Regression: L1-penalised, class-balanced. The liblinear solver is
# one of the solvers that supports the 'l1' penalty, and it is seeded.
log_clf = LogisticRegression(random_state=SEED,
                             class_weight='balanced',
                             penalty='l1',
                             C=50,
                             solver='liblinear')

# XGBoost (tree booster); `eta` is the learning rate.
xgb_clf = XGBClassifier(booster='gbtree',
                        eta=0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5,
                        random_state=SEED)

# K-Nearest Neighbors with distance-weighted votes.
knn_clf = KNeighborsClassifier(n_neighbors=5,
                               algorithm='kd_tree',
                               weights='distance')

# Isolation Forest
# NOTE(review): this is an outlier detector, not a supervised classifier;
# presumably run_all_models maps its predictions onto the class labels — confirm.
# NOTE(review): max_features=1 is an int, i.e. one feature per tree
# (1.0 would mean all features) — confirm this is intended.
# n_jobs=12 is a fixed worker count; adjust it to the machine running this.
if_clf = IsolationForest(max_features=1,
                         n_estimators=1000,
                         bootstrap=True,
                         max_samples=1000,
                         n_jobs=12,
                         random_state=SEED)

# SVM: linear kernel, class-balanced.
svm_clf = svm.SVC(kernel='linear',
                  class_weight='balanced')

# Light GBM (gradient-boosted decision trees).
lgbm_clf = LGBMClassifier(boosting_type='gbdt',
                          min_child_weight=0.001,
                          max_depth=10,
                          random_state=SEED)

# Random Forest
rf_clf = RandomForestClassifier(random_state=SEED,
                                n_estimators=2000,
                                criterion="log_loss")

# Balanced Random Forest (imbalanced-learn)
brf_clf = BalancedRandomForestClassifier(random_state=SEED,
                                         n_estimators=2000)

In [5]:
# Single source of truth pairing each display label with its estimator, so the
# two parallel lists consumed by run_all_models cannot drift out of order.
model_registry = [
    ("Logistic Regression", log_clf),
    ("XGBoost", xgb_clf),
    ("K-Nearest Neighbors", knn_clf),
    ("Isolated Forest", if_clf),
    ("SVM", svm_clf),
    ("Light GBM", lgbm_clf),
    ("Random Forest", rf_clf),
    ("Balanced Random Forest", brf_clf),
]

names = [label for label, model in model_registry]
classifiers = [model for label, model in model_registry]

### Regular data

In [6]:
# Baseline run on the regular (untransformed) train/test data.
# NOTE(review): in the recorded output the ROC AUC column equals Balanced
# Accuracy in every row; confirm how run_all_models computes ROC AUC.
run_all_models(
    classifiers,
    names,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
)

Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.622094,0.622094,0.99021,0.991148,0.988488
XGBoost,0.922222,0.922222,0.998234,0.99823,0.998111
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.5,0.5,0.977368,0.988619,0.982961
Light GBM,0.944444,0.944444,0.998738,0.998735,0.998676
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Scaled data

In [7]:
# Same models and labels, evaluated on the scaled train/test data.
run_all_models(
    classifiers,
    names,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
)

Best balanced accuracy: Isolated Forest, Random Forest, Balanced Random Forest
Best ROC AUC: Isolated Forest, Random Forest, Balanced Random Forest
Best precision: Isolated Forest, Random Forest, Balanced Random Forest
Best recall: Isolated Forest, Random Forest, Balanced Random Forest
Best F1 score: Isolated Forest, Random Forest, Balanced Random Forest


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.799744,0.799744,0.994493,0.994942,0.994193
XGBoost,0.922222,0.922222,0.998234,0.99823,0.998111
K-Nearest Neighbors,0.5,0.5,0.977368,0.988619,0.982961
Isolated Forest,1.0,1.0,1.0,1.0,1.0
SVM,0.610599,0.610599,0.987264,0.990137,0.987361
Light GBM,0.966667,0.966667,0.999242,0.999241,0.999219
Random Forest,1.0,1.0,1.0,1.0,1.0
Balanced Random Forest,1.0,1.0,1.0,1.0,1.0


### Cut by max data

In [8]:
# Same models and labels, evaluated on the cut-by-max train/test data.
run_all_models(
    classifiers,
    names,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
)

Best balanced accuracy: Balanced Random Forest
Best ROC AUC: Balanced Random Forest
Best precision: Light GBM
Best recall: Light GBM
Best F1 score: Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.663162,0.663162,0.981982,0.920334,0.948545
XGBoost,0.977778,0.977778,0.999495,0.999494,0.999486
K-Nearest Neighbors,0.522222,0.522222,0.98545,0.989125,0.98416
Isolated Forest,0.980941,0.980941,0.991426,0.962317,0.973832
SVM,0.695344,0.695344,0.982844,0.918816,0.947847
Light GBM,0.988889,0.988889,0.999747,0.999747,0.999743
Random Forest,0.944444,0.944444,0.998737,0.998735,0.998695
Balanced Random Forest,0.998209,0.998209,0.997362,0.996459,0.996717


### PCA-reduced + cut by max data

In [9]:
# Same models and labels, evaluated on the cut-by-max data restricted to the
# variables selected from PC1.
run_all_models(
    classifiers,
    names,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
)

Best balanced accuracy: XGBoost, Light GBM
Best ROC AUC: XGBoost, Light GBM
Best precision: XGBoost, Light GBM
Best recall: XGBoost, Light GBM
Best F1 score: XGBoost, Light GBM


Unnamed: 0,Balanced Accuracy,ROC AUC,Precision,Recall,F1 score
Logistic Regression,0.673582,0.673582,0.982341,0.854072,0.910884
XGBoost,1.0,1.0,1.0,1.0,1.0
K-Nearest Neighbors,0.555556,0.555556,0.989987,0.989884,0.9858
Isolated Forest,0.955615,0.955615,0.989953,0.912241,0.945021
SVM,0.669745,0.669745,0.982242,0.846485,0.906305
Light GBM,1.0,1.0,1.0,1.0,1.0
Random Forest,0.955556,0.955556,0.99899,0.998988,0.998953
Balanced Random Forest,0.997058,0.997058,0.996206,0.994183,0.99479
