In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import Model

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

2022-12-28 00:23:57.501091: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# --- Input files (swap these paths to try the other datasets) ---
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# --- Seed and fold count handed to the estimators / Model wrappers below ---
SEED = 42
FOLD_NUMBER = 3

# --- Tunable preprocessing values (adjust them to suit the models) ---
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# Four data variants are evaluated for every model:
#   regular, scaled, cut by max, and PCA reduced + cut by max.
data_object = Data(data_filepath, true_results_filepath)

# The split index files are loaded from disk. To generate the train/test split
# index files in the main folder, run once:
#   data_object.generate_train_test_split()
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)

scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
# "Cut by max" variant, produced by the Data helper with CUT_BY_MAX_THRESHOLD.
cut_by_max_train_data, cut_by_max_test_data = (
    data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)
)

# "PCA reduced" variant: ask the PCA helper for PCA_VARIABLES_AMOUNT variables
# (per the method name, the most important ones for PC1). Only training data
# is passed to the helper.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)

# The same selection (pca_variables.index) is applied to both splits of the
# cut-by-max data.
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

## Logistic Regression

### Regular data

In [4]:
# Logistic regression on the regular data: balanced class weights and an L1
# penalty (liblinear is one of the solvers that supports L1), C=50.
log_clf = LogisticRegression(
    penalty='l1',
    C=50,
    solver='liblinear',
    class_weight='balanced',
    random_state=SEED,
)
logistic_regression_model_object = Model(
    log_clf,
    train_data, train_true_results,
    test_data, test_true_results,
    FOLD_NUMBER,
)
# Training and evaluation are delegated to Model.main_cycle() (simple_ml_models).
logistic_regression_model_object.main_cycle()

In [5]:
# Print the test-set metrics (balanced accuracy, ROC AUC, precision, recall,
# F1 score) for logistic regression on the regular data.
logistic_regression_model_object.display_test_results()

Test balanced accuracy 0.622094312270828

Test roc auc 0.5923936215570904

Test precision 0.8888888888888888

Test recall 0.24444444444444446

Test f1 score 0.37566137566137564


### Scaled data

In [6]:
# Logistic regression on the scaled data. C is the inverse regularisation
# strength, so C=0.0005 regularises far more strongly than the C=50 used for
# the other data variants.
log_clf = LogisticRegression(
    penalty='l1',
    C=0.0005,
    solver='liblinear',
    class_weight='balanced',
    random_state=SEED,
)
logistic_regression_model_object = Model(
    log_clf,
    scaled_train_data, train_true_results,
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
)
logistic_regression_model_object.main_cycle()

In [7]:
# Print the test-set metrics for logistic regression on the scaled data.
logistic_regression_model_object.display_test_results()

Test balanced accuracy 0.9666666666666667

Test roc auc 0.9334186066342628

Test precision 1.0

Test recall 0.9333333333333332

Test f1 score 0.9655172413793104


### Cut by max data

In [8]:
# Logistic regression on the cut-by-max data (same settings as the regular run).
log_clf = LogisticRegression(
    penalty='l1',
    C=50,
    solver='liblinear',
    class_weight='balanced',
    random_state=SEED,
)
logistic_regression_model_object = Model(
    log_clf,
    cut_by_max_train_data, train_true_results,
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
)
logistic_regression_model_object.main_cycle()

In [9]:
# Print the test-set metrics for logistic regression on the cut-by-max data.
logistic_regression_model_object.display_test_results()

Test balanced accuracy 0.663161933998465

Test roc auc 0.7018845399505415

Test precision 0.05925925925925926

Test recall 0.4000000000000001

Test f1 score 0.10293819655521781


### PCA reduced + cut by max data

In [10]:
# Logistic regression on the PCA-reduced + cut-by-max data (same settings as
# the regular run).
log_clf = LogisticRegression(
    penalty='l1',
    C=50,
    solver='liblinear',
    class_weight='balanced',
    random_state=SEED,
)
logistic_regression_model_object = Model(
    log_clf,
    pca_reduced_train_data, train_true_results,
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
)
logistic_regression_model_object.main_cycle()

In [11]:
# Print the test-set metrics for logistic regression on the PCA-reduced +
# cut-by-max data.
logistic_regression_model_object.display_test_results()

Test balanced accuracy 0.6735823313720474

Test roc auc 0.7002814018930673

Test precision 0.039240962195023654

Test recall 0.48888888888888893

Test f1 score 0.07246937747523302


## XGBoost

In [12]:
# XGBoost classifier shared by the four data-variant runs below.
# TODO: these hyper-parameters are provisional and still need tuning.
xgb_clf = XGBClassifier(
    booster='gbtree',     # tree-based model
    learning_rate=0.2,    # scikit-learn-API name of the native `eta` parameter
    min_child_weight=1,
    max_depth=5,
    gamma=0.5,
    random_state=SEED,
)

### Regular data

In [13]:
# XGBoost on the regular data.
xgboost_model_object = Model(
    xgb_clf,
    train_data, train_true_results,
    test_data, test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [14]:
# Print the test-set metrics for XGBoost on the regular data.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 1.0

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Scaled data

In [15]:
# XGBoost on the scaled data.
xgboost_model_object = Model(
    xgb_clf,
    scaled_train_data, train_true_results,
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [16]:
# Print the test-set metrics for XGBoost on the scaled data.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 1.0

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Cut by max data

In [17]:
# XGBoost on the cut-by-max data.
xgboost_model_object = Model(
    xgb_clf,
    cut_by_max_train_data, train_true_results,
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [18]:
# Print the test-set metrics for XGBoost on the cut-by-max data.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9777777777777779

Test roc auc 1.0

Test precision 1.0

Test recall 0.9555555555555556

Test f1 score 0.9770114942528737


### PCA reduced + cut by max data

In [19]:
# XGBoost on the PCA-reduced + cut-by-max data.
xgboost_model_object = Model(
    xgb_clf,
    pca_reduced_train_data, train_true_results,
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [20]:
# Print the test-set metrics for XGBoost on the PCA-reduced + cut-by-max data.
xgboost_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


## K-nearest neighbors
KNN provides four options for the neighbor-search algorithm: 'ball_tree', 'kd_tree', 'brute' and 'auto' (which attempts to determine the best approach from the training data), and two weight types: 'uniform' (the default) and 'distance' (which gave better results than 'uniform' on the data above). KNN did not turn out to be a sufficient model for this problem.

### KNN regular data
Precision and recall for the regular data were 0.0 for all algorithms and weight types.

In [21]:
# KNN on the regular data. `warnings` is already imported and silenced in the
# imports cell at the top of the notebook, so it is not repeated here.
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', weights='distance')
knn_model = Model(knn,
                  train_data,
                  train_true_results,
                  test_data,
                  test_true_results,
                  FOLD_NUMBER)
knn_model.main_cycle()

In [22]:
# Print the test-set metrics for KNN on the regular data.
knn_model.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.5

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### KNN Scaled Data

In [23]:
# KNN on the scaled data: 5 neighbours, kd-tree search, distance weighting.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='kd_tree',
)
knn_model = Model(
    knn,
    scaled_train_data, train_true_results,
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [24]:
# Print the test-set metrics for KNN on the scaled data.
knn_model.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.4994883601944231

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### KNN Cut by max data

In [25]:
# KNN on the cut-by-max data: 5 neighbours, kd-tree search, distance weighting.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='kd_tree',
)
knn_model = Model(
    knn,
    cut_by_max_train_data, train_true_results,
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [26]:
# Print the test-set metrics for KNN on the cut-by-max data.
knn_model.display_test_results()

Test balanced accuracy 0.5222222222222223

Test roc auc 0.7426366504647396

Test precision 0.6666666666666666

Test recall 0.044444444444444446

Test f1 score 0.08333333333333333


### KNN PCA reduced + cut by max data
This is the only data variant where validation precision and recall are non-zero, but both are still below 0.5. Therefore, these results are not acceptable.


In [27]:
# KNN on the PCA-reduced + cut-by-max data. Unlike the other KNN runs, this
# one uses the ball-tree search algorithm.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',
)
knn_model = Model(
    knn,
    pca_reduced_train_data, train_true_results,
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [28]:
# Print the test-set metrics for KNN on the PCA-reduced + cut-by-max data.
knn_model.display_test_results()

Test balanced accuracy 0.5555555555555555

Test roc auc 0.8093630084420568

Test precision 1.0

Test recall 0.1111111111111111

Test f1 score 0.19444444444444445


## Isolation Forest

In [29]:
# Isolation Forest shared by the four data-variant runs below.
# n_jobs=-1 uses all available cores instead of a hard-coded, machine-specific
# worker count.
# NOTE(review): max_features=1 is an int, so each tree is built on a single
# feature (the library default is the float 1.0, i.e. all features) — confirm
# this is intended.
isolation_forest = IsolationForest(
    n_estimators=1000,
    max_samples=1000,
    max_features=1,
    bootstrap=True,
    n_jobs=-1,
    random_state=SEED,
)

### Regular data

In [30]:
# Isolation Forest on the regular data. minus_one_one_values=True is passed
# for this estimator only (IsolationForest.predict returns -1/+1 labels).
isolation_forest_model_object = Model(
    isolation_forest,
    train_data, train_true_results,
    test_data, test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [31]:
# Print the test-set metrics for Isolation Forest on the regular data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Scaled data

In [32]:
# Isolation Forest on the scaled data.
isolation_forest_model_object = Model(
    isolation_forest,
    scaled_train_data, train_true_results,
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [33]:
# Print the test-set metrics for Isolation Forest on the scaled data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Cut by max data

In [34]:
# Isolation Forest on the cut-by-max data.
isolation_forest_model_object = Model(
    isolation_forest,
    cut_by_max_train_data, train_true_results,
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [35]:
# Print the test-set metrics for Isolation Forest on the cut-by-max data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 0.9809414172422614

Test roc auc 0.9999658906796283

Test precision 1.0

Test recall 0.9618828344845229

Test f1 score 0.9805246975324273


### PCA reduced + cut by max data

In [36]:
# Isolation Forest on the PCA-reduced + cut-by-max data.
isolation_forest_model_object = Model(
    isolation_forest,
    pca_reduced_train_data, train_true_results,
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [37]:
# Print the test-set metrics for Isolation Forest on the PCA-reduced +
# cut-by-max data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 0.9556152468662061

Test roc auc 0.9998806173786988

Test precision 1.0

Test recall 0.9112304937324124

Test f1 score 0.9534893621271526


## SVM

In [38]:
# Linear-kernel SVM shared by the four data-variant runs below.
# The estimator is only configured here; fitting is left to the Model wrapper,
# exactly as for every other estimator in this notebook. (A separate ad-hoc
# split/fit/predict in this cell would be discarded anyway, since each run
# below trains on a different feature set.)
svm_clf = svm.SVC(
    kernel='linear',
    class_weight='balanced',
    probability=True,
    random_state=SEED,
)

### Regular data

In [39]:
# SVM on the regular data. The training labels are flattened to a 1-D array
# with .values.ravel(); the test labels are passed as-is.
svm_model_object = Model(
    svm_clf,
    train_data, train_true_results.values.ravel(),
    test_data, test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [40]:
# Print the test-set metrics for the SVM on the regular data.
svm_model_object.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.8064466615502686

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### Scaled data

In [41]:
# SVM on the scaled data (training labels flattened to 1-D).
svm_model_object = Model(
    svm_clf,
    scaled_train_data, train_true_results.values.ravel(),
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [42]:
# Print the test-set metrics for the SVM on the scaled data.
svm_model_object.display_test_results()

Test balanced accuracy 0.6105994713055342

Test roc auc 0.7757994371962139

Test precision 0.6523809523809524

Test recall 0.22222222222222224

Test f1 score 0.32103386809269163


### Cut by max data

In [43]:
# SVM on the cut-by-max data (training labels flattened to 1-D).
svm_model_object = Model(
    svm_clf,
    cut_by_max_train_data, train_true_results.values.ravel(),
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [44]:
# Print the test-set metrics for the SVM on the cut-by-max data.
svm_model_object.display_test_results()

Test balanced accuracy 0.6953440777692504

Test roc auc 0.7097979022767973

Test precision 0.06495364334347385

Test recall 0.4666666666666666

Test f1 score 0.1139547331497796


### PCA reduced + cut by max data

In [45]:
# SVM on the PCA-reduced + cut-by-max data (training labels flattened to 1-D).
svm_model_object = Model(
    svm_clf,
    pca_reduced_train_data, train_true_results.values.ravel(),
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [46]:
# Print the test-set metrics for the SVM on the PCA-reduced + cut-by-max data.
svm_model_object.display_test_results()


Test balanced accuracy 0.6697450328302209

Test roc auc 0.47251641511042886

Test precision 0.03858712049217183

Test recall 0.48888888888888893

Test f1 score 0.07131542407814785


## LightGBM

In [47]:
# LightGBM classifier (gradient-boosted decision trees) shared by the four
# data-variant runs below.
lightgbm_clf = LGBMClassifier(boosting_type='gbdt',
                              max_depth=10,
                              min_child_weight=0.001,
                              random_state=SEED)

### Regular data

In [48]:
# LightGBM on the regular data. Both label sets are flattened to 1-D arrays
# with .values.ravel() before being handed to Model.
lightgbm_model_object = Model(
    lightgbm_clf,
    train_data, train_true_results.values.ravel(),
    test_data, test_true_results.values.ravel(),
    FOLD_NUMBER,
)
lightgbm_model_object.main_cycle()

In [49]:
# Print the test-set metrics for LightGBM on the regular data.
lightgbm_model_object.display_test_results()

Test balanced accuracy 0.9444444444444445

Test roc auc 1.0

Test precision 1.0

Test recall 0.888888888888889

Test f1 score 0.9391534391534392


### Scaled data

In [50]:
# LightGBM on the scaled data (both label sets flattened to 1-D).
lightgbm_model_object = Model(
    lightgbm_clf,
    scaled_train_data, train_true_results.values.ravel(),
    scaled_test_data, test_true_results.values.ravel(),
    FOLD_NUMBER,
)
lightgbm_model_object.main_cycle()

In [51]:
# Print the test-set metrics for LightGBM on the scaled data.
lightgbm_model_object.display_test_results()

Test balanced accuracy 0.9666666666666667

Test roc auc 1.0

Test precision 1.0

Test recall 0.9333333333333332

Test f1 score 0.964696223316913


### Cut by max data

In [52]:
# LightGBM on the cut-by-max data (both label sets flattened to 1-D).
lightgbm_model_object = Model(
    lightgbm_clf,
    cut_by_max_train_data, train_true_results.values.ravel(),
    cut_by_max_test_data, test_true_results.values.ravel(),
    FOLD_NUMBER,
)
lightgbm_model_object.main_cycle()

In [53]:
# Print the test-set metrics for LightGBM on the cut-by-max data.
lightgbm_model_object.display_test_results()

Test balanced accuracy 0.9888888888888889

Test roc auc 1.0

Test precision 1.0

Test recall 0.9777777777777779

Test f1 score 0.9885057471264368


### PCA reduced + cut by max data

In [54]:
# LightGBM on the PCA-reduced + cut-by-max data (both label sets flattened to 1-D).
lightgbm_model_object = Model(
    lightgbm_clf,
    pca_reduced_train_data, train_true_results.values.ravel(),
    pca_reduced_test_data, test_true_results.values.ravel(),
    FOLD_NUMBER,
)
lightgbm_model_object.main_cycle()

In [55]:
# Print the test-set metrics for LightGBM on the PCA-reduced + cut-by-max data.
lightgbm_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


## Random Forest

In [56]:
# Random forest (2000 trees, log-loss split criterion) shared by the four
# data-variant runs below.
random_forest_model = RandomForestClassifier(
    n_estimators=2000,
    criterion="log_loss",
    random_state=SEED,
)

### Regular data

In [57]:
# Random forest on the regular data.
random_forest_object = Model(
    random_forest_model,
    train_data, train_true_results,
    test_data, test_true_results,
    FOLD_NUMBER,
)
random_forest_object.main_cycle()

In [58]:
# Print the test-set metrics for the random forest on the regular data.
random_forest_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Scaled data

In [59]:
# Random forest on the scaled data.
random_forest_object = Model(
    random_forest_model,
    scaled_train_data, train_true_results,
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
)
random_forest_object.main_cycle()

In [60]:
# Print the test-set metrics for the random forest on the scaled data.
random_forest_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Cut by max data

In [61]:
# Random forest on the cut-by-max data.
random_forest_object = Model(
    random_forest_model,
    cut_by_max_train_data, train_true_results,
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
)
random_forest_object.main_cycle()

In [62]:
# Print the test-set metrics for the random forest on the cut-by-max data.
random_forest_object.display_test_results()

Test balanced accuracy 0.9444444444444443

Test roc auc 1.0

Test precision 1.0

Test recall 0.888888888888889

Test f1 score 0.9408866995073891


### PCA reduced + cut by max data

In [63]:
# Random forest on the PCA-reduced + cut-by-max data.
random_forest_object = Model(
    random_forest_model,
    pca_reduced_train_data, train_true_results,
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
)
random_forest_object.main_cycle()

In [64]:
# Print the test-set metrics for the random forest on the PCA-reduced +
# cut-by-max data.
random_forest_object.display_test_results()

Test balanced accuracy 0.9555555555555556

Test roc auc 1.0

Test precision 1.0

Test recall 0.9111111111111111

Test f1 score 0.9523809523809524


## Balanced Random Forest

In [65]:
# Balanced random forest (imbalanced-learn, 2000 trees) shared by the four
# data-variant runs below.
balanced_random_forest_model = BalancedRandomForestClassifier(
    n_estimators=2000,
    random_state=SEED,
)

### Regular data

In [66]:
# Balanced random forest on the regular data.
balanced_random_forest_object = Model(
    balanced_random_forest_model,
    train_data, train_true_results,
    test_data, test_true_results,
    FOLD_NUMBER,
)
balanced_random_forest_object.main_cycle()

In [67]:
# Print the test-set metrics for the balanced random forest on the regular data.
balanced_random_forest_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Scaled data

In [68]:
# Balanced random forest on the scaled data.
balanced_random_forest_object = Model(
    balanced_random_forest_model,
    scaled_train_data, train_true_results,
    scaled_test_data, test_true_results,
    FOLD_NUMBER,
)
balanced_random_forest_object.main_cycle()

In [69]:
# Print the test-set metrics for the balanced random forest on the scaled data.
balanced_random_forest_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Cut by max data

In [70]:
# Balanced random forest on the cut-by-max data.
balanced_random_forest_object = Model(
    balanced_random_forest_model,
    cut_by_max_train_data, train_true_results,
    cut_by_max_test_data, test_true_results,
    FOLD_NUMBER,
)
balanced_random_forest_object.main_cycle()

In [71]:
# Print the test-set metrics for the balanced random forest on the cut-by-max
# data.
balanced_random_forest_object.display_test_results()

Test balanced accuracy 0.9982092606804809

Test roc auc 0.999982945339814

Test precision 0.7682083997873471

Test recall 1.0

Test f1 score 0.8674182203593969


### PCA reduced + cut by max data

In [72]:
# Balanced random forest on the PCA-reduced + cut-by-max data.
balanced_random_forest_object = Model(
    balanced_random_forest_model,
    pca_reduced_train_data, train_true_results,
    pca_reduced_test_data, test_true_results,
    FOLD_NUMBER,
)
balanced_random_forest_object.main_cycle()

In [73]:
# Print the test-set metrics for the balanced random forest on the PCA-reduced
# + cut-by-max data.
balanced_random_forest_object.display_test_results()

Test balanced accuracy 0.9970580711179329

Test roc auc 0.999982945339814

Test precision 0.6666666666666666

Test recall 1.0

Test f1 score 0.7985347985347985
