In [153]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import Model

from xgboost import XGBClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split

import warnings
# Suppress every warning for the rest of the notebook session.
# NOTE(review): this blanket filter also hides warnings raised by the models
# trained below — confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

In [154]:
# --- Experiment-wide settings -------------------------------------------
SEED = 42        # passed to the seeded components below (PCA helper, Isolation Forest)
FOLD_NUMBER = 3  # handed to every Model(...) constructed in this notebook

# Tunable preprocessing values; adjust them if the models need different inputs.
CUT_BY_MAX_THRESHOLD = 4   # argument of get_cut_by_max_train_test_data
PCA_VARIABLES_AMOUNT = 60  # argument of get_most_important_variables_from_pc1

# --- Input files (swap these names to try the other datasets) -----------
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# --- Load the data -------------------------------------------------------
# Four data variants are evaluated in this notebook:
# regular, scaled, cut by max, and PCA reduced + cut by max.
data_object = Data(data_filepath, true_results_filepath)
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(
    train_indices_filepath,
    test_indices_filepath,
)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [155]:
# "Cut by max" variant, produced with the configured threshold.
cut_by_max_train_data, cut_by_max_test_data = (
    data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)
)

# Ask the PCA helper for the PCA_VARIABLES_AMOUNT most important variables of PC1.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)

# Index both cut-by-max sets with the selected variables' index
# (presumably a column selection on DataFrames — TODO confirm).
selected_variable_index = pca_variables.index
pca_reduced_train_data = cut_by_max_train_data[selected_variable_index]
pca_reduced_test_data = cut_by_max_test_data[selected_variable_index]

## XGBoost

In [156]:
# XGBoost classifier reused by the four data-variant experiments below.
# TODO: these hyperparameters are hand-picked and have not been tuned yet.
xgb_clf = XGBClassifier(
    booster='gbtree',     # tree-based model
    learning_rate=0.2,    # scikit-learn wrapper name for the native `eta` parameter
    min_child_weight=1,
    max_depth=5,
    gamma=0.5,
    random_state=SEED,    # seed the booster with the notebook-wide seed for reproducibility
)

### Regular data

In [157]:
# Regular data: wrap the shared XGBoost classifier in a Model and run its
# main cycle, which prints validation and training metrics.
xgboost_model_object = Model(
    xgb_clf,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0



In [158]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for XGBoost on regular data.
# NOTE(review): in the recorded outputs the test ROC AUC always equals the test
# balanced accuracy, which suggests it may be computed from hard labels rather
# than scores — confirm in Model.display_test_results.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117
Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Scaled data

In [159]:
# Scaled data: same shared XGBoost classifier, scaled train/test features.
xgboost_model_object = Model(
    xgb_clf,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0



In [160]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for XGBoost on scaled data.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117
Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Cut by max data

In [161]:
# Cut-by-max data: same shared XGBoost classifier, cut-by-max train/test features.
xgboost_model_object = Model(
    xgb_clf,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.9583333333333334
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9166666666666666
Training recall 1.0

Validation f1 0.9552042160737813
Training f1 1.0

Validation balanced accuracy 0.9583333333333334
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9166666666666666
Training recall 1.0

Validation f1 0.9552042160737813
Training f1 1.0



In [162]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for XGBoost on cut-by-max data.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9777777777777779

Test roc auc 0.9777777777777779

Test precision 1.0

Test recall 0.9555555555555556

Test f1 score 0.9770114942528737
Test balanced accuracy 0.9777777777777779

Test roc auc 0.9777777777777779

Test precision 1.0

Test recall 0.9555555555555556

Test f1 score 0.9770114942528737


### PCA reduced + cut by max data

In [163]:
# PCA-reduced + cut-by-max data: same shared XGBoost classifier on the reduced feature set.
xgboost_model_object = Model(
    xgb_clf,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.970959595959596
Training balanced accuracy 1.0

Validation roc auc 0.9999725786991335
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9419191919191919
Training recall 1.0

Validation f1 0.9696342305037957
Training f1 1.0

Validation balanced accuracy 0.970959595959596
Training balanced accuracy 1.0

Validation roc auc 0.9999725786991335
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9419191919191919
Training recall 1.0

Validation f1 0.9696342305037957
Training f1 1.0



In [164]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for XGBoost on PCA-reduced + cut-by-max data.
xgboost_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0
Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


## K-nearest neighbors
scikit-learn's `KNeighborsClassifier` offers four settings for the neighbor-search algorithm: 'ball_tree', 'kd_tree', 'brute' and 'auto' (which attempts to choose the most appropriate algorithm based on the training data), and two built-in weight types: 'uniform' (the default) and 'distance' (which gave better results than 'uniform' on the data above). Overall, KNN did not turn out to be a sufficient model for this problem.

### KNN regular data
Precision and recall on regular data were 0.0 for all algorithms and weight types.

In [127]:
# Regular data: 5-neighbor, distance-weighted KNN using the kd-tree search.
# Warnings are already silenced once in the notebook's import cell, so the
# import/filter is not repeated here.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='kd_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
# Prints validation and training metrics.
knn_model.main_cycle()

Validation balanced accuracy 0.5
Training balanced accuracy 1.0

Validation roc auc 0.5
Training roc auc 1.0

Validation precision 0.0
Training precision 1.0

Validation recall 0.0
Training recall 1.0

Validation f1 0.0
Training f1 1.0



In [128]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for KNN on regular data.
knn_model.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.5

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### KNN Scaled Data

In [129]:
# Scaled data: fresh 5-neighbor, distance-weighted KNN using the kd-tree search.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='kd_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

Validation balanced accuracy 0.5
Training balanced accuracy 1.0

Validation roc auc 0.5
Training roc auc 1.0

Validation precision 0.0
Training precision 1.0

Validation recall 0.0
Training recall 1.0

Validation f1 0.0
Training f1 1.0



In [130]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for KNN on scaled data.
knn_model.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.5

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### KNN Cut by max data

In [131]:
# Cut-by-max data: fresh 5-neighbor, distance-weighted KNN using the kd-tree search.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='kd_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

Validation balanced accuracy 0.5
Training balanced accuracy 1.0

Validation roc auc 0.6688927777279208
Training roc auc 1.0

Validation precision 0.0
Training precision 1.0

Validation recall 0.0
Training recall 1.0

Validation f1 0.0
Training f1 1.0



In [132]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for KNN on cut-by-max data.
knn_model.display_test_results()

Test balanced accuracy 0.5222222222222223

Test roc auc 0.5222222222222223

Test precision 0.6666666666666666

Test recall 0.044444444444444446

Test f1 score 0.08333333333333333


### KNN PCA reduced + cut by max data
This is the only data variant for which validation precision and recall are non-zero, but both are still below 0.5, so these results are not acceptable either.


In [133]:
# PCA-reduced + cut-by-max data: fresh 5-neighbor, distance-weighted KNN,
# this time using the ball-tree search.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='ball_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

Validation balanced accuracy 0.5151515151515151
Training balanced accuracy 1.0

Validation roc auc 0.6839046935295702
Training roc auc 1.0

Validation precision 0.3333333333333333
Training precision 1.0

Validation recall 0.030303030303030304
Training recall 1.0

Validation f1 0.05555555555555556
Training f1 1.0



In [134]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for KNN on PCA-reduced + cut-by-max data.
knn_model.display_test_results()

Test balanced accuracy 0.5555555555555555

Test roc auc 0.5555555555555555

Test precision 1.0

Test recall 0.1111111111111111

Test f1 score 0.19444444444444445


## Isolation Forest

In [135]:
# Isolation Forest reused by the four data-variant experiments below.
isolation_forest = IsolationForest(
    max_features=1,     # integer value: one feature is drawn per base estimator
    n_estimators=1000,
    bootstrap=True,
    max_samples=1000,
    n_jobs=-1,          # use all available cores rather than a machine-specific count
    random_state=SEED,
)

### Regular data

In [136]:
# Regular data: wrap the shared Isolation Forest in a Model and run its main cycle.
# minus_one_one_values=True presumably tells Model that this estimator predicts
# -1/+1 labels — TODO confirm against Model.
isolation_forest_model_object = Model(
    isolation_forest,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

Validation balanced accuracy 1.0
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 1.0
Training recall 1.0

Validation f1 1.0
Training f1 1.0



In [137]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for Isolation Forest on regular data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Scaled data

In [138]:
# Scaled data: same shared Isolation Forest, scaled train/test features.
# minus_one_one_values=True presumably tells Model that this estimator predicts
# -1/+1 labels — TODO confirm against Model.
isolation_forest_model_object = Model(
    isolation_forest,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

Validation balanced accuracy 1.0
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 1.0
Training recall 1.0

Validation f1 1.0
Training f1 1.0



In [139]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for Isolation Forest on scaled data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


### Cut by max data

In [140]:
# Cut-by-max data: same shared Isolation Forest, cut-by-max train/test features.
# minus_one_one_values=True presumably tells Model that this estimator predicts
# -1/+1 labels — TODO confirm against Model.
isolation_forest_model_object = Model(
    isolation_forest,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

Validation balanced accuracy 0.9861638262523312
Training balanced accuracy 0.9897208546474147

Validation roc auc 0.9999481131545108
Training roc auc 0.9999690230734117

Validation precision 1.0
Training precision 1.0

Validation recall 0.9723276525046622
Training recall 0.9794417092948294

Validation f1 0.9859117316625624
Training f1 0.9895972343546561



In [141]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for Isolation Forest on cut-by-max data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 0.9809414172422614

Test roc auc 0.9809414172422614

Test precision 1.0

Test recall 0.9618828344845229

Test f1 score 0.9805246975324273


### PCA reduced + cut by max data

In [142]:
# PCA-reduced + cut-by-max data: same shared Isolation Forest on the reduced feature set.
# minus_one_one_values=True presumably tells Model that this estimator predicts
# -1/+1 labels — TODO confirm against Model.
isolation_forest_model_object = Model(
    isolation_forest,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

Validation balanced accuracy 0.9568658687654407
Training balanced accuracy 0.9690732823930205

Validation roc auc 0.9998971359714065
Training roc auc 0.9999106113045735

Validation precision 1.0
Training precision 1.0

Validation recall 0.9137317375308814
Training recall 0.9381465647860412

Validation f1 0.9547187918412904
Training f1 0.9680664905338991



In [143]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for Isolation Forest on PCA-reduced + cut-by-max data.
isolation_forest_model_object.display_test_results()

Test balanced accuracy 0.9556152468662061

Test roc auc 0.9556152468662061

Test precision 1.0

Test recall 0.9112304937324124

Test f1 score 0.9534893621271526


## SVM

In [144]:
# Quick hold-out sanity check of the linear SVM on the regular training data.
# The 70/30 split is carved out of the training set only; test_data is not used here.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_true_results,
    test_size=0.3,
    random_state=SEED,  # notebook-wide seed instead of an unrelated magic number
)

# class_weight='balanced' reweights classes inversely to their frequency.
svm_clf = svm.SVC(kernel='linear', class_weight='balanced')
svm_clf.fit(X_train, y_train.values.ravel())
y_pred = svm_clf.predict(X_test)

# Score the hold-out predictions so the sanity check actually reports a result.
metrics.balanced_accuracy_score(y_test.values.ravel(), y_pred)

### Regular data

In [145]:
# Regular data: wrap the linear SVM in a Model and run its main cycle.
# The training labels are passed as a flattened 1-D array here.
svm_model_object = Model(
    svm_clf,
    train_data,
    train_true_results.values.ravel(),
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

Validation balanced accuracy 0.5
Training balanced accuracy 1.0

Validation roc auc 0.764034720352588
Training roc auc 1.0

Validation precision 0.0
Training precision 1.0

Validation recall 0.0
Training recall 1.0

Validation f1 0.0
Training f1 1.0



In [146]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for the SVM on regular data.
svm_model_object.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.5

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### Scaled data

In [147]:
# Scaled data: same linear SVM, scaled train/test features.
# The training labels are passed as a flattened 1-D array here.
svm_model_object = Model(
    svm_clf,
    scaled_train_data,
    train_true_results.values.ravel(),
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

Validation balanced accuracy 0.6742424242424242
Training balanced accuracy 1.0

Validation roc auc 0.9588356417082972
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.34848484848484845
Training recall 1.0

Validation f1 0.5019607843137255
Training f1 1.0



In [148]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for the SVM on scaled data.
svm_model_object.display_test_results()

Test balanced accuracy 0.6105994713055342

Test roc auc 0.6105994713055342

Test precision 0.6523809523809524

Test recall 0.22222222222222224

Test f1 score 0.32103386809269163


### Cut by max data

In [149]:
# Cut-by-max data: same linear SVM, cut-by-max train/test features.
# The training labels are passed as a flattened 1-D array here.
svm_model_object = Model(
    svm_clf,
    cut_by_max_train_data,
    train_true_results.values.ravel(),
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

Validation balanced accuracy 0.5444511751273843
Training balanced accuracy 0.96964461994077

Validation roc auc 0.4991698824374047
Training roc auc 0.9694854571726971

Validation precision 0.0241277384134527
Training precision 0.16294121426912034

Validation recall 0.1691919191919192
Training recall 1.0

Validation f1 0.04218657159833631
Training f1 0.2795178197064989



In [150]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for the SVM on cut-by-max data.
svm_model_object.display_test_results()

Test balanced accuracy 0.6953440777692504

Test roc auc 0.6953440777692504

Test precision 0.06495364334347385

Test recall 0.4666666666666666

Test f1 score 0.1139547331497796


### PCA reduced + cut by max data

In [151]:
# PCA-reduced + cut-by-max data: same linear SVM on the reduced feature set.
# The training labels are passed as a flattened 1-D array here.
svm_model_object = Model(
    svm_clf,
    pca_reduced_train_data,
    train_true_results.values.ravel(),
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

Validation balanced accuracy 0.5478937948088984
Training balanced accuracy 0.9355051003619611

Validation roc auc 0.39497641768125485
Training roc auc 0.9323510546470759

Validation precision 0.0185936555060751
Training precision 0.08428295055010648

Validation recall 0.255050505050505
Training recall 1.0

Validation f1 0.0345044280094672
Training f1 0.1551574039647551



In [152]:
# Print test-set balanced accuracy, ROC AUC, precision, recall and F1 for the SVM on PCA-reduced + cut-by-max data.
svm_model_object.display_test_results()


Test balanced accuracy 0.6697450328302209

Test roc auc 0.6697450328302209

Test precision 0.03858712049217183

Test recall 0.48888888888888893

Test f1 score 0.07131542407814785
