In [542]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import Model

from xgboost import XGBClassifier

from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [543]:
# You can test the other datasets by changing these file names
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"
SEED = 42
FOLD_NUMBER = 3

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max
data_object = Data(data_filepath, true_results_filepath)
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [544]:
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

## XGBoost

In [545]:
# not sure about parameters
# needs to be fixed
xgb_clf = XGBClassifier(booster='gbtree', # tree-based model
                        eta = 0.2,
                        min_child_weight=1,
                        max_depth=5,
                        gamma=0.5
                       )

### Regular data

In [546]:
xgboost_model_object = Model(xgb_clf, train_data, train_true_results, test_data, test_true_results, FOLD_NUMBER)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0



In [547]:
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Scaled data

In [548]:
xgboost_model_object = Model(xgb_clf, scaled_train_data, train_true_results, scaled_test_data, test_true_results, FOLD_NUMBER)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0



In [549]:
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Cut by max data

In [550]:
xgboost_model_object = Model(xgb_clf, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results, FOLD_NUMBER)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.9583333333333334
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9166666666666666
Training recall 1.0

Validation f1 0.9552042160737813
Training f1 1.0



In [551]:
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9777777777777779

Test roc auc 0.9777777777777779

Test precision 1.0

Test recall 0.9555555555555556

Test f1 score 0.9770114942528737


### Pca reduced + cut by max data

In [552]:
xgboost_model_object = Model(xgb_clf, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results, FOLD_NUMBER)
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.970959595959596
Training balanced accuracy 1.0

Validation roc auc 0.9999725786991335
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9419191919191919
Training recall 1.0

Validation f1 0.9696342305037957
Training f1 1.0



In [553]:
xgboost_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


## SVM

In [554]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_true_results, test_size=0.3,random_state=109)
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(X_train, y_train.values.ravel())
y_pred = svm_clf.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

### Regular data

In [555]:
svm_model_object = Model(svm_clf, train_data, train_true_results, test_data, test_true_results, FOLD_NUMBER)
svm_model_object.main_cycle()

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))


Validation balanced accuracy 0.5
Training balanced accuracy 1.0

Validation roc auc 0.764034720352588
Training roc auc 1.0

Validation precision 0.0
Training precision 1.0

Validation recall 0.0
Training recall 1.0

Validation f1 0.0
Training f1 1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [556]:
svm_model_object.display_test_results()

Test balanced accuracy 0.5

Test roc auc 0.5

Test precision 0.0

Test recall 0.0

Test f1 score 0.0


### Scaled data

In [557]:
svm_model_object = Model(svm_clf, scaled_train_data, train_true_results, scaled_test_data, test_true_results, FOLD_NUMBER)
svm_model_object.main_cycle()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Validation balanced accuracy 0.6742424242424242
Training balanced accuracy 1.0

Validation roc auc 0.9588356417082972
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.34848484848484845
Training recall 1.0

Validation f1 0.5019607843137255
Training f1 1.0



In [558]:
svm_model_object.display_test_results()

Test balanced accuracy 0.6105994713055342

Test roc auc 0.6105994713055342

Test precision 0.6523809523809524

Test recall 0.22222222222222224

Test f1 score 0.32103386809269163


### Cut by max data

In [560]:
svm_model_object = Model(svm_clf, cut_by_max_train_data, train_true_results, cut_by_max_test_data, test_true_results, FOLD_NUMBER)
svm_model_object.main_cycle()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Validation balanced accuracy 0.5845959595959596
Training balanced accuracy 0.7134661835748792

Validation roc auc 0.5554134633601563
Training roc auc 0.8250106704627284

Validation precision 1.0
Training precision 1.0

Validation recall 0.1691919191919192
Training recall 0.42693236714975846

Validation f1 0.2735042735042735
Training f1 0.5818532818532818



In [561]:
svm_model_object.display_test_results()

Test balanced accuracy 0.6328216935277564

Test roc auc 0.6328216935277564

Test precision 0.75

Test recall 0.26666666666666666

Test f1 score 0.3909774436090226


### Pca reduced + cut by max data

In [563]:
svm_model_object = Model(svm_clf, pca_reduced_train_data, train_true_results, pca_reduced_test_data, test_true_results, FOLD_NUMBER)
svm_model_object.main_cycle()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Validation balanced accuracy 0.5858585858585857
Training balanced accuracy 0.6422101449275363

Validation roc auc 0.47476991035727467
Training roc auc 0.7321493292511362

Validation precision 1.0
Training precision 1.0

Validation recall 0.1717171717171717
Training recall 0.28442028985507245

Validation f1 0.2871794871794872
Training f1 0.43879683534855946



In [564]:
svm_model_object.display_test_results()


Test balanced accuracy 0.6555555555555556

Test roc auc 0.6555555555555556

Test precision 1.0

Test recall 0.3111111111111111

Test f1 score 0.4635832004253057
