In [2]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import Model

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier 

In [3]:
# Input files. To test another dataset, change DATA_DIR and/or the file names below.
# NOTE: the paths are relative to the working directory of the kernel; the recorded
# FileNotFoundError is what happens when the notebook is started from another directory.
DATA_DIR = "data/SC_integration"
data_filepath = DATA_DIR + "/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = DATA_DIR + "/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = DATA_DIR + "/train_indices.npy"
test_indices_filepath = DATA_DIR + "/test_indices.npy"
SEED = 42          # seed passed to the PCA reduction and the XGBoost classifier cells
FOLD_NUMBER = 3    # passed to every Model(...) as its last argument

# You can change these values to work better for models
CUT_BY_MAX_THRESHOLD = 4     # argument of Data.get_cut_by_max_train_test_data
PCA_VARIABLES_AMOUNT = 60    # number of variables requested from the PCA reduction

# There are 4 data variants to check: regular, scaled, cut by max, pca reduced + cut by max.
# This cell builds the first two; the next cell builds the remaining two.
data_object = Data(data_filepath, true_results_filepath)
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

FileNotFoundError: [Errno 2] No such file or directory: 'data/SC_integration/counts_ctc_simulated_123_5k.tsv'

In [None]:
# Builds the two remaining data variants used by the model cells below.
# NOTE: every "cut by max" and "PCA reduced" cell further down needs the names defined
# here, so this cell must be executed before them (see the NameError recorded at the
# end of the notebook).

# Variant 3: "cut by max", using the threshold from the config cell. The exact filtering
# rule lives in Data.get_cut_by_max_train_test_data and is not visible from this notebook.
cut_by_max_train_data, cut_by_max_test_data = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

# Variant 4: "PCA reduced + cut by max". Ask the PCA helper for PCA_VARIABLES_AMOUNT
# variables (presumably ranked by their contribution to the first principal component —
# confirm in dimension_reduction) and keep only those in both splits.
pca_object = PCADimensionReduction(cut_by_max_train_data, scaled_train_data, train_true_results, SEED)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
# The same labels (pca_variables.index) are used for train and test so both keep identical variables.
# Assumes both splits are pandas DataFrames whose columns are the variables — TODO confirm.
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

## XGBoost

In [4]:
# XGBoost classifier passed to Model(...) for each of the four data variants below.
# TODO: the hyperparameters are hand-picked first guesses and have not been tuned yet.
xgb_clf = XGBClassifier(
    booster='gbtree',     # tree-based model
    learning_rate=0.2,    # scikit-learn API name of the native `eta` parameter
    min_child_weight=1,
    max_depth=5,
    gamma=0.5,
    random_state=SEED,    # use the notebook-wide seed so reruns stay reproducible
)

### Regular data

In [5]:
# XGBoost on the regular (untransformed) train/test split.
xgboost_model_object = Model(
    xgb_clf,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
# Prints the validation and training metrics shown in this cell's output.
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0



In [6]:
# Print the test-split metrics of the Model built on the regular data in the previous cell.
# NOTE(review): in the recorded output the test ROC AUC equals the test balanced accuracy,
# which hints that ROC AUC is computed from hard class predictions rather than from
# predicted probabilities — confirm in Model.display_test_results.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Scaled data

In [7]:
# XGBoost on the scaled train/test split.
# The recorded metrics match the regular-data run exactly; that is what one would expect
# from a tree booster if the scaling is a monotone per-feature transform (confirm in Data).
xgboost_model_object = Model(
    xgb_clf,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
# Prints the validation and training metrics shown in this cell's output.
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.8434343434343434
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.6868686868686869
Training recall 1.0

Validation f1 0.81203007518797
Training f1 1.0



In [8]:
# Print the test-split metrics of the Model built on the scaled data in the previous cell.
# NOTE(review): the recorded test ROC AUC equals the test balanced accuracy here, which
# hints that it is computed from hard class predictions — confirm in Model.display_test_results.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9222222222222222

Test roc auc 0.9222222222222222

Test precision 1.0

Test recall 0.8444444444444444

Test f1 score 0.9116809116809117


### Cut by max data

In [9]:
# XGBoost on the cut-by-max train/test split (threshold CUT_BY_MAX_THRESHOLD).
xgboost_model_object = Model(
    xgb_clf,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
# Prints the validation and training metrics shown in this cell's output.
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.9583333333333334
Training balanced accuracy 1.0

Validation roc auc 1.0
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9166666666666666
Training recall 1.0

Validation f1 0.9552042160737813
Training f1 1.0



In [10]:
# Print the test-split metrics of the Model built on the cut-by-max data in the previous cell.
# NOTE(review): the recorded test ROC AUC equals the test balanced accuracy here, which
# hints that it is computed from hard class predictions — confirm in Model.display_test_results.
xgboost_model_object.display_test_results()

Test balanced accuracy 0.9777777777777779

Test roc auc 0.9777777777777779

Test precision 1.0

Test recall 0.9555555555555556

Test f1 score 0.9770114942528737


### PCA reduced + cut by max data

In [11]:
# XGBoost on the PCA-reduced + cut-by-max split (PCA_VARIABLES_AMOUNT variables kept).
xgboost_model_object = Model(
    xgb_clf,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
# Prints the validation and training metrics shown in this cell's output.
xgboost_model_object.main_cycle()

Validation balanced accuracy 0.970959595959596
Training balanced accuracy 1.0

Validation roc auc 0.9999725786991335
Training roc auc 1.0

Validation precision 1.0
Training precision 1.0

Validation recall 0.9419191919191919
Training recall 1.0

Validation f1 0.9696342305037957
Training f1 1.0



In [12]:
# Print the test-split metrics of the Model built on the PCA-reduced + cut-by-max data
# in the previous cell (balanced accuracy, ROC AUC, precision, recall, F1 score).
xgboost_model_object.display_test_results()

Test balanced accuracy 1.0

Test roc auc 1.0

Test precision 1.0

Test recall 1.0

Test f1 score 1.0


# K-nearest neighbors

scikit-learn's `KNeighborsClassifier` offers three neighbor-search algorithms — 'ball_tree', 'kd_tree' and 'brute' — plus 'auto', which attempts to determine the most appropriate algorithm from the training data. It supports two weight types: 'uniform' (the default) and 'distance', which generated better results than 'uniform' for the data above. Even so, KNN did not turn out to be a sufficient model for this problem.

### KNN regular data

Precision and recall for regular data were 0.0 for every combination of algorithm and weight type.

In [None]:
# KNN on the regular data: 100 neighbours, kd-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='kd_tree',
    n_neighbors=100,
)
knn_model = Model(
    knn,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the regular data: 5 neighbours, kd-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='kd_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

### KNN Scaled data

In [None]:
# KNN on the scaled data: 5 neighbours, search algorithm chosen automatically,
# distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='auto',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the scaled data: 5 neighbours, kd-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='kd_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the scaled data: 5 neighbours, ball-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='ball_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the scaled data: 5 neighbours, brute-force search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='brute',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

### KNN Cut by max data

In [None]:
# KNN on the cut-by-max data: 100 neighbours, ball-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='ball_tree',
    n_neighbors=100,
)
knn_model = Model(
    knn,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the cut-by-max data: 5 neighbours, kd-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='kd_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the cut-by-max data: 5 neighbours, ball-tree search, distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='ball_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

### KNN PCA reduced + cut by max data

This is the only data variant for which precision and recall are non-zero, but both are still below 0.5. Therefore, these results are not acceptable.

In [None]:
# KNN on the PCA-reduced + cut-by-max data: 5 neighbours, ball-tree search,
# distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='ball_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# KNN on the PCA-reduced + cut-by-max data: 5 neighbours, kd-tree search,
# distance-weighted votes.
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='kd_tree',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [4]:
# KNN on the PCA-reduced + cut-by-max data: 5 neighbours, brute-force search,
# distance-weighted votes.
# NOTE: needs pca_reduced_train_data / pca_reduced_test_data from the PCA cell near the
# top of the notebook. The NameError recorded below this cell comes from running it on a
# kernel where that cell had not been executed (note the out-of-order execution count).
knn = KNeighborsClassifier(
    weights='distance',
    algorithm='brute',
    n_neighbors=5,
)
knn_model = Model(
    knn,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

NameError: name 'pca_reduced_train_data' is not defined