In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import Model

from sklearn.dummy import DummyClassifier

In [2]:
# --- Input files: point these at another dataset's files to test it ---
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# --- Run settings shared by the cells below ---
SEED = 42
FOLD_NUMBER = 3

# --- Tunable preprocessing values: adjust them to suit the model being tried ---
CUT_BY_MAX_THRESHOLD = 2
PCA_VARIABLES_AMOUNT = 100

# Four data variants are compared: regular, scaled, cut by max, and PCA-reduced + cut by max.
# The regular and scaled variants are loaded here.
data_object = Data(data_filepath, true_results_filepath)
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
# Cut-by-max variant of the train/test data, using the threshold from the config cell.
cut_by_max_train_data, cut_by_max_test_data = (
    data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)
)

# PCA-reduced variant: ask the PCA helper for its most important variables from PC1,
# then select from both cut-by-max splits using the index of that result.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
pca_selected_index = pca_variables.index
pca_reduced_train_data = cut_by_max_train_data[pca_selected_index]
pca_reduced_test_data = cut_by_max_test_data[pca_selected_index]

## Dummy Model

In [4]:
# Example dummy model: a baseline whose "stratified" strategy draws predictions at random.
# random_state=SEED makes those random predictions repeatable across runs
# (how Model splits its folds is not visible here, so that part may still vary).
dummy_clf = DummyClassifier(strategy="stratified", random_state=SEED)
dummy_model_object = Model(dummy_clf, train_data, train_true_results, test_data, test_true_results, FOLD_NUMBER)
dummy_model_object.main_cycle()

# When optimising a model, look mainly at the validation results and, secondarily, at the training results

Validation balanced accuracy 0.49555774925962487
Training balanced accuracy 0.5156787248856651

Validation roc auc 0.5087349307487511
Training roc auc 0.5022282787530222

Validation precision 0.0
Training precision 0.04237144585601935

Validation recall 0.0
Training recall 0.0428743961352657

Validation f1 0.0
Training f1 0.04234052111410602



In [5]:
# Do NOT look at the test results while optimising the model; evaluate on the test set only after optimisation is finished.
dummy_model_object.display_test_results()

Test balanced accuracy 0.5164662744094825

Test roc auc 0.5164662744094823

Test precision 0.04305555555555555

Test recall 0.044444444444444446

Test f1 score 0.04372759856630825
