In [None]:
from data import Data
from dimension_reduction import PCADimensionReduction
from simple_ml_models import Model

from xgboost import XGBClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Input files ---------------------------------------------------------
# Point these at a different dataset to evaluate it instead.
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# --- Experiment settings -------------------------------------------------
SEED = 42
FOLD_NUMBER = 3

# Preprocessing knobs; tune these if the models benefit from other values.
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# Four data variants are evaluated per model: regular, scaled, cut by max,
# and PCA reduced + cut by max. The regular and scaled variants are loaded here.
data_object = Data(data_filepath, true_results_filepath)
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(
    train_indices_filepath,
    test_indices_filepath,
)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [None]:
# Third variant: the "cut by max" data, parameterised by CUT_BY_MAX_THRESHOLD.
(
    cut_by_max_train_data,
    cut_by_max_test_data,
) = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

# Fourth variant: the cut-by-max data restricted to the columns picked by the
# PCA helper.
# NOTE(review): presumably these are the PCA_VARIABLES_AMOUNT variables that
# contribute most to the first principal component -- confirm in
# dimension_reduction.PCADimensionReduction.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(
    PCA_VARIABLES_AMOUNT
)

# The selection is derived from the training inputs only and then applied to
# both the train and the test frame.
pca_reduced_train_data = cut_by_max_train_data[pca_variables.index]
pca_reduced_test_data = cut_by_max_test_data[pca_variables.index]

## XGBoost

In [None]:
# TODO: these hyperparameters are provisional and still need to be tuned.
# NOTE(review): this single estimator instance is passed to every XGBoost run
# in this section -- confirm that Model refits (or clones) it per data variant.
xgb_clf = XGBClassifier(
    booster='gbtree',  # tree-based model
    eta=0.2,
    min_child_weight=1,
    max_depth=5,
    gamma=0.5,
)

### Regular data

In [None]:
# XGBoost on the regular (unmodified) data.
xgboost_model_object = Model(
    xgb_clf,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [None]:
# Show the test results of the XGBoost run on the regular data.
xgboost_model_object.display_test_results()

### Scaled data

In [None]:
# XGBoost on the scaled data.
xgboost_model_object = Model(
    xgb_clf,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [None]:
# Show the test results of the XGBoost run on the scaled data.
xgboost_model_object.display_test_results()

### Cut by max data

In [None]:
# XGBoost on the cut-by-max data.
xgboost_model_object = Model(
    xgb_clf,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [None]:
# Show the test results of the XGBoost run on the cut-by-max data.
xgboost_model_object.display_test_results()

### PCA reduced + cut by max data

In [None]:
# XGBoost on the PCA reduced + cut-by-max data.
xgboost_model_object = Model(
    xgb_clf,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
xgboost_model_object.main_cycle()

In [None]:
# Show the test results of the XGBoost run on the PCA reduced + cut-by-max data.
xgboost_model_object.display_test_results()

## K-nearest neighbors
KNN provides four search algorithms: 'ball_tree', 'kd_tree', 'brute' and 'auto' (which attempts to determine the best approach from the training data), and two weight types: 'uniform' (the default) and 'distance' (which produced better results than 'uniform' on the data above). KNN did not turn out to be a sufficient model for this problem.

### KNN regular data
Precision and recall for the regular data were 0.0 for all algorithms and weight types.

In [None]:
# KNN on the regular (unmodified) data.
# Warnings are imported and silenced once in the notebook's import cell, so
# no additional import or filter is needed in this cell.
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', weights='distance')
knn_model = Model(knn,
                  train_data,
                  train_true_results,
                  test_data,
                  test_true_results,
                  FOLD_NUMBER)
knn_model.main_cycle()

In [None]:
# Show the test results of the KNN run on the regular data.
knn_model.display_test_results()

### KNN Scaled Data

In [None]:
# KNN (kd_tree search, distance weighting) on the scaled data.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='kd_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# Show the test results of the KNN run on the scaled data.
knn_model.display_test_results()

### KNN Cut by max data

In [None]:
# KNN (kd_tree search, distance weighting) on the cut-by-max data.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='kd_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# Show the test results of the KNN run on the cut-by-max data.
knn_model.display_test_results()

### KNN PCA reduced + cut by max data
This is the only data variant for which validation precision and recall are non-zero, but both are still below 0.5, so the results are not acceptable.


In [None]:
# KNN on the PCA reduced + cut-by-max data. This run uses the 'ball_tree'
# search algorithm, unlike the 'kd_tree' used for the other KNN variants.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='ball_tree',
    weights='distance',
)
knn_model = Model(
    knn,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
knn_model.main_cycle()

In [None]:
# Show the test results of the KNN run on the PCA reduced + cut-by-max data.
knn_model.display_test_results()

## Isolation Forest

In [None]:
# Isolation Forest configuration shared by the four runs in this section.
# NOTE(review): max_features=1 is an int, which scikit-learn reads as "draw
# exactly one feature per tree"; use the float 1.0 if all features were meant.
# NOTE(review): n_jobs=12 is tied to a specific machine; n_jobs=-1 would use
# every available core instead.
isolation_forest = IsolationForest(
    max_features=1,
    n_estimators=1000,
    bootstrap=True,
    max_samples=1000,
    n_jobs=12,
    random_state=SEED,
)

### Regular data

In [None]:
# Isolation Forest on the regular (unmodified) data.
# NOTE(review): minus_one_one_values=True presumably tells Model that the
# estimator predicts -1/+1 (as IsolationForest does) -- confirm in
# simple_ml_models.Model.
isolation_forest_model_object = Model(
    isolation_forest,
    train_data,
    train_true_results,
    test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [None]:
# Show the test results of the Isolation Forest run on the regular data.
isolation_forest_model_object.display_test_results()

### Scaled data

In [None]:
# Isolation Forest on the scaled data.
# NOTE(review): minus_one_one_values=True presumably tells Model that the
# estimator predicts -1/+1 (as IsolationForest does) -- confirm in
# simple_ml_models.Model.
isolation_forest_model_object = Model(
    isolation_forest,
    scaled_train_data,
    train_true_results,
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [None]:
# Show the test results of the Isolation Forest run on the scaled data.
isolation_forest_model_object.display_test_results()

### Cut by max data

In [None]:
# Isolation Forest on the cut-by-max data.
# NOTE(review): minus_one_one_values=True presumably tells Model that the
# estimator predicts -1/+1 (as IsolationForest does) -- confirm in
# simple_ml_models.Model.
isolation_forest_model_object = Model(
    isolation_forest,
    cut_by_max_train_data,
    train_true_results,
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [None]:
# Show the test results of the Isolation Forest run on the cut-by-max data.
isolation_forest_model_object.display_test_results()

### PCA reduced + cut by max data

In [None]:
# Isolation Forest on the PCA reduced + cut-by-max data.
# NOTE(review): minus_one_one_values=True presumably tells Model that the
# estimator predicts -1/+1 (as IsolationForest does) -- confirm in
# simple_ml_models.Model.
isolation_forest_model_object = Model(
    isolation_forest,
    pca_reduced_train_data,
    train_true_results,
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
    minus_one_one_values=True,
)
isolation_forest_model_object.main_cycle()

In [None]:
# Show the test results of the Isolation Forest run on the PCA reduced + cut-by-max data.
isolation_forest_model_object.display_test_results()

## SVM

In [None]:
# Linear SVM with balanced class weights, shared by the four runs in this
# section. No separate hold-out fit is done here: the estimator is handed to
# Model unfitted, exactly like the other classifiers in this notebook, and
# training/evaluation is left to Model.
svm_clf = svm.SVC(kernel='linear', class_weight='balanced')

### Regular data

In [None]:
# SVM on the regular (unmodified) data. The training labels are flattened to
# a 1-D array before being passed to Model.
svm_model_object = Model(
    svm_clf,
    train_data,
    train_true_results.values.ravel(),
    test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [None]:
# Show the test results of the SVM run on the regular data.
svm_model_object.display_test_results()

### Scaled data

In [None]:
# SVM on the scaled data. The training labels are flattened to a 1-D array
# before being passed to Model.
svm_model_object = Model(
    svm_clf,
    scaled_train_data,
    train_true_results.values.ravel(),
    scaled_test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [None]:
# Show the test results of the SVM run on the scaled data.
svm_model_object.display_test_results()

### Cut by max data

In [None]:
# SVM on the cut-by-max data. The training labels are flattened to a 1-D
# array before being passed to Model.
svm_model_object = Model(
    svm_clf,
    cut_by_max_train_data,
    train_true_results.values.ravel(),
    cut_by_max_test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [None]:
# Show the test results of the SVM run on the cut-by-max data.
svm_model_object.display_test_results()

### PCA reduced + cut by max data

In [None]:
# SVM on the PCA reduced + cut-by-max data. The training labels are flattened
# to a 1-D array before being passed to Model.
svm_model_object = Model(
    svm_clf,
    pca_reduced_train_data,
    train_true_results.values.ravel(),
    pca_reduced_test_data,
    test_true_results,
    FOLD_NUMBER,
)
svm_model_object.main_cycle()

In [None]:
# Show the test results of the SVM run on the PCA reduced + cut-by-max data.
svm_model_object.display_test_results()
