In [37]:
from data import Data
from dimension_reduction import PCADimensionReduction
from utils import display_autoencoder_metrics

from keras.models import Sequential
from keras.layers import Dense

In [38]:
# Paths to the input files: the two files handed to Data(...) and the two
# index files used for the healthy train/test split.
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
healthy_train_indices_filepath = "data/SC_integration/healthy_train_indices.npy"
healthy_test_indices_filepath = "data/SC_integration/healthy_test_indices.npy"

# SEED is passed to PCADimensionReduction below. NOTE(review): the Keras
# training calls in the cells shown here are not seeded from it, so the
# autoencoder metrics may differ between runs.
SEED = 42
# NOTE(review): FOLD_NUMBER is not referenced in the cells shown here.
FOLD_NUMBER = 3

# Tunable preprocessing parameters:
#   CUT_BY_MAX_THRESHOLD  - argument to get_cut_by_max_healthy_train_test_cancer_data
#   PCA_VARIABLES_AMOUNT  - how many variables to request from
#                           get_most_important_variables_from_pc1
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

In [39]:
# Build the data wrapper from the main data file and its true-results file.
data_object = Data(data_filepath, true_results_filepath)

# Healthy train/test split (driven by the saved index files) plus cancer data.
(
    healthy_train_data,
    healthy_test_data,
    cancer_data,
) = data_object.get_train_test_healthy_split_cancer_data(
    healthy_train_indices_filepath,
    healthy_test_indices_filepath,
)

# Scaled counterparts of the same three sets, as provided by the data wrapper.
(
    scaled_healthy_train_data,
    scaled_healthy_test_data,
    scaled_cancer_data,
) = data_object.get_scaled_healthy_train_test_cancer_data()

In [40]:
# "Cut by max" variants of the three sets, using the configured threshold.
(
    cut_by_max_healthy_train_data,
    cut_by_max_healthy_test_data,
    cut_by_max_cancer_data,
) = data_object.get_cut_by_max_healthy_train_test_cancer_data(CUT_BY_MAX_THRESHOLD)

# Ask the PCA helper for the PCA_VARIABLES_AMOUNT most important variables of PC1.
pca_object = PCADimensionReduction(
    cut_by_max_healthy_train_data,
    scaled_healthy_train_data,
    data_object.train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)

# Select the same variables (the index of `pca_variables`) from every
# cut-by-max set so train, test and cancer data share identical columns.
(
    pca_reduced_healthy_train_data,
    pca_reduced_healthy_test_data,
    pca_reduced_cancer_data,
) = (
    cut_by_max_set[pca_variables.index]
    for cut_by_max_set in (
        cut_by_max_healthy_train_data,
        cut_by_max_healthy_test_data,
        cut_by_max_cancer_data,
    )
)

In [41]:
def create_autoencoder(input_shape, hidden_units=1000, bottleneck_units=500):
    """Build and compile a symmetric, fully connected autoencoder.

    The layer widths are
    ``input_shape -> hidden_units -> bottleneck_units -> hidden_units -> input_shape``.
    The three inner layers use ReLU; the output layer has no activation
    argument, so the reconstruction is not squashed into a fixed range.

    Parameters
    ----------
    input_shape : int
        Number of input features. Despite the name this is a single integer
        (it is passed to Keras as ``input_dim``), e.g. ``data.shape[1]``.
    hidden_units : int, optional
        Width of the first and last hidden layers. Defaults to 1000.
    bottleneck_units : int, optional
        Width of the middle layer. Defaults to 500. The network only
        compresses when this is smaller than ``input_shape``; for narrow
        inputs (for example a few dozen selected variables) pass a smaller
        value, otherwise the middle layer is wider than the input.

    Returns
    -------
    keras.models.Sequential
        Model compiled with mean-squared-error loss and the Adam optimizer,
        ready to be fitted with the same array as input and target.
    """
    model = Sequential([
        Dense(hidden_units, input_dim=input_shape, activation='relu'),
        Dense(bottleneck_units, activation='relu'),  # middle (code) layer
        Dense(hidden_units, activation='relu'),
        Dense(input_shape),  # reconstruction, same width as the input
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

## Autoencoder

### Regular data

In [42]:
# Fit on the regular healthy training data; the same data is used as input
# and target, so the model is trained to reproduce what it is given.
autoencoder = create_autoencoder(input_shape=healthy_train_data.shape[1])
autoencoder.fit(
    x=healthy_train_data,
    y=healthy_train_data,
    batch_size=16,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb3ec261700>

In [43]:
# Print RMSE for the healthy train set, healthy test set and cancer data.
display_autoencoder_metrics(
    autoencoder,
    healthy_train_data,
    healthy_test_data,
    cancer_data,
)

Train RMSE: 0.2469018769592623
Test RMSE: 0.34020545621509285
Cancer data RMSE: 0.5450958578847498


### Scaled data

In [44]:
# Fresh model fitted on the scaled healthy training data (input == target).
autoencoder = create_autoencoder(input_shape=scaled_healthy_train_data.shape[1])
autoencoder.fit(
    x=scaled_healthy_train_data,
    y=scaled_healthy_train_data,
    batch_size=16,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb3b939fa30>

In [45]:
# Print RMSE for the scaled healthy train set, healthy test set and cancer data.
display_autoencoder_metrics(
    autoencoder,
    scaled_healthy_train_data,
    scaled_healthy_test_data,
    scaled_cancer_data,
)

Train RMSE: 0.7125836506939516
Test RMSE: 0.9494251561971796
Cancer data RMSE: 6.559844003766266


### Cut by max data

In [46]:
# Fresh model fitted on the cut-by-max healthy training data (input == target).
autoencoder = create_autoencoder(input_shape=cut_by_max_healthy_train_data.shape[1])
autoencoder.fit(
    x=cut_by_max_healthy_train_data,
    y=cut_by_max_healthy_train_data,
    batch_size=16,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb39c735dc0>

In [47]:
# Print RMSE for the cut-by-max healthy train set, healthy test set and cancer data.
display_autoencoder_metrics(
    autoencoder,
    cut_by_max_healthy_train_data,
    cut_by_max_healthy_test_data,
    cut_by_max_cancer_data,
)

Train RMSE: 0.05394235689897675
Test RMSE: 0.0695097391855808
Cancer data RMSE: 0.17538660760323424


### PCA-reduced data

In [48]:
# Fresh model fitted on the PCA-selected columns of the cut-by-max healthy
# training data (input == target).
autoencoder = create_autoencoder(input_shape=pca_reduced_healthy_train_data.shape[1])
autoencoder.fit(
    x=pca_reduced_healthy_train_data,
    y=pca_reduced_healthy_train_data,
    batch_size=16,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb39c2ea910>

In [49]:
# Print RMSE for the PCA-reduced healthy train set, healthy test set and cancer data.
display_autoencoder_metrics(
    autoencoder,
    pca_reduced_healthy_train_data,
    pca_reduced_healthy_test_data,
    pca_reduced_cancer_data,
)

Train RMSE: 0.08836637308646005
Test RMSE: 0.10162575061382327
Cancer data RMSE: 0.1989458956637712
