In [28]:
from data import Data
from dimension_reduction import PCADimensionReduction
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.ensemble import BalancedRandomForestClassifier
from statistics import *
from utils import *
from pandas import *
import numpy as np
import matplotlib.pyplot as plt

In [29]:
# Input files. Point these four paths at another dataset's files to analyse it instead.
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"
# SEED is handed to PCADimensionReduction in a later cell.
SEED = 42
# NOTE(review): FOLD_NUMBER is not referenced by any cell visible in this section — confirm it is still needed.
FOLD_NUMBER = 3

# Tunable preprocessing parameters; adjust them if the models benefit from other values.
# CUT_BY_MAX_THRESHOLD is passed to Data.get_cut_by_max_train_test_data,
# PCA_VARIABLES_AMOUNT to PCADimensionReduction.get_most_important_variables_from_pc1.
# NOTE(review): the recorded output of the PCA-reduced variance cell lists 40 rows, not 60 —
# the stored outputs may be stale or the helper may return fewer variables; confirm.
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# Four data variants are examined below: regular, scaled, cut by max, and PCA reduced + cut by max.
data_object = Data(data_filepath, true_results_filepath)
# Uncomment the next line to generate the train and test split index files in the main folder:
#data_object.generate_train_test_split()
# Load the stored split (must happen before the scaled variant is requested from the same object).
train_data, test_data, train_true_results, test_true_results =  data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [30]:
# "Cut by max" variant: requested from the Data helper with the configured threshold.
cut_by_max_train_data, cut_by_max_test_data = (
    data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)
)

# "PCA reduced + cut by max" variant: keep only the columns whose labels appear in the
# index of the result returned by get_most_important_variables_from_pc1.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)
selected_variables = pca_variables.index  # one set of column labels shared by train and test
pca_reduced_train_data = cut_by_max_train_data[selected_variables]
pca_reduced_test_data = cut_by_max_test_data[selected_variables]

In [31]:
# Flatten the training labels once, then collect the row positions of each class:
# label 1 -> cancer cells, label 0 -> healthy cells.
train_labels_flat = train_true_results.values.ravel()
cancer_cells_indices = np.flatnonzero(train_labels_flat == 1)
healthy_cells_indices = np.flatnonzero(train_labels_flat == 0)

### Regular data

In [32]:
# Split the regular training rows into the two classes by positional index.
cancer_cells, healthy_cells = (
    train_data.iloc[cancer_cells_indices],
    train_data.iloc[healthy_cells_indices],
)

In [33]:
# variance_between is a project helper brought in by one of the star imports at the top
# of the notebook. Per the recorded output of this cell it yields one float64 value per
# column label (e.g. CXCL8, VIM) of the regular training data — confirm its exact definition.
# NOTE(review): if `statistics` in the import cell is the standard-library module, this
# assignment shadows its `variance` function — confirm that is intended.
variance = variance_between(cancer_cells, healthy_cells)

# Bare last expression so the notebook renders the result as the cell output.
variance

CXCL8     0.000302
VIM       0.000029
LCN2      0.000657
IL1B      0.000004
S100A4    0.000156
            ...   
STC1      0.000002
MFAP3L    0.000007
RPL27A    0.001653
MACF1     0.000034
SPC24     0.000024
Length: 2000, dtype: float64

In [34]:
# Summarise the per-variable values held in `variance` (regular data).
# Series.max() replaces the builtin max(): it is vectorised and, like Series.mean(),
# skips NaN entries, whereas builtin max() over a Series is order-dependent when NaN
# is present. For NaN-free data the printed value is unchanged.
print("mean variance: ", variance.mean())
print("max variance: ", variance.max())

mean variance:  9.62591022402786e-05
max variance:  0.005935205926522863


### Scaled data

In [35]:
# Split the scaled training rows into the two classes by positional index.
cancer_cells, healthy_cells = (
    scaled_train_data.iloc[cancer_cells_indices],
    scaled_train_data.iloc[healthy_cells_indices],
)

In [36]:
# Same project helper as for the regular data, now applied to the scaled groups.
# Per the recorded output of this cell it yields one float64 value per column label —
# confirm the helper's exact definition.
# NOTE(review): if `statistics` in the import cell is the standard-library module, this
# assignment shadows its `variance` function — confirm that is intended.
variance = variance_between(cancer_cells, healthy_cells)

# Bare last expression so the notebook renders the result as the cell output.
variance

CXCL8     0.000425
VIM       0.000024
LCN2      0.043719
IL1B      0.000007
S100A4    0.000078
            ...   
STC1      0.005390
MFAP3L    0.003950
RPL27A    0.008797
MACF1     0.000098
SPC24     0.006763
Length: 2000, dtype: float64

In [37]:
# Summarise the per-variable values held in `variance` (scaled data).
# Series.max() replaces the builtin max(): it is vectorised and, like Series.mean(),
# skips NaN entries, whereas builtin max() over a Series is order-dependent when NaN
# is present. For NaN-free data the printed value is unchanged.
print("mean variance: ", variance.mean())
print("max variance: ", variance.max())

mean variance:  0.006867163227153484
max variance:  0.5500559459835836


### Cut by max data

In [38]:
# Split the cut-by-max training rows into the two classes by positional index.
cancer_cells, healthy_cells = (
    cut_by_max_train_data.iloc[cancer_cells_indices],
    cut_by_max_train_data.iloc[healthy_cells_indices],
)

In [39]:
# Same project helper as for the other variants, now applied to the cut-by-max groups.
# Per the recorded output of this cell it yields one float64 value per remaining column
# label (66 in the recorded run) — confirm the helper's exact definition.
# NOTE(review): if `statistics` in the import cell is the standard-library module, this
# assignment shadows its `variance` function — confirm that is intended.
variance = variance_between(cancer_cells, healthy_cells)

# Bare last expression so the notebook renders the result as the cell output.
variance

CXCL8     0.000302
VIM       0.000029
S100A4    0.000156
LTB       0.000161
ISG15     0.000003
            ...   
DUSP2     0.000135
TMSB4X    0.000014
RPS29     0.000041
ANXA1     0.000183
RPL27A    0.001653
Length: 66, dtype: float64

In [40]:
# Summarise the per-variable values held in `variance` (cut-by-max data).
# Series.max() replaces the builtin max(): it is vectorised and, like Series.mean(),
# skips NaN entries, whereas builtin max() over a Series is order-dependent when NaN
# is present. For NaN-free data the printed value is unchanged.
print("mean variance: ", variance.mean())
print("max variance: ", variance.max())

mean variance:  0.00024161382140899213
max variance:  0.005935205926522863


### PCA reduced + cut by max data

In [41]:
# Split the PCA-reduced (cut-by-max) training rows into the two classes by positional index.
cancer_cells, healthy_cells = (
    pca_reduced_train_data.iloc[cancer_cells_indices],
    pca_reduced_train_data.iloc[healthy_cells_indices],
)

In [42]:
# Same project helper as for the other variants, now applied to the PCA-reduced groups.
# Per the recorded output of this cell it yields one float value per selected column
# label — confirm the helper's exact definition.
# NOTE(review): if `statistics` in the import cell is the standard-library module, this
# assignment shadows its `variance` function — confirm that is intended.
variance = variance_between(cancer_cells, healthy_cells)

# Bare last expression so the notebook renders the result as the cell output.
variance

S100A9      3.625990e-06
CST3        6.132040e-06
CTSS        4.507416e-05
LST1        3.035642e-05
LGALS1      1.318114e-05
S100A4      1.558836e-04
S100A6      1.281168e-06
FTL         3.786798e-04
NEAT1       4.514401e-05
SAT1        4.891578e-04
PSAP        9.918821e-05
SRGN        5.073070e-04
LTB         1.612938e-04
VCAN        5.546489e-08
COTL1       3.957597e-05
FTH1        8.423000e-06
IL32        3.360936e-04
CYBA        8.545211e-06
NFKBIA      1.280464e-04
CD74        1.105384e-04
HLA-DQB1    4.740188e-05
VIM         2.937939e-05
GAPDH       2.082741e-04
ANXA1       1.834821e-04
FOS         8.377908e-05
CXCL8       3.019382e-04
RPS29       4.141385e-05
ACTB        1.552720e-05
KLF6        2.327807e-05
RPS6        6.253874e-05
MT-ND1      4.111781e-04
MALAT1      5.935206e-03
MT-ATP6     1.834002e-04
IFITM3      5.056730e-05
MT-ND4      1.299877e-04
ISG20       1.365868e-06
RPS18       9.166749e-05
RPS23       1.125777e-03
RNASET2     1.286398e-04
RPL27A      1.653089e-03


In [43]:
# Summarise the per-variable values held in `variance` (PCA reduced + cut-by-max data).
# Series.max() replaces the builtin max(): it is vectorised and, like Series.mean(),
# skips NaN entries, whereas builtin max() over a Series is order-dependent when NaN
# is present. For NaN-free data the printed value is unchanged.
print("mean variance: ", variance.mean())
print("max variance: ", variance.max())

mean variance:  0.00025256332041698534
max variance:  0.005935205926522863
