In [1]:
from data import Data
from dimension_reduction import PCADimensionReduction
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.ensemble import BalancedRandomForestClassifier
from statistics import *
from utils import *
from pandas import *
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# --- Input files: swap these paths to analyse one of the other datasets ---
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# --- Run settings ---
# SEED is forwarded to PCADimensionReduction further down.
SEED = 42
# NOTE(review): FOLD_NUMBER is not referenced in the cells visible here;
# presumably a cross-validation fold count used elsewhere - confirm.
FOLD_NUMBER = 3

# --- Tunable preprocessing parameters (adjust to suit the models) ---
CUT_BY_MAX_THRESHOLD = 4   # passed to get_cut_by_max_train_test_data
PCA_VARIABLES_AMOUNT = 60  # passed to get_most_important_variables_from_pc1

# Four data variants are examined in this notebook:
# regular, scaled, cut by max, and PCA reduced + cut by max.
data_object = Data(data_filepath, true_results_filepath)

# To regenerate the train/test split index files (written to the main folder),
# run this once:
#   data_object.generate_train_test_split()
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)

scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [3]:
# "Cut by max" variant of the train/test data, controlled by CUT_BY_MAX_THRESHOLD.
(
    cut_by_max_train_data,
    cut_by_max_test_data,
) = data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)

# Ask the PCA helper for the PCA_VARIABLES_AMOUNT most important variables of PC1.
# NOTE(review): the helper receives both the cut-by-max and the scaled training
# data; how it uses each one is defined in dimension_reduction - confirm there.
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)

# Keep only the columns named by pca_variables.index, in train and test alike.
pca_reduced_train_data, pca_reduced_test_data = (
    frame[pca_variables.index]
    for frame in (cut_by_max_train_data, cut_by_max_test_data)
)

In [4]:
# Positional row indices of each class in the training labels
# (label 1 -> cancer cells, label 0 -> healthy cells).
cancer_cells_indices = np.flatnonzero(train_true_results.values.ravel() == 1)
healthy_cells_indices = np.flatnonzero(train_true_results.values.ravel() == 0)

### Regular data

In [5]:
# Split the regular training data by class, using the positional indices
# computed from the training labels.
cancer_cells, healthy_cells = (
    train_data.iloc[rows]
    for rows in (cancer_cells_indices, healthy_cells_indices)
)

In [6]:
# Compare the cancer and healthy groups of the regular training data.
# variance_between is a project helper brought in by a star import; judging by
# the cell output it returns a float Series with one value per feature (gene).
# NOTE(review): the exact statistic it computes is not visible here - confirm.
# NOTE(review): if `from statistics import *` resolves to the standard library,
# this assignment shadows statistics.variance - confirm which module is meant.
variance = variance_between(cancer_cells, healthy_cells)

# Bare expression so the notebook displays the result.
variance

CXCL8     0.000302
VIM       0.000029
LCN2      0.000657
IL1B      0.000004
S100A4    0.000156
            ...   
STC1      0.000002
MFAP3L    0.000007
RPL27A    0.001653
MACF1     0.000034
SPC24     0.000024
Length: 2000, dtype: float64

In [7]:
# Summary of the per-feature values for the regular data.
# Series.max() is used instead of the builtin max(): it skips NaN the same way
# Series.mean() does (builtin max gives an order-dependent result when NaN is
# present) and it cannot be shadowed by one of the notebook's star imports.
variance_mean = variance.mean()
print("mean variance: ", variance_mean)
print("max variance: ", variance.max())
# Summing the boolean mask counts the matches without building a filtered Series.
print(
    "number of features with variance higher than mean variance: ",
    int((variance > variance_mean).sum()),
)

mean variance:  9.62591022402786e-05
max variance:  0.005935205926522863
number of features with variance higher than mean variance:  443


In [8]:
# Features of the regular data ranked from highest to lowest value; show the first 50.
variance.sort_values(ascending=False).iloc[:50]

MALAT1      0.005935
NME2        0.002186
MKI67       0.002081
TUBA1C      0.001905
MT2A        0.001792
RPL27A      0.001653
YWHAE       0.001552
MYOF        0.001538
C15orf48    0.001536
HNRNPAB     0.001479
HCFC1R1     0.001404
CENPF       0.001395
TOP2A       0.001369
HIST1H1C    0.001298
PLAT        0.001230
UBE2C       0.001217
ANXA4       0.001177
KRT18       0.001159
UGCG        0.001137
RPS23       0.001126
SLC25A37    0.001101
CKAP2       0.001097
LIMS1       0.001038
HIST1H4C    0.001031
TLN1        0.001017
ADI1        0.001007
GNAI2       0.000988
CALB1       0.000980
AKAP12      0.000973
PCNA        0.000951
MT1E        0.000947
OSTC        0.000943
TGFBI       0.000943
MCM3        0.000931
HMGN2       0.000919
MARCKS      0.000915
SPDL1       0.000869
ACAT1       0.000858
LMO4        0.000823
VMA21       0.000790
KIF1C       0.000788
C12orf75    0.000785
CXCL1       0.000775
TARS        0.000760
PLIN3       0.000759
SERPINA1    0.000754
FNDC3B      0.000747
TPX2        0

### Scaled data

In [9]:
# Split the scaled training data by class, using the positional indices
# computed from the training labels.
cancer_cells, healthy_cells = (
    scaled_train_data.iloc[rows]
    for rows in (cancer_cells_indices, healthy_cells_indices)
)

In [10]:
# Same comparison as for the regular data, now on the scaled training data.
# Rebinds `variance`, so the cells below report on the scaled variant.
# NOTE(review): the statistic computed by variance_between is defined outside
# this notebook - confirm.
variance = variance_between(cancer_cells, healthy_cells)

# Bare expression so the notebook displays the result.
variance

CXCL8     0.000425
VIM       0.000024
LCN2      0.043719
IL1B      0.000007
S100A4    0.000078
            ...   
STC1      0.005390
MFAP3L    0.003950
RPL27A    0.008797
MACF1     0.000098
SPC24     0.006763
Length: 2000, dtype: float64

In [11]:
# Summary of the per-feature values for the scaled data.
# Series.max() is used instead of the builtin max(): it skips NaN the same way
# Series.mean() does (builtin max gives an order-dependent result when NaN is
# present) and it cannot be shadowed by one of the notebook's star imports.
variance_mean = variance.mean()
print("mean variance: ", variance_mean)
print("max variance: ", variance.max())
# Summing the boolean mask counts the matches without building a filtered Series.
print(
    "number of features with variance higher than mean variance: ",
    int((variance > variance_mean).sum()),
)

mean variance:  0.006867163227153484
max variance:  0.5500559459835836
number of features with variance higher than mean variance:  246


In [12]:
# Features of the scaled data ranked from highest to lowest value; show the first 50.
variance.sort_values(ascending=False).iloc[:50]

OCLN         0.550056
CDC25C       0.527067
SH2D4A       0.507673
PRSS12       0.277217
CSF2         0.248620
PRR36        0.243397
CCDC68       0.241797
SLC24A3      0.217707
LINC01191    0.215271
DTL          0.193435
CSF3         0.193119
MIR100HG     0.182599
HOXB-AS3     0.182271
CDCP1        0.179400
THSD7A       0.179204
STOX2        0.165614
DDIT4L       0.161666
VEPH1        0.160810
SAA1         0.157522
PDLIM4       0.146138
TM4SF19      0.144832
GALNT16      0.134867
HEY1         0.131291
SPANXB1      0.127953
TCF7L1       0.125692
NEURL3       0.121615
PLAT         0.118786
CACNG8       0.117637
UBE2C        0.115166
CALB1        0.107055
UCHL1        0.105567
LIF          0.105383
ARHGAP23     0.101730
CXCL6        0.101544
CENPA        0.100989
CKAP2L       0.098388
SPINK6       0.096770
COL4A1       0.095130
MT1M         0.093393
MKI67        0.092062
ESCO2        0.089654
ANGPTL4      0.084612
RAB42        0.081899
AKAP12       0.080339
TOP2A        0.079634
CASC19    

### Cut by max data

In [13]:
# Split the cut-by-max training data by class, using the positional indices
# computed from the training labels.
cancer_cells, healthy_cells = (
    cut_by_max_train_data.iloc[rows]
    for rows in (cancer_cells_indices, healthy_cells_indices)
)

In [14]:
# Same comparison, now on the cut-by-max training data.
# Rebinds `variance`, so the cells below report on the cut-by-max variant.
# NOTE(review): the statistic computed by variance_between is defined outside
# this notebook - confirm.
variance = variance_between(cancer_cells, healthy_cells)

# Bare expression so the notebook displays the result.
variance

CXCL8     0.000302
VIM       0.000029
S100A4    0.000156
LTB       0.000161
ISG15     0.000003
            ...   
DUSP2     0.000135
TMSB4X    0.000014
RPS29     0.000041
ANXA1     0.000183
RPL27A    0.001653
Length: 66, dtype: float64

In [15]:
# Summary of the per-feature values for the cut-by-max data.
# Series.max() is used instead of the builtin max(): it skips NaN the same way
# Series.mean() does (builtin max gives an order-dependent result when NaN is
# present) and it cannot be shadowed by one of the notebook's star imports.
variance_mean = variance.mean()
print("mean variance: ", variance_mean)
print("max variance: ", variance.max())
# Summing the boolean mask counts the matches without building a filtered Series.
print(
    "number of features with variance higher than mean variance: ",
    int((variance > variance_mean).sum()),
)

mean variance:  0.00024161382140899213
max variance:  0.005935205926522863
number of features with variance higher than mean variance:  12


In [16]:
# Features of the cut-by-max data ranked from highest to lowest value; show the first 50.
variance.sort_values(ascending=False).iloc[:50]

MALAT1       0.005935
RPL27A       0.001653
RPS23        0.001126
SRGN         0.000507
SAT1         0.000489
GNG11        0.000431
MT-ND1       0.000411
RPLP1        0.000407
FTL          0.000379
IER2         0.000372
IL32         0.000336
CXCL8        0.000302
GAPDH        0.000208
MT-ND2       0.000186
ANXA1        0.000183
MT-ATP6      0.000183
ODC1         0.000163
LTB          0.000161
S100A4       0.000156
PFN1         0.000152
DUSP2        0.000135
MT-ND4       0.000130
RNASET2      0.000129
NFKBIA       0.000128
RPS24        0.000123
ID2          0.000114
CD74         0.000111
PSAP         0.000099
RPS11        0.000096
RPS18        0.000092
FOS          0.000084
RPL37        0.000075
HMGB2        0.000071
IFI6         0.000070
RPS6         0.000063
HES1         0.000060
ACTG1        0.000054
IFITM3       0.000051
HLA-DQB1     0.000047
HIST1H2AC    0.000046
NEAT1        0.000045
CTSS         0.000045
RPS29        0.000041
JUNB         0.000041
COTL1        0.000040
LST1      

### PCA reduced + cut by max data

In [17]:
# Split the PCA-reduced (cut-by-max) training data by class, using the
# positional indices computed from the training labels.
cancer_cells, healthy_cells = (
    pca_reduced_train_data.iloc[rows]
    for rows in (cancer_cells_indices, healthy_cells_indices)
)

In [18]:
# Same comparison, now on the PCA-reduced (cut-by-max) training data.
# Rebinds `variance`, so the cells below report on the PCA-reduced variant.
# NOTE(review): the statistic computed by variance_between is defined outside
# this notebook - confirm.
variance = variance_between(cancer_cells, healthy_cells)

# Bare expression so the notebook displays the result.
variance

S100A9      3.625990e-06
CST3        6.132040e-06
CTSS        4.507416e-05
LST1        3.035642e-05
LGALS1      1.318114e-05
S100A4      1.558836e-04
S100A6      1.281168e-06
FTL         3.786798e-04
NEAT1       4.514401e-05
SAT1        4.891578e-04
PSAP        9.918821e-05
SRGN        5.073070e-04
LTB         1.612938e-04
VCAN        5.546489e-08
COTL1       3.957597e-05
FTH1        8.423000e-06
IL32        3.360936e-04
CYBA        8.545211e-06
NFKBIA      1.280464e-04
CD74        1.105384e-04
HLA-DQB1    4.740188e-05
VIM         2.937939e-05
GAPDH       2.082741e-04
ANXA1       1.834821e-04
FOS         8.377908e-05
CXCL8       3.019382e-04
RPS29       4.141385e-05
ACTB        1.552720e-05
KLF6        2.327807e-05
RPS6        6.253874e-05
MT-ND1      4.111781e-04
MALAT1      5.935206e-03
MT-ATP6     1.834002e-04
IFITM3      5.056730e-05
MT-ND4      1.299877e-04
ISG20       1.365868e-06
RPS18       9.166749e-05
RPS23       1.125777e-03
RNASET2     1.286398e-04
RPL27A      1.653089e-03


In [19]:
# Summary of the per-feature values for the PCA-reduced (cut-by-max) data.
# Series.max() is used instead of the builtin max(): it skips NaN the same way
# Series.mean() does (builtin max gives an order-dependent result when NaN is
# present) and it cannot be shadowed by one of the notebook's star imports.
variance_mean = variance.mean()
print("mean variance: ", variance_mean)
print("max variance: ", variance.max())
# Summing the boolean mask counts the matches without building a filtered Series.
print(
    "number of features with variance higher than mean variance: ",
    int((variance > variance_mean).sum()),
)

mean variance:  0.00025256332041698534
max variance:  0.005935205926522863
number of features with variance higher than mean variance:  11


In [20]:
# Features of the PCA-reduced data ranked from highest to lowest value; show the first 50.
variance.sort_values(ascending=False).iloc[:50]

MALAT1      0.005935
RPL27A      0.001653
RPS23       0.001126
SRGN        0.000507
SAT1        0.000489
MT-ND1      0.000411
RPLP1       0.000407
FTL         0.000379
IER2        0.000372
IL32        0.000336
CXCL8       0.000302
GAPDH       0.000208
MT-ND2      0.000186
ANXA1       0.000183
MT-ATP6     0.000183
LTB         0.000161
S100A4      0.000156
PFN1        0.000152
DUSP2       0.000135
MT-ND4      0.000130
RNASET2     0.000129
NFKBIA      0.000128
RPS24       0.000123
ID2         0.000114
CD74        0.000111
PSAP        0.000099
RPS18       0.000092
FOS         0.000084
RPL37       0.000075
HMGB2       0.000071
IFI6        0.000070
RPS6        0.000063
HES1        0.000060
IFITM3      0.000051
HLA-DQB1    0.000047
NEAT1       0.000045
CTSS        0.000045
RPS29       0.000041
JUNB        0.000041
COTL1       0.000040
LST1        0.000030
VIM         0.000029
HLA-B       0.000029
KLF6        0.000023
ACTB        0.000016
JUN         0.000015
TMSB4X      0.000014
LGALS1      0