In [2]:
from data import Data
from dimension_reduction import PCADimensionReduction
from utils import *

In [10]:
# Input files -- point these at another dataset to repeat the analysis on it.
data_filepath = "data/SC_integration/counts_ctc_simulated_123_5k.tsv"
true_results_filepath = "data/SC_integration/ids_ctc_simulated_123_5k.tsv"
train_indices_filepath = "data/SC_integration/train_indices.npy"
test_indices_filepath = "data/SC_integration/test_indices.npy"

# Run-wide settings.
SEED = 42
# NOTE(review): FOLD_NUMBER is not used by any cell shown in this part of the
# notebook -- confirm it is still needed.
FOLD_NUMBER = 3

# Preprocessing parameters; tune them to suit the models being evaluated.
CUT_BY_MAX_THRESHOLD = 4
PCA_VARIABLES_AMOUNT = 60

# Four data variants are examined: regular, scaled, cut by max, and
# PCA-reduced + cut by max. This cell prepares the regular and scaled ones.
data_object = Data(data_filepath, true_results_filepath)
(
    train_data,
    test_data,
    train_true_results,
    test_true_results,
) = data_object.load_train_test_split(train_indices_filepath, test_indices_filepath)
scaled_train_data, scaled_test_data = data_object.get_scaled_train_test_data()

In [11]:
# Build the cut-by-max variant of the train/test data with the configured threshold.
cut_by_max_train_data, cut_by_max_test_data = (
    data_object.get_cut_by_max_train_test_data(CUT_BY_MAX_THRESHOLD)
)

# Select PCA_VARIABLES_AMOUNT variables through the PCA helper
# (presumably the strongest contributors to the first principal component --
# verify in dimension_reduction).
pca_object = PCADimensionReduction(
    cut_by_max_train_data,
    scaled_train_data,
    train_true_results,
    SEED,
)
pca_variables = pca_object.get_most_important_variables_from_pc1(PCA_VARIABLES_AMOUNT)

# Restrict both cut-by-max splits to the selected variables (train first, then test).
pca_reduced_train_data, pca_reduced_test_data = (
    subset[pca_variables.index]
    for subset in (cut_by_max_train_data, cut_by_max_test_data)
)

## Calculate statistics

### Regular data

In [12]:
# Statistics for the regular training data; only rows whose p-value is
# below 0.05 are kept, and the filtered table is displayed.
statistics = calculate_statistics(
    train_data,
    train_true_results.values.ravel(),
).loc[lambda table: table["p_values"] < 0.05]
statistics

Unnamed: 0,ctc_variance,other_cells_variance,ctc_mean,other_cells_mean,ctc_std,other_cells_std,variance_check,t_test,p_values,information_gain
LYPD1,0.001640,0.000492,-0.041227,0.000403,0.040492,0.022191,True,10.887897,4.091244e-27,0.034543
CTSV,0.011708,0.003820,-0.109960,0.003860,0.108205,0.061803,True,10.704494,2.801127e-26,0.051825
PLXNB1,0.002936,0.000773,-0.048202,0.000505,0.054181,0.027810,True,10.138703,8.769759e-24,0.033745
CLGN,0.003087,0.001039,-0.049504,0.000966,0.055561,0.032228,True,9.107243,1.484111e-19,0.027640
RBP1,0.001212,0.000422,-0.027049,0.000373,0.034817,0.020541,True,7.767840,1.079962e-14,0.033527
...,...,...,...,...,...,...,...,...,...,...
RPL27A,1.185831,0.174752,3.588467,3.971691,1.088959,0.418033,False,2.050326,4.808014e-02,0.008283
RNF144B,0.222263,0.122330,0.207723,0.089673,0.471448,0.349758,True,1.975582,4.829148e-02,0.048109
KLF9,0.004032,0.126510,0.073149,0.098617,0.063494,0.355683,False,2.012080,4.856245e-02,0.057874
KIF3B,0.085274,0.045390,-0.036723,0.035041,0.292017,0.213049,True,1.970936,4.882088e-02,0.052465


### Scaled data

In [13]:
# Statistics for the scaled training data; only rows whose p-value is
# below 0.05 are kept, and the filtered table is displayed.
statistics = calculate_statistics(
    scaled_train_data,
    train_true_results.values.ravel(),
).loc[lambda table: table["p_values"] < 0.05]
statistics

  t_statistics = np.where(statistics["variance_check"], stats.ttest_ind(healthy_cells_data, cancer_cells_data, equal_var = True).statistic, stats.ttest_ind(healthy_cells_data, cancer_cells_data, equal_var = False).statistic)
  p_values = np.where(statistics["variance_check"], stats.ttest_ind(healthy_cells_data, cancer_cells_data, equal_var = True).pvalue, stats.ttest_ind(healthy_cells_data, cancer_cells_data, equal_var = False).pvalue)


Unnamed: 0,ctc_variance,other_cells_variance,ctc_mean,other_cells_mean,ctc_std,other_cells_std,variance_check,t_test,p_values,information_gain
LYPD1,3.122861,0.937968,-1.796151,0.020686,1.767162,0.968487,True,10.887897,4.091244e-27,0.034543
CTSV,2.887166,0.941893,-1.766992,0.020350,1.699166,0.970511,True,10.704494,2.801127e-26,0.051825
PLXNB1,3.559378,0.937773,-1.676702,0.019310,1.886631,0.968387,True,10.138703,8.769759e-24,0.033745
CLGN,2.830516,0.952326,-1.510854,0.017400,1.682414,0.975872,True,9.107243,1.484111e-19,0.027640
RBP1,2.758942,0.960257,-1.293294,0.014895,1.661006,0.979927,True,7.767840,1.079962e-14,0.033527
...,...,...,...,...,...,...,...,...,...,...
RPL27A,6.310406,0.929942,-0.873970,0.010065,2.512052,0.964335,False,2.050326,4.808014e-02,0.008283
RNF144B,1.797880,0.989527,0.331925,-0.003823,1.340851,0.994750,True,1.975582,4.829148e-02,0.048272
KLF9,0.032221,1.011087,-0.071180,0.000820,0.179501,1.005528,False,2.012080,4.856245e-02,0.057874
KIF3B,1.857737,0.988844,-0.331145,0.003814,1.362988,0.994406,True,1.970936,4.882088e-02,0.052465


### Cut by max data

In [14]:
# Statistics for the cut-by-max training data; only rows whose p-value is
# below 0.05 are kept, and the filtered table is displayed.
statistics = calculate_statistics(
    cut_by_max_train_data,
    train_true_results.values.ravel(),
).loc[lambda table: table["p_values"] < 0.05]
statistics

Unnamed: 0,ctc_variance,other_cells_variance,ctc_mean,other_cells_mean,ctc_std,other_cells_std,variance_check,t_test,p_values,information_gain
HIST1H2AC,0.006935,0.113099,0.006855,0.071128,0.083277,0.336302,False,4.138448,0.000142,0.050214
DUSP2,0.022698,0.826206,0.719691,0.610023,0.150659,0.908959,False,3.577834,0.000648,0.042063
RPLP1,0.455448,0.153533,4.264101,4.45417,0.674869,0.391833,True,2.821037,0.004818,0.0
MALAT1,2.402051,0.226634,5.587235,6.313378,1.549855,0.476061,False,2.730496,0.00994,0.007915
IRF7,0.003437,0.257767,0.149566,0.179275,0.058626,0.507707,False,2.178763,0.031405,0.06244
ID2,0.060887,0.945616,0.68271,0.783333,0.246752,0.972428,False,2.194713,0.033179,0.041578
RPL27A,1.185831,0.174752,3.588467,3.971691,1.088959,0.418033,False,2.050326,0.04808,0.008283


### PCA reduced + cut by max data

In [15]:
# Statistics for the PCA-reduced + cut-by-max training data; only rows whose
# p-value is below 0.05 are kept, and the filtered table is displayed.
statistics = calculate_statistics(
    pca_reduced_train_data,
    train_true_results.values.ravel(),
).loc[lambda table: table["p_values"] < 0.05]
statistics

Unnamed: 0,ctc_variance,other_cells_variance,ctc_mean,other_cells_mean,ctc_std,other_cells_std,variance_check,t_test,p_values,information_gain
DUSP2,0.022698,0.826206,0.719691,0.610023,0.150659,0.908959,False,3.577834,0.000648,0.042063
RPLP1,0.455448,0.153533,4.264101,4.45417,0.674869,0.391833,True,2.821037,0.004818,0.0
MALAT1,2.402051,0.226634,5.587235,6.313378,1.549855,0.476061,False,2.730496,0.00994,0.007915
IRF7,0.003437,0.257767,0.149566,0.179275,0.058626,0.507707,False,2.178763,0.031405,0.06244
ID2,0.060887,0.945616,0.68271,0.783333,0.246752,0.972428,False,2.194713,0.033179,0.041578
RPL27A,1.185831,0.174752,3.588467,3.971691,1.088959,0.418033,False,2.050326,0.04808,0.008283


## Logistic regression

In [16]:
# Logistic-regression feature importances: load the saved table, drop every
# row that contains a zero in any column, and show it from most to least important.
logistic_regression_features = (
    pd.read_csv("features/logistic_regression.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
logistic_regression_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
PLAT,1.147693
CENPF,1.006735
ESCO2,0.837701
MKI67,0.637532
SPDL1,0.627327
C15orf48,0.616605
SAA1,0.517257
KRT18,0.49419
HCFC1R1,0.432873
CDKN1C,0.380458


## PCA

In [17]:
# PCA feature importances: load the saved table, drop every row that contains
# a zero in any column, and show it from most to least important.
pca_features = (
    pd.read_csv("features/pca.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
pca_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
S100A9,2.397653e-01
CST3,2.179769e-01
CTSS,1.873466e-01
LST1,1.803768e-01
LGALS1,1.777827e-01
...,...
NEURL3,1.481562e-06
TMEM178B,1.138576e-06
CACNG8,1.076366e-06
GRID1,5.523284e-07


## SHAP

### Random forest regular data

In [18]:
# SHAP importances for the random forest on regular data: load the saved
# table, drop rows containing a zero in any column, and show it sorted descending.
shap_random_forest_regular_data_features = (
    pd.read_csv("features/shap_random_forest_regular_data.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
shap_random_forest_regular_data_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,5.006528e-04
BIK,3.646569e-04
MCM8,3.257085e-04
BAIAP2,3.186418e-04
SHCBP1,3.045842e-04
...,...
RHOB,4.050730e-07
PADI2,4.049146e-07
TMEM125,3.244357e-07
HIST3H2A,3.244111e-07


### Random forest scaled data

In [19]:
# SHAP importances for the random forest on scaled data: load the saved
# table, drop rows containing a zero in any column, and show it sorted descending.
# NOTE(review): the saved output of this cell matches the regular-data SHAP
# table row for row -- confirm this CSV is not a copy of the regular-data file.
shap_random_forest_scaled_data_features = (
    pd.read_csv("features/shap_random_forest_scaled_data.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
shap_random_forest_scaled_data_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,5.006528e-04
BIK,3.646569e-04
MCM8,3.257085e-04
BAIAP2,3.186418e-04
SHCBP1,3.045842e-04
...,...
RHOB,4.050730e-07
PADI2,4.049146e-07
TMEM125,3.244357e-07
HIST3H2A,3.244111e-07


### Balanced random forest regular data

In [20]:
# SHAP importances for the balanced random forest on regular data: load the
# saved table, drop rows containing a zero in any column, and show it sorted descending.
shap_balanced_random_forest_regular_data_features = (
    pd.read_csv("features/shap_balanced_random_forest_regular_data.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
shap_balanced_random_forest_regular_data_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,0.011207
GATM,0.007665
CD38,0.006735
NINJ2,0.006714
MCM8,0.006042
...,...
FOSL2,0.000005
PLEKHF1,0.000005
DRD4,0.000005
NAGA,0.000005


### Balanced random forest scaled data

In [21]:
# SHAP importances for the balanced random forest on scaled data: load the
# saved table, drop rows containing a zero in any column, and show it sorted descending.
# NOTE(review): the saved output of this cell matches the regular-data SHAP
# table row for row -- confirm this CSV is not a copy of the regular-data file.
shap_balanced_random_forest_scaled_data_features = (
    pd.read_csv("features/shap_balanced_random_forest_scaled_data.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
shap_balanced_random_forest_scaled_data_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,0.011207
GATM,0.007665
CD38,0.006735
NINJ2,0.006714
MCM8,0.006042
...,...
FOSL2,0.000005
PLEKHF1,0.000005
DRD4,0.000005
NAGA,0.000005


## XGBoost

In [22]:
# XGBoost importances (PCA-reduced data, per the file name): load the saved
# table, drop rows containing a zero in any column, and show it sorted descending.
pca_reduced_xgboost_features = (
    pd.read_csv("features/pca_reduced_xgboost.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
pca_reduced_xgboost_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
HES1,0.248787
IFITM3,0.237116
CMC1,0.097767
CTSS,0.080122
PSAP,0.071914
HLA-DQB1,0.060904
IFI6,0.058478
IRF7,0.057819
TMSB4X,0.057274
DUSP2,0.017442


## LightGBM

In [23]:
# LightGBM importances (PCA-reduced data, per the file name): load the saved
# table, drop rows containing a zero in any column, and show it sorted descending.
pca_reduced_lightgbm_features = (
    pd.read_csv("features/pca_reduced_lightgbm.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
pca_reduced_lightgbm_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
IRF7,226
S100A9,165
CMC1,157
HES1,132
PSAP,77
FOS,69
CXCL8,63
ACTB,42
RPS23,42
CST3,38


## Random forest

### Regular data

In [24]:
# Random-forest importances on regular data: load the saved table, drop rows
# containing a zero in any column, and show it sorted descending.
random_forest_regular_features = (
    pd.read_csv("features/regular_random_forest.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,0.022015
BIK,0.017004
MCM8,0.015523
DTL,0.015121
BAIAP2,0.014849
...,...
KIF1C,0.000020
TBC1D12,0.000020
HSPA1A,0.000019
KRT18,0.000018


### Scaled data

In [25]:
# Random-forest importances on scaled data: load the saved table, drop rows
# containing a zero in any column, and show it sorted descending.
# NOTE(review): the saved output of this cell matches the regular-data
# random-forest table row for row -- confirm this CSV is not a copy of that file.
random_forest_scaled_features = (
    pd.read_csv("features/scaled_random_forest.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
random_forest_scaled_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,0.022015
BIK,0.017004
MCM8,0.015523
DTL,0.015121
BAIAP2,0.014849
...,...
KIF1C,0.000020
TBC1D12,0.000020
HSPA1A,0.000019
KRT18,0.000018


## Balanced random forest

### Regular data

In [26]:
# Balanced-random-forest importances on regular data: load the saved table,
# drop rows containing a zero in any column, and show it sorted descending.
balanced_random_forest_regular_features = (
    pd.read_csv("features/regular_balanced_random_forest.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
balanced_random_forest_regular_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,0.022168
GATM,0.015636
CD38,0.013944
NINJ2,0.013250
MCM8,0.011892
...,...
TFRC,0.000016
ACVRL1,0.000015
FOXJ1,0.000014
ISYNA1,0.000014


### Scaled data

In [27]:
# Balanced-random-forest importances on scaled data: load the saved table,
# drop rows containing a zero in any column, and show it sorted descending.
# NOTE(review): the saved output of this cell matches the regular-data
# balanced-random-forest table row for row -- confirm this CSV is not a copy of that file.
balanced_random_forest_scaled_features = (
    pd.read_csv("features/scaled_balanced_random_forest.csv", index_col=0)
    .loc[lambda frame: (frame != 0).all(axis=1)]
)
balanced_random_forest_scaled_features.sort_values(by="feature_importance_vals", ascending=False)

Unnamed: 0_level_0,feature_importance_vals
col_name,Unnamed: 1_level_1
SH2D4A,0.022168
GATM,0.015636
CD38,0.013944
NINJ2,0.013250
MCM8,0.011892
...,...
TFRC,0.000016
ACVRL1,0.000015
FOXJ1,0.000014
ISYNA1,0.000014


## Autoencoder

In [3]:
# Autoencoder feature importances: load the saved table and show it from most
# to least important.
# NOTE(review): unlike the other feature tables in this notebook, no
# zero-value row filter is applied here -- confirm that is intentional.
autoencoder_features = pd.read_csv(
    "features/autoencoder_new.csv",
    index_col=0,
)
autoencoder_features.sort_values("feature_importance_vals", ascending=False)

Unnamed: 0,feature_importance_vals
ESCO2,161.343941
UCHL1,81.589122
CCDC80,77.027331
PLAU,62.571733
TM4SF1,57.958698
...,...
HAS2-AS1,0.051351
CSF2,0.043949
FAM181B,0.038896
SLC6A14,0.036894
