In [1]:
#!/usr/bin/env python

'''
DESCRIPTION
-----------
    Creating and exporting additional dataset from raw version
    
RETURN
------
    {DATASET}.pck : pck file
        pck version of file

EXPORTED FILE(s) LOCATION
-------------------------
    ./data/external/{EXPERIMENT}/{DATASET}.pck
'''

# importing default libraries
import os, argparse, sys
# sys.path.append('./')
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
os.chdir(ROOT_DIR)
sys.path.append(ROOT_DIR)
# importing scripts in scripts folder
from scripts import config as src
# importing default libraries
import pandas as pd
import numpy as np
import pyreadr # imported to read .rds files
import warnings
warnings.filterwarnings('ignore')

**** scripts/config.py IMPORTED!!!
**** PROJECT FOLDER ,  /home/pgundogdu/projects/signalization_prior_knowledge_based_nn


# MELANOMA
#### Creating the merge dataset from query and reference without Neg.cell samples. This created dataset will use in cross-validation benchmark analysis.

In [2]:
# experiment source
experiment = 'exper_melanoma'
loc = 'processed'
# the output location
loc_output = os.path.join(src.DIR_DATA, loc, experiment)
src.define_folder(loc_=loc_output)

FOLDER information,  /home/pgundogdu/projects/signalization_prior_knowledge_based_nn/data/processed/exper_melanoma/


'/home/pgundogdu/projects/signalization_prior_knowledge_based_nn/data/processed/exper_melanoma/'

In [3]:
df_query = pd.read_pickle(os.path.join(src.DIR_DATA, loc, experiment, 'query_log1p.pck'))
print(df_query.shape)
print(df_query.info())
df_query.head()

(3412, 17995)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3412 entries, 0 to 3411
Columns: 17995 entries, a1bg to cell_type
dtypes: float32(17994), object(1)
memory usage: 234.2+ MB
None


Unnamed: 0,a1bg,a1cf,a2m,a2ml1,a4galt,a4gnt,aaas,aacs,aadac,aadacl2,...,zwint,zxda,zxdb,zxdc,zyg11a,zyg11b,zyx,zzef1,zzz3,cell_type
0,3.292126,0.0,0.0,0.0,0.0,0.0,4.291691,2.209373,0.0,0.0,...,4.599051,0.0,2.819592,2.886475,1.701105,0.438255,3.349202,0.0,0.708036,Neg.cell
1,0.0,0.0,7.886006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neg.cell
2,0.0,0.0,6.300162,0.0,0.0,0.0,5.2867,0.0,0.0,0.0,...,0.0,0.0,0.0,2.066863,2.801541,1.358409,0.0,0.0,0.0,Neg.cell
3,0.0,0.0,6.9894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.495467,0.0,0.0,2.701361,0.0,3.999484,0.0,0.0,3.515121,Neg.cell
4,3.749504,0.0,0.0,1.401183,0.0,0.0,1.247032,0.0,0.0,0.0,...,0.0,0.0,0.0,0.536493,1.4884,1.280934,0.0,0.0,2.457021,Neg.cell


In [4]:
df_reference = pd.read_pickle(os.path.join(src.DIR_DATA, loc, experiment, 'reference_log1p.pck'))
print(df_reference.shape)
print(df_reference.info())
df_reference.head()

(2761, 17995)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2761 entries, 0 to 2760
Columns: 17995 entries, a1bg to cell_type
dtypes: float32(17994), object(1)
memory usage: 189.5+ MB
None


Unnamed: 0,a1bg,a1cf,a2m,a2ml1,a4galt,a4gnt,aaas,aacs,aadac,aadacl2,...,zwint,zxda,zxdb,zxdc,zyg11a,zyg11b,zyx,zzef1,zzz3,cell_type
0,1.877937,0.0,0.0,1.8453,0.0,0.0,1.560248,0.0,0.0,0.0,...,0.0,0.0,2.116256,3.42589,2.857045,2.290513,4.426761,2.416806,3.229618,B.cell
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.826129,2.874129,2.667228,0.0,0.0,0.0,B.cell
2,4.408912,0.0,0.0,0.837247,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.924259,2.323368,1.671473,0.0,0.0,0.0,B.cell
3,0.0,1.163151,1.627278,2.78192,0.0,0.0,0.0,1.355835,0.0,0.0,...,0.0,2.858193,2.241773,4.147411,4.946417,4.313748,3.084201,2.597491,4.20514,B.cell
4,0.0,0.270027,0.0,1.047319,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.879465,3.133754,1.187843,0.0,4.703657,0.0,B.cell


In [5]:
df_merged = pd.concat([df_query, df_reference], axis=0)
print(df_merged.shape)
print(df_merged.info())
df_merged.head()

(6173, 17995)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6173 entries, 0 to 2760
Columns: 17995 entries, a1bg to cell_type
dtypes: float32(17994), object(1)
memory usage: 423.8+ MB
None


Unnamed: 0,a1bg,a1cf,a2m,a2ml1,a4galt,a4gnt,aaas,aacs,aadac,aadacl2,...,zwint,zxda,zxdb,zxdc,zyg11a,zyg11b,zyx,zzef1,zzz3,cell_type
0,3.292126,0.0,0.0,0.0,0.0,0.0,4.291691,2.209373,0.0,0.0,...,4.599051,0.0,2.819592,2.886475,1.701105,0.438255,3.349202,0.0,0.708036,Neg.cell
1,0.0,0.0,7.886006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neg.cell
2,0.0,0.0,6.300162,0.0,0.0,0.0,5.2867,0.0,0.0,0.0,...,0.0,0.0,0.0,2.066863,2.801541,1.358409,0.0,0.0,0.0,Neg.cell
3,0.0,0.0,6.9894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.495467,0.0,0.0,2.701361,0.0,3.999484,0.0,0.0,3.515121,Neg.cell
4,3.749504,0.0,0.0,1.401183,0.0,0.0,1.247032,0.0,0.0,0.0,...,0.0,0.0,0.0,0.536493,1.4884,1.280934,0.0,0.0,2.457021,Neg.cell


In [6]:
## For cross-validation benchmark analysis, the merged dataset without Neg.cell samples will use.
df_merged = df_merged[df_merged['cell_type']!='Neg.cell'].reset_index(drop=True)

df_merged['cell_type'].value_counts()

T.CD8         1759
T.CD4          856
B.cell         818
Macrophage     420
NK              92
Name: cell_type, dtype: int64

In [7]:
df_merged.to_pickle(os.path.join(loc_output, 'query_reference_wo_negcell.pck') )
print('Experiment datasets are exported into ', os.path.join(loc_output, 'query_ref_log1p_wo_negcell.pck' ))

Experiment datasets are exported into  /home/pgundogdu/projects/signalization_prior_knowledge_based_nn/data/processed/exper_melanoma/query_ref_log1p_wo_negcell.pck


# MOUSE
#### Selecting specific cell types in retrieval dataset. This created dataset will use in cross-validation benchmark analysis.

In [8]:
# experiment source
experiment = 'exper_mouse'
loc = 'processed'
# the output location
loc_output = os.path.join(src.DIR_DATA, loc, experiment)
src.define_folder(loc_=loc_output)

FOLDER information,  /home/pgundogdu/projects/signalization_prior_knowledge_based_nn/data/processed/exper_mouse/


'/home/pgundogdu/projects/signalization_prior_knowledge_based_nn/data/processed/exper_mouse/'

In [9]:
df_retrieval = pd.read_pickle(os.path.join(src.DIR_DATA, loc, experiment, 'mouse_retrieval_sw_gw.pck'))
print(df_retrieval.shape)
print(df_retrieval.info())
df_retrieval.head()

(17293, 9438)
<class 'pandas.core.frame.DataFrame'>
Index: 17293 entries, GSM1112490_16cell_1-10_expression.txt to H9_old
Columns: 9438 entries, pycrl to cell_type
dtypes: float32(9437), object(1)
memory usage: 622.8+ MB
None


Unnamed: 0,pycrl,gpr180,gpr182,gpr183,neurl2,neurl4,mfhas1,vps53,vps52,lamc1,...,avpr1b,lcn6,cchcr1,lcn2,rps21,gpx2,bcr,scrt1,adck4,cell_type
GSM1112490_16cell_1-10_expression.txt,-0.044738,-0.376578,-0.285073,-0.213149,-0.225198,-0.362186,-0.473283,-0.084025,-0.292708,-0.45904,...,-0.426253,-0.236591,-0.342968,-0.032137,-0.615552,-0.214001,0.214279,-0.329544,-0.330209,16cell_
GSM1112491_16cell_1-11_expression.txt,0.741378,-0.38489,-0.285073,-0.213149,-0.170043,-0.337133,-0.473283,-0.438886,0.011198,-0.363705,...,-0.426253,-0.236591,-0.342968,-0.032137,-0.551292,-0.214001,-0.43381,-0.329544,-0.34195,16cell_
GSM1112492_16cell_1-12_expression.txt,0.203221,-0.38489,-0.285073,-0.213149,-0.225198,-0.480845,-0.473283,-0.458664,0.209056,-0.454481,...,-0.426253,-0.236591,-0.342968,-0.032137,-0.150011,-0.214001,-0.432487,-0.329544,1.632333,16cell_
GSM1112493_16cell_1-13_expression.txt,-0.445424,-0.329279,-0.285073,-0.213149,-0.225198,-0.478349,-0.473283,0.26006,-0.293903,-0.459193,...,-0.426253,-0.236591,-0.342968,-0.032137,-0.47595,-0.214001,-0.43381,-0.329544,1.034639,16cell_
GSM1112494_16cell_1-14_expression.txt,0.163697,-0.38489,-0.285073,-0.213149,-0.225198,-0.535874,-0.473283,0.003591,-0.292893,-0.43877,...,-0.426253,-0.236591,-0.215517,-0.032137,-0.129736,-0.214001,-0.43381,-0.329544,1.650134,16cell_


In [10]:
df_retrieval['cell_type_detail'] = df_retrieval['cell_type'].copy()
df_retrieval['cell_type'] = df_retrieval['cell_type'].str.split('_', expand=True)[0]

df_retrieval.loc[df_retrieval['cell_type'].isin(['E12', 'E14', 'adult']), 'cell_type'] = 'HSC'
df_retrieval.loc[df_retrieval['cell_type'].isin(['celebral-cortex', 'CNS', 'brain']), 'cell_type'] = 'neuron'
df_retrieval = df_retrieval.loc[df_retrieval['cell_type'].isin(['HSC','4cell','ICM','spleen','8cell','neuron','zygote','2cell','ESC'])]#.reset_index(drop=True)
df_retrieval.drop(columns=['cell_type_detail'], inplace=True)
df_retrieval

Unnamed: 0,pycrl,gpr180,gpr182,gpr183,neurl2,neurl4,mfhas1,vps53,vps52,lamc1,...,avpr1b,lcn6,cchcr1,lcn2,rps21,gpx2,bcr,scrt1,adck4,cell_type
GSM1112540_4cell_1-1_expression.txt,-0.445424,-0.298814,-0.285073,-0.213149,-0.225198,-0.532979,-0.473283,-0.426021,-0.293903,-0.444995,...,-0.426253,-0.236591,-0.342968,-0.032137,-1.011324,-0.214001,-0.263673,-0.329544,0.951015,4cell
GSM1112541_4cell_1-2_expression.txt,-0.084491,-0.295101,-0.285073,-0.213149,-0.207147,-0.509589,-0.473283,-0.161718,-0.266595,-0.206191,...,-0.426253,-0.236591,-0.342968,-0.032137,-1.005358,-0.214001,0.895929,-0.329544,0.294008,4cell
GSM1112542_4cell_1-4_expression.txt,-0.445424,-0.371865,-0.285073,-0.213149,-0.002378,-0.534219,-0.473283,-0.376667,-0.293903,-0.451526,...,-0.426253,-0.236591,-0.329719,-0.032137,-0.321212,-0.214001,0.246973,-0.329544,0.586017,4cell
GSM1112543_4cell_2-1_expression.txt,-0.444256,-0.384890,0.062629,-0.213149,0.312797,-0.535088,-0.473283,-0.321902,-0.048376,-0.456760,...,-0.426253,-0.236591,-0.342968,-0.032137,-0.706778,-0.214001,-0.432033,-0.329544,0.078987,4cell
GSM1112544_4cell_2-2_expression.txt,-0.445424,-0.362601,-0.285073,-0.213149,-0.206647,-0.478962,-0.473283,-0.127703,-0.265070,-0.458710,...,-0.426253,-0.236591,-0.342968,-0.032137,-1.047353,-0.214001,0.752995,-0.329544,1.534359,4cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H5_old,0.422775,-0.320703,-0.285073,-0.213149,-0.225198,-0.535874,-0.473283,-0.459249,-0.291035,-0.459581,...,-0.426253,-0.236591,-0.342968,-0.032137,-1.108760,-0.214001,-0.429125,-0.329544,5.724404,HSC
H6_old,-0.445424,-0.384890,-0.285073,-0.213149,-0.225198,-0.535874,-0.473283,-0.459249,-0.212185,-0.459581,...,-0.426253,-0.236591,-0.342968,-0.032137,-1.078807,-0.214001,-0.433810,-0.329544,-0.345766,HSC
H7_old,-0.445424,-0.365886,-0.285073,-0.209718,-0.225198,-0.530527,-0.416265,-0.442986,-0.293903,-0.459581,...,-0.426253,-0.236591,-0.342968,-0.032137,-0.845498,-0.214001,-0.433810,-0.329544,-0.345766,HSC
H8_old,-0.445424,-0.370000,-0.276631,1.508655,-0.225198,-0.535874,-0.473283,-0.459249,-0.265962,-0.459581,...,-0.422925,-0.233926,-0.342968,-0.031969,-1.032894,-0.214001,-0.433810,-0.325642,-0.333575,HSC


In [None]:
df_retrieval.to_pickle(os.path.join(loc_output, 'mouse_retrieval_sw_gw_cv.pck') )
print('Experiment datasets are exported into ', os.path.join(loc_output, 'mouse_retrieval_sw_gw_cv.pck' ))