In [1]:
#!/usr/bin/env python

'''
DESCRIPTION
-----------
    Creating and exporting additional dataset from raw version
    
RETURN
------
    {DATASET}.pck : pck file
        pck version of file

EXPORTED FILE(s) LOCATION
-------------------------
    ./data/external/{EXPERIMENT}/{DATASET}.pck
'''

# importing default libraries
import os, argparse, sys
# sys.path.append('./')
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
os.chdir(ROOT_DIR)
sys.path.append(ROOT_DIR)
# importing scripts in scripts folder
from scripts import settings as srp
# importing default libraries
import pandas as pd
import numpy as np
import pyreadr # imported to read .rds files
import warnings
warnings.filterwarnings('ignore')

scripts/settings.py - PATHS IMPORTED!!!


# MELANOMA
#### Creating the merge dataset from query and reference without Neg.cell samples. This created dataset will use in cross-validation benchmark analysis.

In [2]:
# experiment source
experiment = 'exper_melanoma'
load_location = 'external'
save_location = 'processed'
# the output location
loc_output = os.path.join(srp.DIR_DATA, save_location, experiment)
srp.define_folder(loc_=loc_output)

'./data/processed/exper_melanoma/'

In [3]:
df_query = pd.read_pickle(os.path.join(srp.DIR_DATA, load_location, experiment, 'query.pck'))
print(df_query.shape)
print(df_query.info())
df_query.head()

(3412, 17995)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3412 entries, 0 to 3411
Columns: 17995 entries, A1BG to label
dtypes: float64(17994), object(1)
memory usage: 468.4+ MB
None


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,label
0,25.9,0.0,0.0,0.0,0.0,0.0,72.09,8.11,0.0,0.0,...,98.39,0.0,15.77,16.93,4.48,0.55,27.48,0.0,1.03,Neg.cell
1,0.0,0.0,2658.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neg.cell
2,0.0,0.0,543.66,0.0,0.0,0.0,196.69,0.0,0.0,0.0,...,0.0,0.0,0.0,6.9,15.47,2.89,0.0,0.0,0.0,Neg.cell
3,0.0,0.0,1084.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,88.61,0.0,0.0,13.9,0.0,53.57,0.0,0.0,32.62,Neg.cell
4,41.5,0.0,0.0,3.06,0.0,0.0,2.48,0.0,0.0,0.0,...,0.0,0.0,0.0,0.71,3.43,2.6,0.0,0.0,10.67,Neg.cell


In [4]:
df_reference = pd.read_pickle(os.path.join(srp.DIR_DATA, load_location, experiment, 'reference.pck'))
print(df_reference.shape)
print(df_reference.info())
df_reference.head()

(2761, 17995)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2761 entries, 0 to 2760
Columns: 17995 entries, A1BG to label
dtypes: float64(17994), object(1)
memory usage: 379.1+ MB
None


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,label
0,5.54,0.0,0.0,5.33,0.0,0.0,3.76,0.0,0.0,0.0,...,0.0,0.0,7.3,29.75,16.41,8.88,82.66,10.21,24.27,B.cell
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,15.88,16.71,13.4,0.0,0.0,0.0,B.cell
2,81.18,0.0,0.0,1.31,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.52,9.21,4.32,0.0,0.0,0.0,B.cell
3,0.0,2.2,4.09,15.15,0.0,0.0,0.0,2.88,0.0,0.0,...,0.0,16.43,8.41,62.27,139.67,73.72,20.85,12.43,66.03,B.cell
4,0.0,0.31,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.55,21.96,2.28,0.0,109.35,0.0,B.cell


In [5]:
df_merged = pd.concat([df_query, df_reference], axis=0)
print(df_merged.shape)
print(df_merged.info())
df_merged.head()

(6173, 17995)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6173 entries, 0 to 2760
Columns: 17995 entries, A1BG to label
dtypes: float64(17994), object(1)
memory usage: 847.5+ MB
None


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,label
0,25.9,0.0,0.0,0.0,0.0,0.0,72.09,8.11,0.0,0.0,...,98.39,0.0,15.77,16.93,4.48,0.55,27.48,0.0,1.03,Neg.cell
1,0.0,0.0,2658.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neg.cell
2,0.0,0.0,543.66,0.0,0.0,0.0,196.69,0.0,0.0,0.0,...,0.0,0.0,0.0,6.9,15.47,2.89,0.0,0.0,0.0,Neg.cell
3,0.0,0.0,1084.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,88.61,0.0,0.0,13.9,0.0,53.57,0.0,0.0,32.62,Neg.cell
4,41.5,0.0,0.0,3.06,0.0,0.0,2.48,0.0,0.0,0.0,...,0.0,0.0,0.0,0.71,3.43,2.6,0.0,0.0,10.67,Neg.cell


In [6]:
## For cross-validation benchmark analysis, the merged dataset without Neg.cell samples will use.
df_merged = df_merged[df_merged['label']!='Neg.cell'].reset_index(drop=True)

In [7]:
df_merged['label'].value_counts()

T.CD8         1759
T.CD4          856
B.cell         818
Macrophage     420
NK              92
Name: label, dtype: int64

In [8]:
df_merged.to_pickle(os.path.join(loc_output, 'query_reference_wo_negcell.pck') )
print('Experiment datasets are exported into ', os.path.join(loc_output, 'query_reference_wo_negcell.pck' ))

Experiment datasets are exported into  ./data/processed/exper_melanoma/query_reference_wo_negcell.pck


# MOUSE
#### Selecting specific cell types in retrieval dataset. This created dataset will use in cross-validation benchmark analysis.

In [9]:
# experiment source
experiment = 'exper_mouse'
load_location = 'external'
save_location = 'processed'
# the output location
loc_output = os.path.join(srp.DIR_DATA, save_location, experiment)
srp.define_folder(loc_=loc_output)

'./data/processed/exper_mouse/'

In [10]:
df_retrieval = pd.read_pickle(os.path.join(srp.DIR_DATA, load_location, experiment, 'mouse_retrieval.pck'))
print(df_retrieval.shape)
print(df_retrieval.info())
df_retrieval.head()

(17293, 9438)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17293 entries, 0 to 17292
Columns: 9438 entries, pycrl to Label
dtypes: object(9438)
memory usage: 1.2+ GB
None


Unnamed: 0,pycrl,gpr180,gpr182,gpr183,neurl2,neurl4,mfhas1,vps53,vps52,lamc1,...,avpr1b,lcn6,cchcr1,lcn2,rps21,gpx2,bcr,scrt1,adck4,Label
0,6.96576497623e-05,3.86496885109e-07,0.0,0.0,0.0,1.15407629203e-05,0.0,4.62904307281e-05,2.08147409876e-07,4.78353343245e-08,...,0,0,0.0,0,0.00176984,0,1.51492e-05,0,7.53505e-07,16cell_
1,0.000206320574247,0.0,0.0,0.0,3.21006322407e-06,1.32053578816e-05,0.0,2.5120972157e-06,5.31570882234e-05,8.475806119e-06,...,0,0,0.0,0,0.00193656,0,0.0,0,1.84855e-07,16cell_
2,0.000112764237663,0.0,0.0,0.0,0.0,3.65637850138e-06,0.0,7.20934621684e-08,8.76296278437e-05,4.50871068288e-07,...,0,0,0.0,0,0.00297763,0,3.09367e-08,0,9.58094e-05,16cell_
3,0.0,2.58595326769e-06,0.0,0.0,0.0,3.82225715459e-06,0.0,8.87392718089e-05,0.0,3.42915006321e-08,...,0,0,0.0,0,0.00213202,0,0.0,0,6.686e-05,16cell_
4,0.000105893856135,0.0,0.0,0.0,0.0,0.0,0.0,5.70997382553e-05,1.75944104717e-07,1.83978702335e-06,...,0,0,6.23004e-06,0,0.00303025,0,0.0,0,9.66723e-05,16cell_


In [14]:
df_retrieval['cell_type'] = df_retrieval['Label'].str.split('_', expand=True)[0]

df_retrieval.loc[df_retrieval.index.isin(np.where(df_retrieval['cell_type'].isin(['E12', 'E14', 'adult']))[0]), 'cell_type'] = 'HSC'
df_retrieval.loc[df_retrieval.index.isin(np.where(df_retrieval['cell_type'].isin(['celebral-cortex', 'CNS', 'brain']))[0]), 'cell_type'] = 'neuron'

df_retrieval = df_retrieval.loc[df_retrieval['cell_type'].isin(['HSC','4cell','ICM','spleen','8cell','neuron','zygote','2cell','ESC'])].reset_index(drop=True)
df_retrieval.drop(columns=['Label'], inplace=True)
print(df_retrieval.shape)
df_retrieval

(13645, 9438)


Unnamed: 0,pycrl,gpr180,gpr182,gpr183,neurl2,neurl4,mfhas1,vps53,vps52,lamc1,...,avpr1b,lcn6,cchcr1,lcn2,rps21,gpx2,bcr,scrt1,adck4,cell_type
0,0.0,4.00261916837e-06,0.0,0.0,0.0,1.92322344185e-07,0.0,4.09920974014e-06,0.0,1.28946810954e-06,...,0,0,0,0,0.000743058,0,3.97698e-06,0,6.28097e-05,4cell
1,6.27466992731e-05,4.1753179017e-06,0.0,0.0,1.05058227227e-06,1.74647143989e-06,0.0,3.67056256664e-05,4.75785491377e-06,2.24006605387e-05,...,0,0,0,0,0.000758537,0,3.10829e-05,0,3.09875e-05,4cell
2,0.0,6.05644550637e-07,0.0,0.0,1.29684472917e-05,1.09936474852e-07,0.0,1.0187892409e-05,0.0,7.12107321589e-07,...,0,0,6.47626e-07,0,0.00253347,0,1.59134e-05,0,4.5131e-05,4cell
3,2.03010829504e-07,0.0,6.46725821662e-06,0.0,3.13121091724e-05,5.22408245094e-08,0.0,1.69441820965e-05,4.2777846865e-05,2.49337794638e-07,...,0,0,0,0,0.00153317,0,4.1549e-08,0,2.05731e-05,4cell
4,0.0,1.03645308164e-06,0.0,0.0,1.07966952671e-06,3.78155085148e-06,0.0,4.0901989502e-05,5.02352050374e-06,7.69671067725e-08,...,0,0,0,0,0.000649586,0,2.77418e-05,0,9.10641e-05,4cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13640,0.00015093271786,2.98475183084e-06,0.0,0.0,0.0,0.0,0.0,0.0,4.99665436074e-07,0.0,...,0,0,0,0,0.000490273,0,1.09511e-07,0,0.000294009,HSC
13641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.42375411212e-05,0.0,...,0,0,0,0,0.000567981,0,0,0,0,HSC
13642,0.0,8.83679739473e-07,0.0,2.87346201e-07,0.0,3.55301578849e-07,2.3874461804e-06,2.00624855951e-06,0.0,0.0,...,0,0,0,0,0.00117327,0,0,0,0,HSC
13643,0.0,6.92532645188e-07,1.57051224911e-07,0.000144234763345,0.0,0.0,0.0,0.0,4.86923373271e-06,0.0,...,1.57051e-07,1.57051e-07,0,1.57051e-07,0.000687253,0,0,1.57051e-07,5.90624e-07,HSC


In [15]:
df_retrieval.to_pickle(os.path.join(loc_output, 'mouse_retrieval_cv.pck') )
print('Experiment datasets are exported into ', os.path.join(loc_output, 'mouse_retrieval_cv.pck' ))

Experiment datasets are exported into  ./data/processed/exper_mouse/mouse_retrieval_cv.pck
