In [1]:
#!/usr/bin/env python

'''
DESCRIPTION
-----------
    Export the raw experiment datasets into pandas pickle (.pck) files

RETURN
------
    {DATASET}.pck : pck file
        pck version of file

EXPORTED FILE(s) LOCATION
-------------------------
    ./data/external/{EXPERIMENT}/{DATASET}.pck
'''

# importing default libraries
import os, argparse, sys
# The literal string '__file__' (not the variable) resolves against the current
# working directory, so ROOT_DIR ends up being the parent of the directory the
# kernel was started in. Because the cell then changes the working directory,
# the root is computed only once per kernel session; otherwise every re-run of
# this cell would climb one directory further up.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
os.chdir(ROOT_DIR)
# avoid stacking duplicate entries on sys.path when the cell is re-run
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)
# importing scripts in scripts folder
from scripts import settings as srp
# importing third-party libraries
import pandas as pd
import numpy as np
import pyreadr # imported to read .rds files
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas warnings that may point at real problems in the cells below.
warnings.filterwarnings('ignore')

scripts/settings.py - PATHS IMPORTED!!!


# MELANOMA

In [2]:
# data sub-folder and experiment whose raw files are converted below
location = 'external'
experiment = 'exper_melanoma'
# folder that receives the exported .pck files
loc_output = os.path.join(srp.DIR_DATA, location, experiment)
# helper from scripts/settings.py; presumably creates the folder when it is
# missing - TODO confirm. Its return value is shown as the cell output.
srp.define_folder(loc_=loc_output)

'./data/external/exper_melanoma/'

### reference

In [3]:
# raw file of the melanoma reference set
dataset = 'reference.rds'
# read_r returns a dict-like of data frames; the result is looked up under the key None
df_raw = pyreadr.read_r(os.path.join(srp.DIR_DATA, location, experiment, dataset))[None]
# replace whatever row index came with the file by a plain 0..n-1 range
df_raw = df_raw.reset_index(drop=True)
print(df_raw.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that only adds a stray 'None' line to the output)
df_raw.info()
df_raw

(2761, 17995)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2761 entries, 0 to 2760
Columns: 17995 entries, A1BG to label
dtypes: float64(17994), object(1)
memory usage: 379.1+ MB
None


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,label
0,5.54,0.00,0.00,5.33,0.0,0.0,3.76,0.00,0.0,0.0,...,0.0,0.00,7.30,29.75,16.41,8.88,82.66,10.21,24.27,B.cell
1,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,15.88,16.71,13.40,0.00,0.00,0.00,B.cell
2,81.18,0.00,0.00,1.31,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,1.52,9.21,4.32,0.00,0.00,0.00,B.cell
3,0.00,2.20,4.09,15.15,0.0,0.0,0.00,2.88,0.0,0.0,...,0.0,16.43,8.41,62.27,139.67,73.72,20.85,12.43,66.03,B.cell
4,0.00,0.31,0.00,1.85,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,5.55,21.96,2.28,0.00,109.35,0.00,B.cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,20.88,15.85,3.98,0.00,0.00,0.00,T.CD8
2757,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,0.17,5.77,1.18,0.00,0.00,0.00,T.CD8
2758,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,2.09,4.14,4.95,0.00,0.00,0.00,T.CD8
2759,0.00,0.00,0.00,1.06,0.0,0.0,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,4.37,6.24,7.48,229.28,0.00,0.00,T.CD8


In [4]:
# the pickle keeps the dataset's base name (text before the first '.')
path_pck = os.path.join(loc_output, dataset.split('.')[0] + '.pck')
df_raw.to_pickle(path_pck)
print('Experiment datasets are exported into ', path_pck)

Experiment datasets are exported into  ./data/external/exper_melanoma/reference.pck


### query

In [5]:
# raw file of the melanoma query set
dataset = 'query.rds'
# read_r returns a dict-like of data frames; the result is looked up under the key None
df_raw = pyreadr.read_r(os.path.join(srp.DIR_DATA, location, experiment, dataset))[None]
# replace whatever row index came with the file by a plain 0..n-1 range
df_raw = df_raw.reset_index(drop=True)
print(df_raw.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that only adds a stray 'None' line to the output)
df_raw.info()
df_raw

(3412, 17995)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3412 entries, 0 to 3411
Columns: 17995 entries, A1BG to label
dtypes: float64(17994), object(1)
memory usage: 468.4+ MB
None


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,label
0,25.9,0.00,0.00,0.00,0.0,0.0,72.09,8.11,0.0,0.0,...,98.39,0.0,15.77,16.93,4.48,0.55,27.48,0.0,1.03,Neg.cell
1,0.0,0.00,2658.80,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.00,Neg.cell
2,0.0,0.00,543.66,0.00,0.0,0.0,196.69,0.00,0.0,0.0,...,0.00,0.0,0.00,6.90,15.47,2.89,0.00,0.0,0.00,Neg.cell
3,0.0,0.00,1084.07,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,88.61,0.0,0.00,13.90,0.00,53.57,0.00,0.0,32.62,Neg.cell
4,41.5,0.00,0.00,3.06,0.0,0.0,2.48,0.00,0.0,0.0,...,0.00,0.0,0.00,0.71,3.43,2.60,0.00,0.0,10.67,Neg.cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3407,0.0,2.01,0.00,4.58,0.0,0.0,0.00,0.00,0.0,0.0,...,0.00,0.0,0.00,7.70,22.90,14.07,0.00,0.0,0.00,T.CD8
3408,0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.00,0.0,0.00,76.55,7.58,12.39,0.00,0.0,0.00,T.CD8
3409,0.0,0.00,0.00,0.00,0.0,0.0,389.42,0.00,0.0,0.0,...,1042.17,0.0,0.00,0.00,0.00,0.00,269.00,0.0,0.00,T.CD8
3410,0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.0,...,0.00,0.0,0.00,11.26,2.59,18.66,56.50,0.0,0.00,T.CD8


In [6]:
# the pickle keeps the dataset's base name (text before the first '.')
path_pck = os.path.join(loc_output, dataset.split('.')[0] + '.pck')
df_raw.to_pickle(path_pck)
print('Experiment datasets are exported into ', path_pck)

Experiment datasets are exported into  ./data/external/exper_melanoma/query.pck


# MOUSE

In [2]:
# data sub-folder and experiment whose raw files are converted below
location = 'external'
experiment = 'exper_mouse'
# folder that receives the exported .pck files
loc_output = os.path.join(srp.DIR_DATA, location, experiment)
# helper from scripts/settings.py; presumably creates the folder when it is
# missing - TODO confirm. Its return value is shown as the cell output.
srp.define_folder(loc_=loc_output)

'./data/external/exper_mouse/'

### learning

In [3]:
# Read the tab-separated table with its first column as the index, transpose it
# so the former index entries become the columns, and drop the 'Weight' column
df_raw = pd.read_csv(os.path.join(srp.DIR_DATA, location, experiment, '1-3_integrated_NNtraining.txt')
                             , sep='\t'
                             , index_col=0).T.drop(columns='Weight')
# Reordering dataset, cell type goes to the last column
# (the first column is moved behind all the others)
columns_all = df_raw.columns.to_list()
df_raw = df_raw[columns_all[1:] + columns_all[:1]]
df_raw = df_raw.reset_index(drop=True)
print(df_raw.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that only adds a stray 'None' line to the output)
df_raw.info()
# NOTE(review): info() reports every column as object dtype, unlike the
# melanoma/PBMC frames (float64 + one label column). Consider casting the
# feature columns to numeric before exporting - confirm with downstream users.
df_raw

(402, 9438)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Columns: 9438 entries, pycrl to Label
dtypes: object(9438)
memory usage: 28.9+ MB
None


Sample,pycrl,gpr180,gpr182,gpr183,neurl2,neurl4,mfhas1,vps53,vps52,lamc1,...,avpr1b,lcn6,cchcr1,lcn2,rps21,gpx2,bcr,scrt1,adck4,Label
0,59.88926581,0.0,0.0,0.190303945,0.0,0.0,22.928124,37.38295246,2.299521764,0.0,...,0,0,0,63.8601,136.315,0,0,0,0,BMDC
1,0.0,0.0,0.0,0.04988755,0.0,0.0,0.0,9.073019116,0.0,0.018777023,...,0,0,0,874.017,5.63192,0,0,0,0,BMDC
2,62.44607307,0.0,0.0,0.140584482,0.0,0.0,6.438339344,0.192870387,0.0,0.0,...,0,0,0,0.537095,34.0609,0,0,0,0,BMDC
3,0.197837433,0.0,0.0,0.043340693,0.0,0.0,0.0,0.105061601,0.0,0.0,...,0,0,0,0.331656,22.3573,0,0,0,0.109427,BMDC
4,51.44995646,0.0,0.0,0.0353156,0.0,0.0,0.041080904,0.289719541,0.0,0.0,...,0,0,0,0,42.7051,0,0,0,0.089052,BMDC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,42.2231879027,0.517287444371,0.0,0.0,0.0,21.4891494129,0.0,2.53588460005,22.366299781,29.0983805433,...,0,0,0.355592,0,1957.8,0,0.256046,0,0,midblast
398,57.7754969977,4.11364443932,0.0,0.0,2.55353847465,5.28117353207,0.0,15.7028774526,4.92583358489,16.31496068,...,0,0.0917621,1.69562,0,1746.84,0.0792212,6.11135,0,0.209084,lateblast
399,51.0122780928,26.6770427424,0.0,0.0,6.55671776052,13.0176070937,0.0,42.8203984673,39.5951369592,9.09963949569,...,0,0,0,0,1380.53,0,1.61826,0,0.104436,midblast
400,19.4702519286,0.0676517381748,0.0,0.0,0.469816026076,106.615886105,0.0,1.63872933101,2.84181304378,42.3005135154,...,0.277858,0,0,0,1546.28,0,0.439507,0,0.263785,midblast


In [5]:
# fixed output name for the mouse learning set
path_pck = os.path.join(loc_output, 'mouse_learning.pck')
df_raw.to_pickle(path_pck)
print('Experiment datasets are exported into ', path_pck)

Experiment datasets are exported into  ./data/external/exper_mouse/mouse_learning.pck


### retrieval

In [7]:
# Read the tab-separated table and transpose it; after the transpose the first
# row holds the values that are used as column names below
df_raw = pd.read_csv(os.path.join(srp.DIR_DATA, location, experiment, '3-33_integrated_retrieval_set.txt'), sep='\t').T
# Create a new variable called 'header' from the first row of the dataset
header = df_raw.iloc[0]
# Rename the dataframe's column values with the header variable
df_raw.columns = list(header.values)
# Drop the header row and the 'Dataset' column in one non-inplace step.
# An inplace drop on the slice df_raw[1:] mutates a possible copy of the frame
# (chained-assignment hazard that the global warnings filter would hide).
df_raw = df_raw.iloc[1:].drop(columns=['Dataset'])
# Reordering dataset, cell type goes to the last column
# (the first column is moved behind all the others)
columns_all = df_raw.columns.to_list()
df_raw = df_raw[columns_all[1:] + columns_all[:1]]
df_raw = df_raw.reset_index(drop=True)
print(df_raw.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that only adds a stray 'None' line to the output)
df_raw.info()
# NOTE(review): info() reports every column as object dtype (1.2+ GB in
# memory). Consider casting the feature columns to numeric before exporting -
# confirm with downstream users.
df_raw

(17293, 9438)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17293 entries, 0 to 17292
Columns: 9438 entries, pycrl to Label
dtypes: object(9438)
memory usage: 1.2+ GB
None


Unnamed: 0,pycrl,gpr180,gpr182,gpr183,neurl2,neurl4,mfhas1,vps53,vps52,lamc1,...,avpr1b,lcn6,cchcr1,lcn2,rps21,gpx2,bcr,scrt1,adck4,Label
0,6.96576497623e-05,3.86496885109e-07,0.0,0.0,0.0,1.15407629203e-05,0.0,4.62904307281e-05,2.08147409876e-07,4.78353343245e-08,...,0,0,0,0,0.00176984,0,1.51492e-05,0,7.53505e-07,16cell_
1,0.000206320574247,0.0,0.0,0.0,3.21006322407e-06,1.32053578816e-05,0.0,2.5120972157e-06,5.31570882234e-05,8.475806119e-06,...,0,0,0,0,0.00193656,0,0,0,1.84855e-07,16cell_
2,0.000112764237663,0.0,0.0,0.0,0.0,3.65637850138e-06,0.0,7.20934621684e-08,8.76296278437e-05,4.50871068288e-07,...,0,0,0,0,0.00297763,0,3.09367e-08,0,9.58094e-05,16cell_
3,0.0,2.58595326769e-06,0.0,0.0,0.0,3.82225715459e-06,0.0,8.87392718089e-05,0.0,3.42915006321e-08,...,0,0,0,0,0.00213202,0,0,0,6.686e-05,16cell_
4,0.000105893856135,0.0,0.0,0.0,0.0,0.0,0.0,5.70997382553e-05,1.75944104717e-07,1.83978702335e-06,...,0,0,6.23004e-06,0,0.00303025,0,0,0,9.66723e-05,16cell_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17288,0.00015093271786,2.98475183084e-06,0.0,0.0,0.0,0.0,0.0,0.0,4.99665436074e-07,0.0,...,0,0,0,0,0.000490273,0,1.09511e-07,0,0.000294009,HSC_old
17289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.42375411212e-05,0.0,...,0,0,0,0,0.000567981,0,0,0,0,HSC_old
17290,0.0,8.83679739473e-07,0.0,2.87346201e-07,0.0,3.55301578849e-07,2.3874461804e-06,2.00624855951e-06,0.0,0.0,...,0,0,0,0,0.00117327,0,0,0,0,HSC_old
17291,0.0,6.92532645188e-07,1.57051224911e-07,0.000144234763345,0.0,0.0,0.0,0.0,4.86923373271e-06,0.0,...,1.57051e-07,1.57051e-07,0,1.57051e-07,0.000687253,0,0,1.57051e-07,5.90624e-07,HSC_old


In [None]:
# fixed output name for the mouse retrieval set
path_pck = os.path.join(loc_output, 'mouse_retrieval.pck')
df_raw.to_pickle(path_pck)
print('Experiment datasets are exported into ', path_pck)

# PBMC

In [12]:
# data sub-folder and experiment whose raw files are converted below
location = 'external'
experiment = 'exper_pbmc'
# folder that receives the exported .pck files
loc_output = os.path.join(srp.DIR_DATA, location, experiment)
# helper from scripts/settings.py; presumably creates the folder when it is
# missing - TODO confirm. Its return value is shown as the cell output.
srp.define_folder(loc_=loc_output)

'./data/external/exper_pbmc/'

### Dataset from GitHub

In [13]:
# raw (gzip-compressed) .rds file of the PBMC set
dataset = 'Immune.20210314.rds.gz'
# read_r returns a dict-like of data frames; the result is looked up under the key None
df_raw = pyreadr.read_r(os.path.join(srp.DIR_DATA, location, experiment, dataset))[None]
# replace whatever row index came with the file by a plain 0..n-1 range
df_raw = df_raw.reset_index(drop=True)
print(df_raw.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that only adds a stray 'None' line to the output)
df_raw.info()
df_raw

(17500, 15701)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17500 entries, 0 to 17499
Columns: 15701 entries, SAMD11 to label
dtypes: float64(15700), object(1)
memory usage: 2.0+ GB
None


Unnamed: 0,SAMD11,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,C1orf159,TTLL10,TNFRSF18,...,SPATC1L,LSS,MCM3AP,YBEY,C21orf58,PCNT,DIP2A,S100B,PRMT2,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CD14
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CD14
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,11.655012,0.0,0.0,0.0,0.0,0.0,0.0,CD14
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CD14
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.250694,0.0,0.0,0.0,0.0,0.0,0.0,CD14


In [14]:
# the pickle keeps only the text before the first '.' of the dataset name,
# i.e. 'Immune' for 'Immune.20210314.rds.gz'
path_pck = os.path.join(loc_output, dataset.split('.')[0] + '.pck')
df_raw.to_pickle(path_pck)
print('Experiment datasets are exported into ', path_pck)

Experiment datasets are exported into  ./data/external/exper_pbmc/Immune.pck


### Data with Magic applied

In [15]:
# tab-separated PBMC table after Magic processing
# NOTE(review): the file name ends in .tar.gz; read_csv is left to infer the
# compression from the extension - confirm this works on the pinned pandas version
dataset = 'Immune.20210314.magic.tsv.tar.gz'
df_raw = pd.read_csv(os.path.join(srp.DIR_DATA, location, experiment, dataset), sep='\t')
df_raw = df_raw.reset_index(drop=True)
print(df_raw.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that only adds a stray 'None' line to the output)
df_raw.info()
df_raw

(17500, 13521)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17500 entries, 0 to 17499
Columns: 13521 entries, SAMD11 to label
dtypes: float64(13520), object(1)
memory usage: 1.8+ GB
None


Unnamed: 0,SAMD11,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,C1orf159,TTLL10,TNFRSF18,...,SPATC1L,LSS,MCM3AP,YBEY,C21orf58,PCNT,DIP2A,S100B,PRMT2,label
0,0.001102,0.151281,0.003892,2.1e-05,0.110581,0.763474,0.002906,0.004773,0.0,0.0,...,0.138263,0.007843,0.099817,0.059949,0.002743,0.011917,0.071379,0.006207,0.2695,CD14
1,0.003482,0.090237,0.009713,0.002148,0.171235,0.782739,0.008832,0.003012,0.0,0.0,...,0.069647,0.012341,0.113625,0.052511,0.0048,0.024374,0.023939,0.010964,0.248543,CD14
2,0.004099,0.07833,0.007824,0.002586,0.155797,0.74669,0.005897,0.002826,0.0,0.0,...,0.061309,0.011119,0.166501,0.03843,0.001888,0.024416,0.026413,0.010072,0.227426,CD14
3,0.0,0.360424,0.001101,0.0,0.073736,0.825764,0.0,0.000986,0.0,0.000209,...,0.158655,0.0002,0.134345,0.094934,0.0,0.061235,0.023584,0.0733,0.454011,CD14
4,0.000543,0.146004,0.003606,0.0,0.087482,0.877019,0.000383,0.005977,0.0,0.0,...,0.147623,0.007613,0.187692,0.09155,0.00096,0.009801,0.083244,0.004845,0.230089,CD14


In [16]:
# Export the Magic-processed frame; the file name is the text before the first
# '.' of the dataset name plus a '_magic' suffix
path_pck = os.path.join(loc_output, dataset.split('.')[0] + '_magic.pck')
df_raw.to_pickle(path_pck)
print('Experiment datasets are exported into ', path_pck)

Experiment datasets are exported into  ./data/external/exper_pbmc/Immune_magic.pck
