# Organize freezed data
Aug 21th, 2019
Fangming Xie

- use reformatted data as input
- extract QC'ed cells only
- make metadata and data consistent
- unique cell name for each dataset: dataset + '_' + cellname (10x_nuclei_v3)

In [1]:
import sys
sys.path.insert(0, '/cndd/fangming/CEMBA/snmcseq_dev')
import importlib

from __init__ import *
from __init__jupyterlab import *
import snmcseq_utils
importlib.reload(snmcseq_utils)
import CEMBA_clst_utils
from scipy.io import mmread
from scipy import sparse
import re
import time


In [2]:
def abbr_cell_names(long_name):
    """Turn a long name: "$barcode-$number$substring" into "$barcode-$number"
    """
    barcode, suffix = long_name.split('-')
    match = re.match(r'^\d+', suffix)
    suffix_short = suffix[match.start():match.end()]
    short_name = '{}-{}'.format(barcode, suffix_short)
    return short_name

In [3]:
SRC_DIR = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted'
DST_DIR = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells'

In [4]:
f = PATH_GENEBODY_ANNOTATION 
df_genes = pd.read_csv(f, sep="\t")
df_genes['gene_id'] = df_genes['gene_id'].apply(lambda x: x.split('.')[0])
df_genes = df_genes.set_index('gene_id')
df_genes_v2 = df_genes.reset_index().groupby('gene_name').first()
print(df_genes.shape)
df_genes.head()
print(df_genes_v2.shape)
df_genes_v2.head()

(53379, 6)
(53278, 6)


Unnamed: 0_level_0,gene_id,chr,start,end,strand,gene_type
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610005C13Rik,ENSMUSG00000109644,chr7,45567794,45575327,-,antisense_RNA
0610006L08Rik,ENSMUSG00000108652,chr7,74818818,74853813,-,lincRNA
0610009B22Rik,ENSMUSG00000007777,chr11,51685386,51688874,-,protein_coding
0610009E02Rik,ENSMUSG00000086714,chr2,26445696,26459390,+,processed_transcript
0610009L18Rik,ENSMUSG00000043644,chr11,120348678,120351190,+,bidirectional_promoter_lncRNA


### 10x_nuclei_v3_macosko 
- 2 metadata .csv
- 1 data matrix in .rds format
- concatenate male and female, get autosomo genes only

In [11]:
dataset_name = '10x_nuclei_v3_macosko'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_nuclei_v3_macosko_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_nuclei_v3_macosko_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_nuclei_v3_macosko_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_nuclei_v3_macosko_raw.npz


In [12]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.shape)
meta.head()

(159738, 18)


Unnamed: 0_level_0,cluster_id,QC,cluster_label,cluster_color,class_label,subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,nUMI,nGene,sex
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
pBICCNsMMrMOpRMiM003d190318_AAACCCACACAAGCTT,90,1,Oligo Opalin_4,#474662,Non-Neuronal,Oligo,16566,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150,4859,2336,MALE
pBICCNsMMrMOpRMiM003d190318_AAAGGATCAGACCGCT,90,19,Oligo Opalin_4,#474662,Non-Neuronal,Oligo,16566,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150,3172,1739,MALE
pBICCNsMMrMOpRMiM003d190318_AAAGGATCATGGAACG,90,20,Oligo Opalin_4,#474662,Non-Neuronal,Oligo,16566,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150,6306,2760,MALE
pBICCNsMMrMOpRMiM003d190318_AAAGGTAGTAATACCC,90,37,Oligo Opalin_4,#474662,Non-Neuronal,Oligo,16566,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150,3310,1806,MALE
pBICCNsMMrMOpRMiM003d190318_AAAGTCCCAAGGTCAG,90,39,Oligo Opalin_4,#474662,Non-Neuronal,Oligo,16566,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150,2843,1543,MALE


In [13]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(24809, 215823) (24809,) (215823,)


In [14]:
# select cells

selected_cells = meta.index.values
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells)
selected_cells = dataset_name + '_' + selected_cells

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])


In [15]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

560.1616470813751


In [16]:
# assemble metadata (keep order)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### 10x_nuclei_v3

- gene_ids
- all cells passed QC (QC.csv) metadata

- all cells data
- output

In [5]:
dataset_name = '10x_nuclei_v3'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_nuclei_v3_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_nuclei_v3_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_nuclei_v3_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_nuclei_v3_raw.npz


In [6]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.shape)
meta.head()

(40166, 7)


Unnamed: 0_level_0,cluster_id,QC,cluster_label,subclass_label,class_label,cluster_color,size
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACCCAAGCTCTTCC-1L8TX_181211_01_A02,59,1,L6b Shisa6,L6b,Glutamatergic,#2B9880,247
AAAGGGCTCGCGATCG-1L8TX_181211_01_A02,59,47,L6b Shisa6,L6b,Glutamatergic,#2B9880,247
AAAGTGACATCGCTGG-1L8TX_181211_01_A02,59,80,L6b Shisa6,L6b,Glutamatergic,#2B9880,247
AACAACCGTGCTCTCT-1L8TX_181211_01_A02,59,108,L6b Shisa6,L6b,Glutamatergic,#2B9880,247
AACAAGAAGCCTGCCA-1L8TX_181211_01_A02,59,112,L6b Shisa6,L6b,Glutamatergic,#2B9880,247


In [7]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(31053, 90266) (31053,) (90266,)


In [8]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([abbr_cell_names(cell) for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells

# check if the reduced names are unique
assert len(np.unique(selected_cells) == len(np.unique(selected_cells_abbr)))
# check if new names mapped to short ones
assert np.all(selected_cells_id != -1)

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])
print(gc_mat_update.data.shape, gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(31053, 40166) (31053,) (40166,)


In [9]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

135.43096733093262


In [10]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### 10x_cells_v3 

In [11]:
dataset_name = '10x_cells_v3'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_cells_v3_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_cells_v3_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_cells_v3_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_cells_v3_raw.npz


In [12]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.shape)
meta.head()

(71183, 7)


Unnamed: 0_level_0,cluster_id,QC,cluster_label,subclass_label,class_label,cluster_color,size
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACCCAAGCTTCATG-1L8TX_181211_01_G12,42,1,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334
AAACCCACACCAGCCA-1L8TX_181211_01_G12,42,3,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334
AAACGAACAACGATTC-1L8TX_181211_01_G12,42,8,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334
AAACGAATCTCGTGAA-1L8TX_181211_01_G12,42,11,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334
AAACGCTGTAGTCACT-1L8TX_181211_01_G12,42,14,L5 IT Tcap_2,L5 IT,Glutamatergic,#52CA74,17334


In [13]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(31053, 176584) (31053,) (176584,)


In [14]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([abbr_cell_names(cell) for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells

# check if the reduced names are unique
assert len(np.unique(selected_cells) == len(np.unique(selected_cells_abbr)))
# check if new names mapped to short ones
assert np.all(selected_cells_id != -1)

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])
print(gc_mat_update.data.shape, gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(31053, 71183) (31053,) (71183,)


In [15]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

534.2718126773834


In [16]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### 10x_cells_v2 

In [17]:
dataset_name = '10x_cells_v2'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_cells_v2_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_cells_v2_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/10x_cells_v2_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/10x_cells_v2_raw.npz


In [18]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.shape)
meta.head()

(122641, 7)


Unnamed: 0_level_0,cluster_id,QC,cluster_label,subclass_label,class_label,cluster_color,size
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACCTGAGGAGTCTG-1L8TX_171026_01_F03,9,1,Sncg Slc17a8,Sncg,GABAergic,#9440F3,293
ACCTTTAGTACAGCAG-1L8TX_171026_01_F03,9,673,Sncg Slc17a8,Sncg,GABAergic,#9440F3,293
ACGATACCACCCAGTG-1L8TX_171026_01_F03,9,724,Sncg Slc17a8,Sncg,GABAergic,#9440F3,293
ACGGGTCAGTGGGCTA-1L8TX_171026_01_F03,9,885,Sncg Slc17a8,Sncg,GABAergic,#9440F3,293
AGCAGCCCAGTTCATG-1L8TX_171026_01_F03,9,1205,Sncg Slc17a8,Sncg,GABAergic,#9440F3,293


In [19]:
a = np.sort(np.unique([cell.split('-')[1] 
              for cell in meta.index.values]))

b = np.sort(np.unique([cell.split('-')[1]
           for cell in gc_mat.cell]))
print(a, b)

['10L8TX_171026_01_A04' '11L8TX_171026_01_C05' '12L8TX_171026_01_D05'
 '13L8TX_171026_01_E05' '14L8TX_171026_01_F05' '15L8TX_171026_01_G05'
 '16L8TX_171026_01_H05' '1L8TX_171026_01_F03' '2L8TX_171026_01_G03'
 '3L8TX_171026_01_B04' '4L8TX_171026_01_F04' '5L8TX_171026_01_G04'
 '6L8TX_171026_01_B05' '7L8TX_171026_01_H04' '8L8TX_171026_01_A05'
 '9L8TX_171026_01_H03'] ['1' '10' '11' '12' '2' '3' '4' '5' '6' '7' '8' '9']


In [20]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(31053, 145748) (31053,) (145748,)


In [21]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([abbr_cell_names(cell) for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells

# check if the reduced names are unique
assert len(np.unique(selected_cells) == len(np.unique(selected_cells_abbr)))
# check if new names mapped to short ones
assert np.all(selected_cells_id != -1)

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])
print(gc_mat_update.data.shape, gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(31053, 122641) (31053,) (122641,)


In [22]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

511.95370864868164


In [23]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### Smart-cells

In [5]:
dataset_name = 'smarter_cells'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/smarter_cells_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/smarter_cells_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/smarter_cells_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/smarter_cells_raw.npz


In [6]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.shape)
meta.head()

(6288, 128)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,cluster_id,QC,cluster_label,subclass_label,class_label,cluster_color,size,ar_id,exp_component_vendor_name,batch,...,Injection_type,NewBatch,Vendor,MultiPlex,percent_reads_aligned_to_rrna_rmsk,percent_reads_aligned_to_trna_rmsk,percent_reads_aligned_to_ncrna,percent_reads_aligned_to_introns,percent_reads_aligned_to_intergenic,percent_reads_aligned_unique
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LS-15395_S41_E1-50,19,1,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136,543183476,LS-15395-41,R8S4-160812,...,0.0,HistoricalData,Covance,index96,,,15.16298,9.855384,3.264771,84.993663
LS-15395_S42_E1-50,19,2,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136,543183474,LS-15395-42,R8S4-160812,...,0.0,HistoricalData,Covance,index96,,,16.28162,6.763949,2.856108,87.252036
LS-15501_S28_E1-50,19,94,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136,544930823,LS-15501-28,R8S4-160817,...,0.0,HistoricalData,Covance,index96,,,17.066841,3.311346,2.437971,86.831969
LS-15501_S81_E1-50,19,120,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136,544930707,LS-15501-81,R8S4-160817,...,0.0,HistoricalData,Covance,index96,,,13.866006,11.769028,3.699682,85.25259
LS-15524_S21_E1-50,19,267,Sst Myh8_1,Sst,GABAergic,#FF7F2C,136,549999237,LS-15524-21,R8S4-160906,...,0.0,HistoricalData,Covance,index96,,,15.912637,5.923318,2.385434,82.503705


In [7]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(32324, 6300) (32324,) (6300,)


In [8]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([cell for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells
assert np.all(selected_cells_id != -1)

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])
print(gc_mat_update.data.shape, gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(32324, 6288) (32324,) (6288,)


In [9]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

116.17988157272339


In [10]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### Smarter-nuclei

In [4]:
dataset_name = 'smarter_nuclei'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/smarter_nuclei_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/smarter_nuclei_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/smarter_nuclei_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/smarter_nuclei_raw.npz


In [5]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.shape)
meta.head()

(6171, 145)


Unnamed: 0_level_0,cluster_id,QC,cluster_label,subclass_label,class_label,cluster_color,size,ar_id,exp_component_vendor_name,batch,...,pred.cl.2,pred.score.2,ref.cl.2,ref.cl.1.1,pred.cl.3,pred.score.3,cluster_label.2,category_label,Region.1,gene.counts
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SM-GE653_S113_E1-50,2,1,Lamp5 Egln3_2,Lamp5,GABAergic,#E69B73,87,647869537,SM-GE653-S113,R8S4-171101,...,14,0.917036,10,31,14,0.917036,Vip Htr1f,GABAergic,Cortex,7792
SM-GE653_S116_E1-50,2,4,Lamp5 Egln3_2,Lamp5,GABAergic,#E69B73,87,647869531,SM-GE653-S116,R8S4-171101,...,3,0.712671,2,7,3,0.712671,Lamp5 Pdlim5,GABAergic,Cortex,8502
SM-GE653_S117_E1-50,2,5,Lamp5 Egln3_2,Lamp5,GABAergic,#E69B73,87,647869529,SM-GE653-S117,R8S4-171101,...,12,0.89905,9,28,12,0.89905,Vip Chat_3,GABAergic,Cortex,7021
SM-GE653_S120_E1-50,2,8,Lamp5 Egln3_2,Lamp5,GABAergic,#E69B73,87,647869523,SM-GE653-S120,R8S4-171101,...,11,0.892925,9,28,11,0.892925,Vip Chat_2,GABAergic,Cortex,7085
SM-GE653_S122_E1-50,2,10,Lamp5 Egln3_2,Lamp5,GABAergic,#E69B73,87,647869519,SM-GE653-S122,R8S4-171101,...,13,0.822251,9,28,13,0.822251,Vip Lmo1,GABAergic,Cortex,7516


In [8]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(32324, 6278) (32324,) (6278,)


In [9]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([cell for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells
assert np.all(selected_cells_id != -1)

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])
print(gc_mat_update.data.shape, gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(32324, 6171) (32324,) (6171,)


In [10]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

48.12272572517395


In [11]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### mC

- QC'ed cells
- remove version on ensembl id

In [30]:
dataset_name = 'snmcseq_gene'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data_mc = os.path.join(SRC_DIR, dataset_name+'_mCH_raw.npz') 
f_data_c = os.path.join(SRC_DIR, dataset_name+'_CH_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(f_data_mc)
print(f_data_c)
print(f_data_gene)
print(f_data_cell)

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data_mc = os.path.join(DST_DIR, dataset_name+'_mCH_raw.npz') 
fout_data_c = os.path.join(DST_DIR, dataset_name+'_CH_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 


print(fout_meta)
print(fout_data_mc)
print(fout_data_c)
print(fout_data_gene)
print(fout_data_cell)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snmcseq_gene_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snmcseq_gene_mCH_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snmcseq_gene_CH_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snmcseq_gene_raw.gene
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snmcseq_gene_raw.cell
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snmcseq_gene_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snmcseq_gene_mCH_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snmcseq_gene_CH_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snmcseq_gene_raw.gene
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snmcseq_gene_raw.cell


In [31]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
# # meta = meta[meta['MajorCluster']!='Outlier']
# print(meta.groupby('MajorCluster').size())
print(meta.shape)
meta.head()

(9876, 31)


Unnamed: 0_level_0,FullCellID,CCC_Rate,CG_Rate,CG_RateAdj,CH_Rate,CH_RateAdj,FinalReads,InputReads,MappedReads,Region,...,Slice,PassFilter,pca_0,pca_1,umap_0,umap_1,tsne_0,tsne_1,MajorCluster,SubCluster
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2C_M_0,180508_CEMBA_mm_P56_P63_2C_CEMBA180409_2C_1_CE...,0.00511,0.77073,0.76955,0.02154,0.01651,1076259.0,2314862,1517338.0,2C,...,2,True,-11.588627,-0.005045,10.051389,7.099836,-3.343437,18.330174,L4-IT-Rorb,L4-IT-Rorb_Rorb-Tenm2
2C_M_1,180508_CEMBA_mm_P56_P63_2C_CEMBA180409_2C_1_CE...,0.00531,0.75683,0.75553,0.02196,0.01674,1197092.0,2590626,1709334.0,2C,...,2,True,-19.267288,-7.374098,13.717241,-1.786627,9.21286,-26.514785,L23-IT-Cux2,L23-IT-Cux2
2C_M_10,180508_CEMBA_mm_P56_P63_2C_CEMBA180409_2C_1_CE...,0.00581,0.7572,0.75578,0.02603,0.02034,1006630.0,2348484,1606410.0,2C,...,2,True,-15.340144,-2.73938,4.829485,9.943986,-22.960995,13.509483,Outlier,Outlier
2C_M_100,180508_CEMBA_mm_P56_P63_2C_CEMBA180409_2C_1_CE...,0.00634,0.74984,0.74824,0.02718,0.02097,2026078.0,4599922,3154619.0,2C,...,2,True,-23.62514,-5.929033,13.739521,1.107174,12.798545,-17.126331,L23-IT-Cux2,L23-IT-Cux2
2C_M_1000,180508_CEMBA_mm_P56_P63_2C_CEMBA180409_2C_3_CE...,0.00521,0.77073,0.76953,0.02117,0.01604,2368464.0,5320502,3648786.0,2C,...,2,True,-21.81519,-7.130878,14.154541,-2.284118,8.443658,-27.943267,L23-IT-Cux2,L23-IT-Cux2


In [34]:
gc_mat = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)
print(gc_mat.data['mc'].shape, gc_mat.data['c'].shape, gc_mat.gene.shape, gc_mat.cell.shape)

(55487, 9941) (55487, 9941) (55487,) (9941,)


In [35]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([cell for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells
assert np.all(selected_cells_id != -1)

data_update = {'mc': gc_mat.data['mc'].tocsc()[:, selected_cells_id], 
               'c': gc_mat.data['c'].tocsc()[:, selected_cells_id], 
              }

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, data_update)
print(gc_mat_update.data['mc'].shape, 
      gc_mat_update.data['c'].shape, 
      gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(55487, 9876) (55487, 9876) (55487,) (9876,)


In [36]:
# save mc and c tables

snmcseq_utils.save_gc_matrix_methylation(gc_mat_update, 
                                         fout_data_gene, fout_data_cell, 
                                         fout_data_mc, fout_data_c)

In [37]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)

### ATAC 
- rename barcode and header
- read snap files

In [23]:
dataset_name = 'snatac_gene'

f_meta = os.path.join(SRC_DIR, dataset_name+'_metadata.tsv') 
f_data = os.path.join(SRC_DIR, dataset_name+'_raw.npz') 
f_data_gene = os.path.join(SRC_DIR, dataset_name+'_raw.gene') 
f_data_cell = os.path.join(SRC_DIR, dataset_name+'_raw.cell') 

fout_meta = os.path.join(DST_DIR, dataset_name+'_metadata.tsv') 
fout_data = os.path.join(DST_DIR, dataset_name+'_raw.npz') 
fout_data_gene = os.path.join(DST_DIR, dataset_name+'_raw.gene') 
fout_data_cell = os.path.join(DST_DIR, dataset_name+'_raw.cell') 

print(f_meta)
print(fout_meta)
print(f_data)
print(fout_data)

/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snatac_gene_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snatac_gene_metadata.tsv
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_formatted/snatac_gene_raw.npz
/cndd/fangming/CEMBA/data/MOp_all/data_freeze_qc_cells/snatac_gene_raw.npz


In [24]:
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
print(meta.groupby('cluster').size())
print(meta.shape)
meta.head()

cluster
ASC                7215
Chodl                86
Endo               1245
L23.a              6085
L23.b              5758
L23.c              3590
L4                 7307
L5.IT.a            3668
L5.IT.b            4214
L5.PT              1755
L6.CT              8008
L6.IT              4233
Lamp5_Arhgdib       237
Lamp5_Mettl21e      171
Lamp5_Ndnf          204
Lamp5_Smad3         940
MGC                3899
NP                 1291
OGC                8557
OPC                3279
Other              1571
Pv_Ntf3_Trim63     1105
Pv_Tac1            1944
Pv_Vsig2            431
Smc                 586
Sncg                246
Sst_Chrna2_Myh8     538
Sst_Man1a          1355
Sst_Stk33           246
Vip_Chat            500
Vip_Gcnt4           292
Vip_Hcls1           395
Vip_Lipg            245
dtype: int64
(81196, 12)


Unnamed: 0_level_0,sample,barcode,TN,UM,PP,UQ,CM,cluster,tsne1,tsne2,umap-1,umap-2
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CEMBA171206_3C_AGCGATAGAACCAGGTAAGAGATGTATAGCCT,CEMBA171206_3C,AGCGATAGAACCAGGTAAGAGATGTATAGCCT,138330,128263,128039,64868,0,L6.CT,7.605117,0.697672,-2.125204,-4.396653
CEMBA171206_3C_AGCGATAGAACCAGGTAATGACGTCAGGACGT,CEMBA171206_3C,AGCGATAGAACCAGGTAATGACGTCAGGACGT,159133,146663,146348,69167,0,L23.b,-9.492925,-6.200938,8.959074,3.006819
CEMBA171206_3C_AGCGATAGAACCAGGTAGGATAACATAGAGGC,CEMBA171206_3C,AGCGATAGAACCAGGTAGGATAACATAGAGGC,19152,17586,17545,8348,0,ASC,14.227358,-11.023748,-6.880754,-2.903271
CEMBA171206_3C_AGCGATAGAACCAGGTATAGCCTTAGGCGAAG,CEMBA171206_3C,AGCGATAGAACCAGGTATAGCCTTAGGCGAAG,14876,13837,13801,6583,0,L6.CT,5.144768,4.790674,-0.955707,-4.635163
CEMBA171206_3C_AGCGATAGAACCAGGTATAGCCTTATAGAGGC,CEMBA171206_3C,AGCGATAGAACCAGGTATAGCCTTATAGAGGC,28532,26303,26202,12437,0,L5.IT.b,5.218895,-8.674734,0.796851,4.447933


In [25]:
gc_mat = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
print(gc_mat.data.shape, gc_mat.gene.shape, gc_mat.cell.shape)

(53278, 135665) (53278,) (135665,)


In [26]:
selected_cells = meta.index.values
selected_cells_abbr = np.array([cell for cell in selected_cells])
selected_cells_id = snmcseq_utils.get_index_from_array(gc_mat.cell, selected_cells_abbr)
selected_cells = dataset_name + '_' + selected_cells
assert np.all(selected_cells_id != -1)

gc_mat_update = GC_matrix(gc_mat.gene, selected_cells, gc_mat.data.tocsc()[:, selected_cells_id])
print(gc_mat_update.data.shape, gc_mat_update.gene.shape, gc_mat_update.cell.shape)

(53278, 81196) (53278,) (81196,)


In [27]:
# output
# save 
ti = time.time()
snmcseq_utils.save_gc_matrix(gc_mat_update, fout_data_gene, fout_data_cell, fout_data)
print(time.time()-ti)

197.4244635105133


In [28]:
# assemble metadata (keep order, add male and female)
meta.index = selected_cells
meta.index.name = 'cell'
meta.to_csv(fout_meta, sep="\t", header=True, index=True)