In [1]:
import sys
sys.path.insert(0, '/cndd/fangming/CEMBA/snmcseq_dev')
import importlib

from __init__ import *
from __init__jupyterlab import *
import snmcseq_utils
importlib.reload(snmcseq_utils)
import CEMBA_preproc_utils
importlib.reload(CEMBA_preproc_utils)
import CEMBA_clst_utils

from scipy.io import mmread
from scipy import sparse
import time

import fbpca

In [2]:
# Display the installed matplotlib version so the environment of this run is on record.
import matplotlib
matplotlib.__version__

'2.2.2'

# Task

- Start from the prepared files of each dataset: `metadata` and `count matrix`.
- Compute and store the highly variable features (hvfeatures).

# Settings

In [3]:
# Input directory (prepared metadata / raw matrices) and output directory (hvfeatures).
# Both currently point at the same data-freeze folder.
SRC_DIR = '/cndd2/fangming/projects/miniatlas/data/data_freeze_it_v2/'
DST_DIR = '/cndd2/fangming/projects/miniatlas/data/data_freeze_it_v2/'
# SRC_DIR must be on sys.path before the star import below, which loads the
# dataset definitions stored next to the data (presumably the source of the
# `settings` object used by the hvfeatures loop -- TODO confirm).
sys.path.insert(0, SRC_DIR)
from __init__datasets import *

# Path templates, filled in with str.format:
#   f_meta_format  : {0}=directory, {1}=modality
#   f_data_format  : {0}=directory, {1}=modality, {2}=optional prefix ('', 'CH_', 'mCH_'), {3}=extension
#   f_hvftr_format : {0}=directory, {1}=modality, {2}=extension
#   f_ftr_format   : {0}=directory, {1}=modality, {2}=extension (not used in the cells visible here)
f_meta_format = '{0}/{1}_metadata.tsv'
f_data_format = '{0}/{1}_{2}raw.{3}'
f_hvftr_format = '{0}/{1}_hvfeatures.{2}'
f_ftr_format = '{0}/{1}_features.{2}'

def get_size_in_GB(obj):
    """Return the size of ``obj`` in GiB, as reported by ``sys.getsizeof``.

    The byte count from ``sys.getsizeof`` is divided by 1024**3. Note that
    ``sys.getsizeof`` only accounts for the memory directly attributed to the
    object itself, not for the objects it refers to, so containers may be
    under-reported.
    """
    bytes_per_gb = 1 << 30  # 1024**3
    n_bytes = sys.getsizeof(obj)
    return n_bytes / bytes_per_gb

In [4]:
# Modalities (datasets) to process, in processing order. Every name is used to
# build the input/output file paths and must be a key of `normalization_options`.
mods_selected = [
    'snmcseq_gene',
    'snatac_gene',
    'smarter_nuclei',
    'smarter_cells',
    '10x_nuclei_v3_macosko',
    '10x_cells_v3',
    '10x_nuclei_v3',
    '10x_cells_v2',
    'patchseq',
]

In [5]:
# # gene id (abbr) as index
gene_annot_file = PATH_GENEBODY_ANNOTATION
gene_annot = pd.read_csv(gene_annot_file, sep="\t")
gene_annot['gene_id_abbr'] = gene_annot['gene_id'].apply(lambda x: x.split('.')[0])
gene_annot = gene_annot.set_index('gene_id_abbr')

gene_lengths_base = (gene_annot['end'] - gene_annot['start'])
print(gene_lengths_base.head())

gene_id_abbr
ENSMUSG00000102693      1069
ENSMUSG00000064842       109
ENSMUSG00000051951    465597
ENSMUSG00000102851       479
ENSMUSG00000103377      2818
dtype: int64


In [6]:
# Normalization scheme per modality. The hvfeatures loop uses it to pick the
# preprocessing routine from CEMBA_preproc_utils:
#   'MC'  -> preproc_methylation
#   'CPM' -> preproc_rna_cpm_based
#   'TPM' -> preproc_rna_tpm_based (additionally needs gene lengths)
normalization_options = {
    'smarter_nuclei': 'TPM',
    'smarter_cells': 'TPM',
    'snatac_gene': 'TPM',
    '10x_nuclei_v3_macosko': 'CPM',
    '10x_cells_v3': 'CPM',
    '10x_nuclei_v3': 'CPM',
    'snmcseq_gene': 'MC',
    '10x_cells_v2': 'CPM',
    'patchseq': 'TPM',
}

# highly variable features

In [7]:
# Compute and store the highly variable features (hvfeatures) of every selected modality.
#
# For each modality the loop:
#   1. reads the metadata table and the raw data matrix from SRC_DIR,
#   2. checks that metadata rows and matrix cells line up and that gene ids are unique,
#   3. runs the CEMBA_preproc_utils routine matching the modality's normalization option,
#   4. writes the result to DST_DIR (one TSV for 'MC'; a gene/cell/npz triplet otherwise).

def _check_cells_and_genes(meta, gxc, mod):
    """Sanity-check a loaded matrix against its metadata.

    Raises AssertionError if the metadata index differs from ``gxc.cell``
    (in content, order or length) or if ``gxc.gene`` contains duplicates.
    """
    # np.array_equal also returns False on a length mismatch instead of
    # relying on broadcasting / ill-defined elementwise comparison.
    assert np.array_equal(meta.index.values, gxc.cell), \
        "{}: metadata index does not match the cells of the data matrix".format(mod)
    assert len(gxc.gene) == len(np.unique(gxc.gene)), \
        "{}: gene ids of the data matrix are not unique".format(mod)


for mod in mods_selected:
    ti = time.time()
    print(mod)

    # Fail fast on a misconfigured option: an unrecognised value would otherwise
    # go through the expensive load and then silently write nothing.
    normalization_option = normalization_options[mod]
    if normalization_option not in ('MC', 'CPM', 'TPM'):
        raise ValueError(
            "Unknown normalization option '{}' for '{}'; expected 'MC', 'CPM' or 'TPM'"
            .format(normalization_option, mod))

    # read metadata
    f_meta = f_meta_format.format(SRC_DIR, mod)
    meta = pd.read_csv(f_meta, sep="\t", index_col=0)

    if normalization_option == 'MC':
        # input files: gene list, cell list, and two matrices (mCH and CH prefixes)
        f_data_gene = f_data_format.format(SRC_DIR, mod, '', 'gene')
        f_data_cell = f_data_format.format(SRC_DIR, mod, '', 'cell')
        f_data_c = f_data_format.format(SRC_DIR, mod, 'CH_', 'npz')
        f_data_mc = f_data_format.format(SRC_DIR, mod, 'mCH_', 'npz')
        # output file
        f_hvftr_data_methylation = f_hvftr_format.format(DST_DIR, mod, 'tsv')

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)
        print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)
        print(time.time()-ti)

        _check_cells_and_genes(meta, gxc_raw, mod)

        # get hvftrs; the per-cell global value column comes from the dataset settings
        gxc_hvftr = CEMBA_preproc_utils.preproc_methylation(
            gxc_raw,
            meta,
            global_value_col=settings[mod].global_mean,
            base_call_cutoff=20,
            sufficient_coverage_fraction=0.95,
            hv_percentile=30,
            n_qcut=10,
        )

        # save (the result is written with DataFrame.to_csv, missing values as 'NA')
        print(mod, "Saving to files {}".format(time.time()-ti))
        gxc_hvftr.to_csv(f_hvftr_data_methylation, sep="\t", header=True, index=True, na_rep='NA')

    else:
        # input files: gene list, cell list and a single raw matrix
        f_data = f_data_format.format(SRC_DIR, mod, '', 'npz')
        f_data_gene = f_data_format.format(SRC_DIR, mod, '', 'gene')
        f_data_cell = f_data_format.format(SRC_DIR, mod, '', 'cell')
        # output files
        f_hvftr_data = f_hvftr_format.format(DST_DIR, mod, 'npz')
        f_hvftr_gene = f_hvftr_format.format(DST_DIR, mod, 'gene')
        f_hvftr_cell = f_hvftr_format.format(DST_DIR, mod, 'cell')

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data) # checked dimensions in agreement internally
        print(gxc_raw.data.shape)

        _check_cells_and_genes(meta, gxc_raw, mod)

        # get hvftrs
        print(mod, "Preproc and get highly variable genes {}".format(time.time()-ti))
        if normalization_option == 'CPM':
            gxc_hvftr = CEMBA_preproc_utils.preproc_rna_cpm_based(
                gxc_raw,
                sufficient_cell_coverage=0.01,
                hv_percentile=30, hv_ncut=10)
        else:
            # 'TPM' (the only remaining validated option): needs gene lengths
            # aligned to the genes of this matrix; genes absent from the
            # annotation become NaN after reindex and are handled by the
            # helper via impute_gene_lengths=True.
            gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)
            gxc_hvftr = CEMBA_preproc_utils.preproc_rna_tpm_based(
                gxc_raw, gene_lengths, impute_gene_lengths=True,
                sufficient_cell_coverage=0.01,
                hv_percentile=30, hv_ncut=10)

        # save (same gene/cell/matrix triplet for both CPM and TPM)
        print(mod, "Saving to files {}".format(time.time()-ti))
        snmcseq_utils.save_gc_matrix(gxc_hvftr, f_hvftr_gene, f_hvftr_cell, f_hvftr_data)

    print(mod, "Total time used: {}".format(time.time()-ti))
    

snmcseq_gene
snmcseq_gene Reading in files 0.03411746025085449
(55487, 5339) (55487, 5339)
19.660736560821533
19.66075611114502
snmcseq_gene Saving to files 57.82907247543335
snmcseq_gene Total time used: 94.03031349182129
snatac_gene
snatac_gene Reading in files 0.17479276657104492
(53278, 32967)
snatac_gene Preproc and get highly variable genes 5.035203218460083
Imputing gene lengths...
Removing low coverage genes...
Getting CPM..
Getting highly variable genes and logCPM...
Getting logTPM...
Trim logTPM matrix...
Number of genes: 6318
snatac_gene Saving to files 31.258440494537354
snatac_gene Total time used: 63.72266435623169
smarter_nuclei
smarter_nuclei Reading in files 0.15886998176574707
(32324, 2304)
smarter_nuclei Preproc and get highly variable genes 1.385819435119629
Imputing gene lengths...
Removing low coverage genes...
Getting CPM..
Getting highly variable genes and logCPM...
Getting logTPM...
Trim logTPM matrix...
Number of genes: 5373
smarter_nuclei Saving to files 4.04

In [8]:
# f = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_l5pt/merfish_metadata.tsv'
# meta = pd.read_csv(f, sep='\t', index_col=0)

# fcell = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_l5pt/merfish_hvfeatures.cell'
# fgene = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_l5pt/merfish_hvfeatures.gene'
# fmat = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_l5pt/merfish_hvfeatures.npz'
# gc_mat = snmcseq_utils.load_gc_matrix(fgene, fcell, fmat)

# assert np.all(meta.index.values == gc_mat.cell)

In [9]:
# f = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_l5pt/epi_retro_metadata.tsv'
# meta = pd.read_csv(f, sep='\t', index_col=0)
# fmat = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_l5pt/epi_retro_hvfeatures.tsv'
# gc_mat = pd.read_csv(fmat, sep='\t', index_col=0)

# assert np.all(meta.index.values == gc_mat.columns.values)