In [7]:
# Put the local snmcseq_dev checkout first on sys.path so the project modules
# imported below are resolved from it.
import sys
sys.path.insert(0, '/cndd/fangming/CEMBA/snmcseq_dev')
import importlib

# Wildcard imports from the project's init modules. Names used later without an
# explicit import (e.g. pd, np) presumably come from here -- TODO confirm.
from __init__ import *
from __init__jupyterlab import *
# Import the project helpers and reload them, so that edits made on disk are
# picked up without restarting the kernel.
import snmcseq_utils
importlib.reload(snmcseq_utils)
import CEMBA_preproc_utils
importlib.reload(CEMBA_preproc_utils)
import CEMBA_clst_utils

from scipy.io import mmread
from scipy import sparse
import time

import fbpca

In [8]:
# Display the installed matplotlib version (the bare last expression is the cell output).
import matplotlib
matplotlib.__version__

'2.2.2'

# Task

- start from prepared files: `metadata` and `count matrix`
- get and store highly variable features (hvfeatures)

# Settings

In [9]:
# Directory the prepared inputs are read from, and directory results are
# written to (currently the same data-freeze folder).
SRC_DIR = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_neurons'
DST_DIR = '/cndd/fangming/CEMBA/data/MOp_all/data_freeze_neurons'
# SRC_DIR goes on sys.path before the wildcard import below; the
# __init__datasets module is presumably located there -- TODO confirm.
sys.path.insert(0, SRC_DIR)
from __init__datasets import *

# File-name templates, filled in with str.format by the cells below:
#   f_meta_format  : {0}=directory, {1}=modality
#   f_data_format  : {0}=directory, {1}=modality, {2}=prefix ('', 'CH_' or 'mCH_'), {3}=extension
#   f_hvftr_format : {0}=directory, {1}=modality, {2}=extension
#   f_ftr_format   : {0}=directory, {1}=modality, {2}=extension
f_meta_format = '{0}/{1}_metadata.tsv'
f_data_format = '{0}/{1}_{2}raw.{3}'
f_hvftr_format = '{0}/{1}_hvfeatures.{2}'
f_ftr_format = '{0}/{1}_features.{2}'


def get_size_in_GB(obj):
    """Return the size of ``obj`` as reported by ``sys.getsizeof``, in GB.

    The byte count is divided by 1024**3. ``sys.getsizeof`` only accounts for
    the memory directly attributed to the object itself, not for objects it
    refers to.
    """
    bytes_per_gb = 1 << 30  # same value as 1024**3
    n_bytes = sys.getsizeof(obj)
    return n_bytes / bytes_per_gb

In [10]:
# Modalities (dataset names) iterated over by the processing loops below.
# Each name is also used as the key into normalization_options and as part of
# the input/output file names. Commented-out entries are skipped.
mods_selected = [
    'snmcseq_gene',
    'snatac_gene',
    
    'smarter_nuclei',
    'smarter_cells',
    '10x_nuclei_v3_macosko',
    '10x_cells_v3',
    '10x_nuclei_v3',
    '10x_cells_v2',
    
#     '10x_nuclei_v2',
#     'smarter-cells-v1',
]

In [11]:
# # gene name as index
# gene_annot_file = PATH_GENEBODY_ANNOTATION
# gene_annot = pd.read_csv(gene_annot_file, sep="\t")
# gene_annot_v2 = gene_annot.groupby('gene_name').first()
# print(gene_annot_v2.shape)
# gene_lengths_base = (gene_annot_v2['end'] - gene_annot_v2['start'])
# print(gene_lengths_base.head())

In [12]:
# Gene annotation table indexed by abbreviated gene id, i.e. the part of
# `gene_id` before the first '.'.
gene_annot_file = PATH_GENEBODY_ANNOTATION
gene_annot = (
    pd.read_csv(gene_annot_file, sep="\t")
    .assign(gene_id_abbr=lambda tbl: tbl['gene_id'].map(lambda gid: gid.split('.')[0]))
    .set_index('gene_id_abbr')
)

# Gene length taken as end - start, one value per abbreviated gene id.
gene_lengths_base = gene_annot['end'].sub(gene_annot['start'])
print(gene_lengths_base.head())

gene_id_abbr
ENSMUSG00000102693      1069
ENSMUSG00000064842       109
ENSMUSG00000051951    465597
ENSMUSG00000102851       479
ENSMUSG00000103377      2818
dtype: int64


In [13]:
# Normalization scheme per modality. The processing loops below branch on this
# value: 'MC' selects the methylation branch, 'CPM' and 'TPM' select the
# corresponding count-based branches ('TPM' additionally uses gene lengths).
normalization_options = {
    'smarter_nuclei': 'TPM',
    'smarter_cells': 'TPM',
    'snatac_gene': 'TPM',
    '10x_nuclei_v3_macosko': 'CPM',
    '10x_cells_v3': 'CPM',
    '10x_nuclei_v3': 'CPM',
    'snmcseq_gene': 'MC',
    '10x_cells_v2': 'CPM',
#     '10x_nuclei_v2': 'CPM',
#     'smarter-cells-v1': 'TPM',
}

# highly variable features

In [8]:
# Compute and store highly variable features (hvfeatures) for every selected
# modality. Per modality: load metadata and the raw gene-by-cell matrix from
# SRC_DIR, sanity-check them, run the preprocessing matching the modality's
# normalization option, and write the result to DST_DIR.
for mod in mods_selected:
    ti = time.time()
    print(mod)

    # Fail fast on an unsupported option. Previously such a modality was fully
    # loaded and then silently skipped without writing any output.
    normalization_option = normalization_options[mod]
    if normalization_option not in ('MC', 'CPM', 'TPM'):
        raise ValueError(
            "Unknown normalization option {!r} for {!r} (expected 'MC', 'CPM' or 'TPM')"
            .format(normalization_option, mod))

    # read metadata
    f_meta = f_meta_format.format(SRC_DIR, mod)
    meta = pd.read_csv(f_meta, sep="\t", index_col=0)

    # gene / cell label files follow the same naming for every modality
    f_data_gene = f_data_format.format(SRC_DIR, mod, '', 'gene')
    f_data_cell = f_data_format.format(SRC_DIR, mod, '', 'cell')

    # ---- load the raw matrix and set up output paths ----
    if normalization_option == 'MC':
        # methylation data: separate matrices under the 'CH_' and 'mCH_' prefixes
        f_data_c = f_data_format.format(SRC_DIR, mod, 'CH_', 'npz')
        f_data_mc = f_data_format.format(SRC_DIR, mod, 'mCH_', 'npz')

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)
        print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)
        print(time.time()-ti)

        # output file
        f_hvftr_data_methylation = f_hvftr_format.format(DST_DIR, mod, 'tsv')
        print(time.time()-ti)
    else:
        f_data = f_data_format.format(SRC_DIR, mod, '', 'npz')

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data) # checked dimensions in agreement internally
        print(gxc_raw.data.shape)

        # output files
        f_hvftr_data = f_hvftr_format.format(DST_DIR, mod, 'npz')
        f_hvftr_gene = f_hvftr_format.format(DST_DIR, mod, 'gene')
        f_hvftr_cell = f_hvftr_format.format(DST_DIR, mod, 'cell')

    # ---- sanity checks shared by all modalities ----
    # np.array_equal also returns False on a length mismatch, so the assertion
    # fails cleanly instead of erroring inside the element-wise comparison.
    assert np.array_equal(meta.index.values, gxc_raw.cell), \
        "{}: metadata cells do not match matrix cells".format(mod)
    assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)), \
        "{}: gene identifiers are not unique".format(mod)

    # ---- preprocess and save ----
    if normalization_option == 'MC':
        gxc_hvftr = CEMBA_preproc_utils.preproc_methylation(
            gxc_raw,
            meta,
            global_value_col=settings[mod].global_mean,
            base_call_cutoff=20,
            sufficient_coverage_fraction=0.95,
            hv_percentile=30,
            n_qcut=10,
        )
        # save
        print(mod, "Saving to files {}".format(time.time()-ti))
        gxc_hvftr.to_csv(f_hvftr_data_methylation, sep="\t", header=True, index=True, na_rep='NA')
    else:
        # get hvftrs
        print(mod, "Preproc and get highly variable genes {}".format(time.time()-ti))
        if normalization_option == 'CPM':
            gxc_hvftr = CEMBA_preproc_utils.preproc_rna_cpm_based(
                gxc_raw,
                sufficient_cell_coverage=0.01,
                hv_percentile=30, hv_ncut=10)
        else:  # 'TPM' (validated at the top of the loop)
            # align gene lengths to the matrix's gene order; genes missing from
            # the annotation become NaN and are left to impute_gene_lengths=True
            gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)
            gxc_hvftr = CEMBA_preproc_utils.preproc_rna_tpm_based(
                gxc_raw, gene_lengths, impute_gene_lengths=True,
                sufficient_cell_coverage=0.01,
                hv_percentile=30, hv_ncut=10)
        # save
        print(mod, "Saving to files {}".format(time.time()-ti))
        snmcseq_utils.save_gc_matrix(gxc_hvftr, f_hvftr_gene, f_hvftr_cell, f_hvftr_data)

    print(mod, "Total time used: {}".format(time.time()-ti))
    

snmcseq_gene
snmcseq_gene Reading in files 0.05875039100646973
(55487, 9366) (55487, 9366)
34.32368016242981
34.32371425628662
snmcseq_gene Saving to files 106.1889476776123
snmcseq_gene Total time used: 174.43911790847778
snatac_gene
snatac_gene Reading in files 0.23523902893066406
(53278, 54844)
snatac_gene Preproc and get highly variable genes 7.923092603683472
Imputing gene lengths...
Removing low coverage genes...
Getting CPM..
Getting highly variable genes and logCPM...
Getting logTPM...
Trim logTPM matrix...
Number of genes: 6345
snatac_gene Saving to files 55.40879034996033
snatac_gene Total time used: 107.64456629753113


## Check highly-variable genes

In [9]:
# Read back the stored hvfeature files for every modality and print their
# labels, dimensions, and whether they contain NaN.
# The files are read from DST_DIR, the directory the hvfeature cell wrote them
# to (previously SRC_DIR, which only worked because both are currently equal).
for mod in mods_selected:
    print(mod)
    if settings[mod].mod_category == 'mc':
        # methylation hvfeatures are stored as one tab-separated table
        f_hvftr_data = f_hvftr_format.format(DST_DIR, mod, 'tsv')
        gxc_hvftr = pd.read_csv(f_hvftr_data, sep="\t", index_col=0)
        print(gxc_hvftr.index.values)
        print(gxc_hvftr.columns.values)
        print(gxc_hvftr.shape)
        has_nan = np.isnan(gxc_hvftr.values).any()
    else:
        # other modalities are stored as a matrix file plus gene and cell label files
        f_hvftr_data = f_hvftr_format.format(DST_DIR, mod, 'npz')
        f_hvftr_gene = f_hvftr_format.format(DST_DIR, mod, 'gene')
        f_hvftr_cell = f_hvftr_format.format(DST_DIR, mod, 'cell')
        gxc_hvftr = snmcseq_utils.load_gc_matrix(f_hvftr_gene, f_hvftr_cell, f_hvftr_data)
        print(gxc_hvftr.gene)
        print(gxc_hvftr.cell)
        print(len(gxc_hvftr.gene), len(gxc_hvftr.cell), gxc_hvftr.data.shape)
        # checks the matrix's .data attribute, as in the original
        # (presumably the stored entries of a sparse matrix -- TODO confirm)
        has_nan = np.isnan(gxc_hvftr.data.data).any()
    print("Contains NaN? {}".format(has_nan))

snmcseq_gene
['ENSMUSG00000025917' 'ENSMUSG00000079658' 'ENSMUSG00000025940' ...
 'ENSMUSG00000042179' 'ENSMUSG00000053117' 'ENSMUSG00000000266']
['snmcseq_gene_2C_M_0' 'snmcseq_gene_2C_M_1' 'snmcseq_gene_2C_M_100' ...
 'snmcseq_gene_5D_M_997' 'snmcseq_gene_5D_M_998' 'snmcseq_gene_5D_M_999']
(4754, 9366)
Contains NaN? False
snatac_gene
['ENSMUSG00000014782' 'ENSMUSG00000085895' 'ENSMUSG00000112991' ...
 'ENSMUSG00000031441' 'ENSMUSG00000075600' 'ENSMUSG00000009905']
['snatac_gene_CEMBA171206_3C_AGCGATAGAACCAGGTAAGAGATGTATAGCCT'
 'snatac_gene_CEMBA171206_3C_AGCGATAGAACCAGGTAATGACGTCAGGACGT'
 'snatac_gene_CEMBA171206_3C_AGCGATAGAACCAGGTATAGCCTTAGGCGAAG' ...
 'snatac_gene_CEMBA180618_5D_TCCGGAGATTCCATCCGTACTGACAAGAGATG'
 'snatac_gene_CEMBA180618_5D_TCCGGAGATTCCATCCGTACTGACTAAGATCC'
 'snatac_gene_CEMBA180618_5D_TCCGGAGATTCCATCCTATAGCCTCGAATTCC']
6345 54844 (6345, 54844)
Contains NaN? False


### Get and save features (ftrs)

In [17]:
# Compute and store normalized feature matrices for every selected modality.
# Per modality: load metadata and the raw gene-by-cell matrix from SRC_DIR,
# sanity-check them, normalize according to the modality's normalization
# option, and write the result to DST_DIR.
for mod in mods_selected:
    ti = time.time()
    print(mod)

    # Fail fast on an unsupported option. Previously such a modality was fully
    # loaded and then silently skipped without writing any output.
    normalization_option = normalization_options[mod]
    if normalization_option not in ('MC', 'CPM', 'TPM'):
        raise ValueError(
            "Unknown normalization option {!r} for {!r} (expected 'MC', 'CPM' or 'TPM')"
            .format(normalization_option, mod))

    # read metadata
    f_meta = f_meta_format.format(SRC_DIR, mod)
    meta = pd.read_csv(f_meta, sep="\t", index_col=0)

    # gene / cell label files follow the same naming for every modality
    f_data_gene = f_data_format.format(SRC_DIR, mod, '', 'gene')
    f_data_cell = f_data_format.format(SRC_DIR, mod, '', 'cell')

    # ---- load the raw matrix and set up output paths ----
    if normalization_option == 'MC':
        # methylation data: separate matrices under the 'CH_' and 'mCH_' prefixes
        f_data_c = f_data_format.format(SRC_DIR, mod, 'CH_', 'npz')
        f_data_mc = f_data_format.format(SRC_DIR, mod, 'mCH_', 'npz')

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)
        print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)
        print(time.time()-ti)

        # output file
        f_ftr_data_methylation = f_ftr_format.format(DST_DIR, mod, 'tsv')
        print(time.time()-ti)
    else:
        f_data = f_data_format.format(SRC_DIR, mod, '', 'npz')

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data) # checked dimensions in agreement internally
        print(gxc_raw.data.shape)

        # output files
        f_ftr_data = f_ftr_format.format(DST_DIR, mod, 'npz')
        f_ftr_gene = f_ftr_format.format(DST_DIR, mod, 'gene')
        f_ftr_cell = f_ftr_format.format(DST_DIR, mod, 'cell')

    # ---- sanity checks shared by all modalities ----
    # np.array_equal also returns False on a length mismatch, so the assertion
    # fails cleanly instead of erroring inside the element-wise comparison.
    assert np.array_equal(meta.index.values, gxc_raw.cell), \
        "{}: metadata cells do not match matrix cells".format(mod)
    assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene)), \
        "{}: gene identifiers are not unique".format(mod)

    # ---- normalize and save ----
    if normalization_option == 'MC':
        base_call_cutoff = 10
        sufficient_coverage_fraction = 0.90
        # keep genes whose 'c' count exceeds base_call_cutoff in more than
        # sufficient_coverage_fraction of the cells
        n_gene, n_cell = gxc_raw.data['c'].shape
        gene_cov = (gxc_raw.data['c'] > base_call_cutoff).sum(axis=1)
        # ravel (rather than squeeze) keeps a 1-d array even for a single gene
        gene_cov = np.asarray(gene_cov).ravel()/n_cell # fraction of cells covered
        cond = gene_cov > sufficient_coverage_fraction
        genes_kept = np.array(gxc_raw.gene)[cond]

        # to full (dense) matrices, restricted to the retained genes
        df_c = pd.DataFrame(
            gxc_raw.data['c'].tocsr()[cond, :].todense(),
            index=genes_kept,
            columns=gxc_raw.cell,
        )
        df_mc = pd.DataFrame(
            gxc_raw.data['mc'].tocsr()[cond, :].todense(),
            index=genes_kept,
            columns=gxc_raw.cell,
        )

        # compute the methylation matrix, then divide each cell's column by
        # that cell's global-mean value from the metadata
        df_mcc = snmcseq_utils.get_mcc_lite_v2(df_c, df_mc, base_call_cutoff=base_call_cutoff)
        gxc_ftr = df_mcc.divide(meta.loc[df_mcc.columns.values, settings[mod].global_mean], axis=1)
        # save
        print(mod, "Saving to files {}".format(time.time()-ti))
        gxc_ftr.to_csv(f_ftr_data_methylation, sep="\t", header=True, index=True, na_rep='NA')
    else:
        # Progress message text kept unchanged; this cell has no hv_* selection
        # parameters, unlike the hvfeature cell.
        print(mod, "Preproc and get highly variable genes {}".format(time.time()-ti))
        if normalization_option == 'CPM':
            gxc_ftr = snmcseq_utils.sparse_logcpm(gxc_raw, mode='logcpm')
        else:  # 'TPM' (validated at the top of the loop)
            # align gene lengths to the matrix's gene order; genes missing from
            # the annotation get the mean length of the annotated ones
            gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)
            gene_lengths = gene_lengths.fillna(np.nanmean(gene_lengths.values))
            gxc_ftr = snmcseq_utils.sparse_logtpm(gxc_raw, gene_lengths)
        # save
        print(mod, "Saving to files {}".format(time.time()-ti))
        snmcseq_utils.save_gc_matrix(gxc_ftr, f_ftr_gene, f_ftr_cell, f_ftr_data)

    print(mod, "Total time used: {}".format(time.time()-ti))
    

snmcseq_gene
snmcseq_gene Reading in files 0.0615079402923584
(55487, 9366) (55487, 9366)
34.14058709144592
34.14061760902405
snmcseq_gene Saving to files 102.06504321098328
snmcseq_gene Total time used: 377.45086431503296
snatac_gene
snatac_gene Reading in files 0.22000694274902344
(53278, 54844)
snatac_gene Preproc and get highly variable genes 7.857074499130249
snatac_gene Saving to files 32.79357719421387
snatac_gene Total time used: 312.04049944877625
smarter_nuclei
smarter_nuclei Reading in files 0.25891900062561035
(32324, 5911)
smarter_nuclei Preproc and get highly variable genes 3.0851221084594727
smarter_nuclei Saving to files 6.6396589279174805
smarter_nuclei Total time used: 44.92380738258362
smarter_cells


  interactivity=interactivity, compiler=compiler, result=result)


smarter_cells Reading in files 0.24219274520874023
(32324, 6244)
smarter_cells Preproc and get highly variable genes 5.207130670547485
smarter_cells Saving to files 12.198362827301025
smarter_cells Total time used: 73.11851859092712
10x_nuclei_v3_macosko
10x_nuclei_v3_macosko Reading in files 0.5683391094207764
(24809, 101647)
10x_nuclei_v3_macosko Preproc and get highly variable genes 37.23923325538635
10x_nuclei_v3_macosko Saving to files 62.449243783950806
10x_nuclei_v3_macosko Total time used: 523.6277034282684
10x_cells_v3
10x_cells_v3 Reading in files 0.3245203495025635
(31053, 69727)
10x_cells_v3 Preproc and get highly variable genes 39.314858198165894
10x_cells_v3 Saving to files 66.13786363601685
10x_cells_v3 Total time used: 600.2839329242706
10x_nuclei_v3
10x_nuclei_v3 Reading in files 0.19624018669128418
(31053, 39706)
10x_nuclei_v3 Preproc and get highly variable genes 10.818140745162964
10x_nuclei_v3 Saving to files 18.068604469299316
10x_nuclei_v3 Total time used: 153.06