In [1]:
import sys
import importlib
sys.path.insert(0, '../scripts')

import numpy as np
from scipy import sparse
import time
import re
import anndata

from __global_variables import *
from utils_new import *
import basic_utils
importlib.reload(basic_utils)
import preproc_utils
importlib.reload(preproc_utils)

  from pandas.core.index import RangeIndex


<module 'preproc_utils' from '../scripts/preproc_utils.py'>

# Task

- Start from the prepared per-modality files (`anndata`, stored as `.h5ad`).
- Compute the highly variable features (hvfeatures) and store them (`anndata`, stored as `.h5ad`).

# Settings

In [2]:
def get_size_in_GB(obj):
    """Return the size of `obj` in GB (bytes divided by 1024**3).

    The byte count comes from `sys.getsizeof`, so it is whatever that
    function reports for `obj` itself.
    """
    bytes_per_gb = 1 << 30  # same value as 1024**3
    size_in_bytes = sys.getsizeof(obj)
    return size_in_bytes / bytes_per_gb

In [3]:
# Directory the per-modality input .h5ad files are read from (used with f_data_format).
SRC_DIR = './datasets_pre'
# Directory the highly-variable-feature .h5ad files are written to (used with f_hvftr_data_format).
DST_DIR = './datasets'

# Put DST_DIR first on the import path so `__init__datasets` can be imported from it.
# NOTE(review): the star import's contents are not visible here; presumably it provides
# names such as `settings` used further down — confirm against __init__datasets.py.
sys.path.insert(0, DST_DIR)
from __init__datasets import *


# Path templates, filled as .format(directory, modality).
# Both templates are the same string; input and output differ only by directory.
f_data_format = '{0}/{1}.h5ad'
f_hvftr_data_format = '{0}/{1}.h5ad'



In [4]:
# Modalities to process in this run; each name is also the .h5ad file stem.
mods_selected = ['snatac']

# Normalization applied to each modality. The processing loop below branches on
# the values 'MC', 'CPM' and 'TPM'.
normalization_options = dict(snatac='TPM')

In [5]:
# Gene annotation table, re-indexed by its 'ensid' column.
# NOTE(review): `get_gene_annotation` is not defined in this notebook; presumably it
# comes from one of the star imports above — confirm.
df_genes = get_gene_annotation().set_index('ensid')

# Per-gene length computed as end - start; reindexed to each dataset's genes
# later for the TPM normalization.
gene_lengths_base = (df_genes['end'] - df_genes['start'])
print(gene_lengths_base.shape)
# Last expression of the cell: displayed as the cell output.
gene_lengths_base.head()

(32285,)


ensid
ENSMUSG00000051951    465597
ENSMUSG00000089699     46966
ENSMUSG00000102331     11595
ENSMUSG00000102343     80476
ENSMUSG00000025900    409684
dtype: int64

# highly variable features

In [6]:
# For every selected modality: read the prepared matrix, normalize it according to
# `normalization_options[mod]`, keep the highly variable features, and write the
# result to DST_DIR as a gzip-compressed .h5ad file.
for mod in mods_selected:
    ti = time.time()
    print(mod)

    normalization_option = normalization_options[mod]
    # Fail fast on an unknown option. Without this check the non-'MC' branch would
    # reach the save step with `gxc_hvftr` unassigned (NameError) or, on a later
    # iteration, still holding the previous modality's matrix.
    if normalization_option not in ('MC', 'CPM', 'TPM'):
        raise ValueError(
            "Unsupported normalization option {!r} for modality {!r}; "
            "expected one of 'MC', 'CPM', 'TPM'".format(normalization_option, mod)
        )

    # read data matrix
    if normalization_option == 'MC':
        # NOTE(review): this branch uses names that no visible cell defines
        # (`snmcseq_utils`, `f_data_gene`, `f_data_cell`, `f_data_mc`, `f_data_c`,
        # `f_hvftr_format`) and reads `meta`, which is only assigned in the other
        # branch. Presumably it predates the h5ad-based workflow — confirm and
        # rewire before selecting 'MC' for any modality.
        f_data = f_data_format.format(SRC_DIR, mod)

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        gxc_raw = snmcseq_utils.load_gc_matrix_methylation(f_data_gene, f_data_cell, f_data_mc, f_data_c)
        print(gxc_raw.data['mc'].shape, gxc_raw.data['c'].shape)
        print(time.time()-ti)

        # output file
        f_hvftr_data_methylation = f_hvftr_format.format(DST_DIR, mod, 'tsv')
        print(time.time()-ti)

        # check meta cells agree with gxc cells
        assert np.all(meta.index.values == gxc_raw.cell)
        # check genes are uniq
        assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene))
        # do
        gxc_hvftr = preproc_utils.preproc_methylation(
                                                      gxc_raw,
                                                      meta,
                                                      global_value_col=settings[mod].global_mean,
                                                      base_call_cutoff=20,
                                                      sufficient_coverage_fraction=0.95,
                                                      hv_percentile=30,
                                                      n_qcut=10,
                                                      )
        # save
        print(mod, "Saving to files {}".format(time.time()-ti))
#         gxc_hvftr.to_csv(f_hvftr_data_methylation, sep="\t", header=True, index=True, na_rep='NA')
        # NOTE(review): `h5ad_mat_hvftr` and `f_hvftr_data` are only assigned in the
        # non-'MC' branch, so this line does not save the `gxc_hvftr` computed above;
        # it would re-write whatever a previous iteration left behind.
        h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')

    else:
        # input, output files
        f_data = f_data_format.format(SRC_DIR, mod)
        f_hvftr_data = f_hvftr_data_format.format(DST_DIR, mod)

        # read in files
        print(mod, "Reading in files {}".format(time.time()-ti))
        h5ad_mat = anndata.read_h5ad(f_data)
        gid_col, cid_col = 'ensid', ''
        meta, gxc_raw = basic_utils.h5ad_to_scf_rna_format(h5ad_mat, gid_col, cid_col)

        # check meta cells agree with gxc cells
        assert np.all(meta.index.values == gxc_raw.cell)
        # check genes are uniq
        assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene))

        # get hvftrs
        print(mod, "Preproc and get highly variable genes {}".format(time.time()-ti))
        if normalization_option == 'CPM':
            gxc_hvftr = preproc_utils.preproc_rna_cpm_based(
                                             gxc_raw,
                                             sufficient_cell_coverage=0.01,
                                             hv_percentile=30, hv_ncut=10)
        else:
            # Only 'TPM' can reach this point (options are validated at the top of
            # the loop). Align the annotation-derived gene lengths to this
            # dataset's gene order; genes missing from the annotation become NaN
            # after reindex, and the callee is asked to impute them.
            gene_lengths = gene_lengths_base.reindex(gxc_raw.gene)
            gxc_hvftr = preproc_utils.preproc_rna_tpm_based(
                                             gxc_raw, gene_lengths, impute_gene_lengths=True,
                                             sufficient_cell_coverage=0.01,
                                             hv_percentile=30, hv_ncut=10)

        # save
        # The template now has a placeholder for the elapsed time; it previously had
        # one placeholder for two arguments, so the timing was silently dropped.
        print(mod, "Saving to file {} {}".format(f_hvftr_data, time.time()-ti))
        h5ad_mat_hvftr = basic_utils.scf_rna_format_to_h5ad(meta, gxc_hvftr)
        h5ad_mat_hvftr.write(f_hvftr_data, compression='gzip')

    print(mod, "Total time used: {}".format(time.time()-ti))
    

snatac
snatac Reading in files 7.104873657226562e-05
snatac Preproc and get highly variable genes 1.8358633518218994
Imputing gene lengths...
Removing low coverage genes...
Getting CPM..
Getting highly variable genes and logCPM...
Getting logTPM...
Trim logTPM matrix...
Number of genes: 5427
snatac Saving to file ./datasets/snatac.h5ad
snatac Total time used: 11.084995746612549


## Check highly-variable genes

In [7]:
# for mod in mods_selected:
#     print(mod)
#     if settings[mod].mod_category == 'mc':
#         f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'tsv') 
#         gxc_hvftr = pd.read_csv(f_hvftr_data, sep="\t", index_col=0)
#         print(gxc_hvftr.index.values)
#         print(gxc_hvftr.columns.values)
#         print(gxc_hvftr.shape)
#         has_nan = np.isnan(gxc_hvftr.values).any()
#         print("Contains NaN? {}".format(has_nan))
        
#         continue
        
#     f_hvftr_data = f_hvftr_format.format(SRC_DIR, mod, 'npz') 
#     f_hvftr_gene = f_hvftr_format.format(SRC_DIR, mod, 'gene') 
#     f_hvftr_cell = f_hvftr_format.format(SRC_DIR, mod, 'cell') 
#     gxc_hvftr = snmcseq_utils.load_gc_matrix(f_hvftr_gene, f_hvftr_cell, f_hvftr_data)
#     print(gxc_hvftr.gene)
#     print(gxc_hvftr.cell)
#     print(len(gxc_hvftr.gene), len(gxc_hvftr.cell), gxc_hvftr.data.shape)
#     has_nan = np.isnan(gxc_hvftr.data.data).any()
#     print("Contains NaN? {}".format(has_nan))
# #     break