In [1]:
# this takes approximately 200GB of RAM due to the count matrix being saved as dense array
import h5py
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import coo_matrix, hstack, vstack, issparse

def read_brain_atlas_data(filename):
    """Load the count matrix stored in an HDF5 file into a sparse AnnData object.

    The file is expected to contain a ``data`` group holding the datasets
    ``counts``, ``gene`` and ``samples``. ``counts`` is read chunk by chunk;
    every chunk is transposed, cast to ``int32`` and converted to a sparse
    matrix before the pieces are stitched back together, so the returned
    matrix is the transpose of the stored dataset.

    Parameters
    ----------
    filename : str or path-like
        Path of the HDF5 file to read.

    Returns
    -------
    anndata.AnnData
        ``X`` is the transposed count matrix in CSR format, ``obs`` is indexed
        by the decoded ``samples`` entries and ``var`` by the decoded ``gene``
        entries.
    """
    # the context manager guarantees the HDF5 handle is closed, also on errors
    with h5py.File(filename, 'r') as h5:
        print('reading chunks')
        countdata = h5['data']['counts']
        _, ncols = countdata.shape
        sparsechunks, sparsecolchunks = [], []
        for rowslice, colslice in countdata.iter_chunks():
            sparsechunk = coo_matrix(
                countdata[rowslice, colslice].T.astype(np.int32),
            )
            sparsechunk.eliminate_zeros()
            sparsechunks.append(sparsechunk)

            # a chunk that reaches the last column closes the current band of
            # rows: stack the (transposed) pieces of this band vertically
            if colslice.stop == ncols:
                sparsecolchunks.append(vstack(sparsechunks))
                sparsechunks = []

        print('stack column chunks')
        X = hstack(sparsecolchunks)
        del sparsecolchunks

        print('convert to csr matrix')
        X = X.tocsr()

        row_col = dict()
        for h5key, dictkey in zip(['gene', 'samples'], ['var', 'obs']):
            print(f'reading {h5key}')
            # [()] reads the whole dataset into memory before the file closes
            row_col[dictkey] = pd.DataFrame(
                index = np.char.decode(
                    h5['data'][h5key][()],
                    encoding = 'utf-8'
                )
            )

    print('converting to AnnData')
    return ad.AnnData(X, **row_col)

adata = read_brain_atlas_data('../data/mouse_brain_adult_counts.h5')
# remove the '10X_cells.' tag so the cell names match the annotation index below
# NOTE(review): with regex = True the trailing '.' matches any character, not
# only a literal dot - confirm this is intended
adata.obs.index = adata.obs.index.str.replace('10X_cells.', '', regex = True)
metadata = pd.read_csv(
    '../data/mouse_brain_adult_anno.csv',
    index_col = 0
)
metadata.index = metadata.sample_name.str.replace('10X_cells.', '', regex = True).to_list()
# DataFrame.drop returns a new frame; assign it back, otherwise the two
# columns would still be merged into adata.obs
metadata = metadata.drop(
    columns = [
        'sample_name',
        'exp_component_name'
    ]
)
# left join on the index keeps every cell of adata, annotated or not
adata.obs = adata.obs.merge(
    metadata,
    left_index = True,
    right_index = True,
    how = 'left'
)
adata.write_h5ad('../data/mouse_brain_adult.h5ad')

reading chunks
stack column chunks and convert to csr matrix
reading gene
reading samples
converting to AnnData


  mask |= (ar1 == a)
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex_label' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'region_label' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'library_label' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'platform_label' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'joint_region_color' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'joint_region_label' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'region_color' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex_color' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'platform_color' as categorical
  c.reorder_categories(natsorted

In [16]:
# keep only the cells annotated as primary somatosensory cortex (SSp)
adult = adata[adata.obs['region_label'] == 'SSp'].copy()

In [20]:
# 10x count matrices read from the P4 and E14_5 files
p4 = sc.read_10x_h5('../data/mouse_brain_P4.h5')
e14_5 = sc.read_10x_h5('../data/mouse_brain_E14_5.h5')

# tab-separated annotation table; skiprows = [1] discards the second line of
# the file (presumably a non-data header row - confirm against the file)
metadata = pd.read_csv('../data/metaData_scDevSC.txt', sep = '\t', skiprows = [1])
# use the NAME column as row labels and leave the index itself unnamed
metadata = metadata.set_index('NAME')
metadata.index.name = None
metadata

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Unnamed: 0,orig_ident,nCount_RNA,nFeature_RNA,percent_mito,n_hkgene,S_Score,G2M_Score,Phase,CC_Difference,seurat_clusters,...,donor_id,species,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,sex,species__ontology_label
E10_v1_AAACCTGAGGGTCTCC-1,E10,1544,1022,0.020078,51,0.356987,0.330795,S,0.026192,34,...,mouse_E10,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
E10_v1_AAACCTGCACAACGCC-1,E10,1157,783,0.014693,39,0.453854,0.260560,S,0.193294,34,...,mouse_E10,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
E10_v1_AAACCTGCACGACGAA-1,E10,2081,1200,0.016338,67,0.447598,0.218746,S,0.228852,34,...,mouse_E10,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
E10_v1_AAACCTGCAGACAAGC-1,E10,2490,1430,0.021285,71,0.227723,0.307102,G2M,-0.079379,37,...,mouse_E10,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
E10_v1_AAACCTGCAGAGCCAA-1,E10,2514,1416,0.021082,70,0.738406,0.136556,S,0.601850,37,...,mouse_E10,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P4_TTTGTCAGTTGATTGC-1,P4,8028,3130,0.064275,74,-0.225020,-0.244369,G1,0.019349,16,...,mouse_P4,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
P4_TTTGTCATCAACGGCC-1,P4,15768,4425,0.041540,79,-0.219746,-0.236435,G1,0.016689,24,...,mouse_P4,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
P4_TTTGTCATCCGTTGTC-1,P4,21692,4931,0.041398,76,-0.207196,-0.244800,G1,0.037604,16,...,mouse_P4,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
P4_TTTGTCATCGATGAGG-1,P4,3680,1897,0.057880,67,-0.185044,-0.185335,G1,0.000291,16,...,mouse_P4,NCBITaxon_10090,PATO_0000461,normal,UBERON_0008930,somatosensory cortex,EFO_0009899,10X 3' v2 sequencing,mixed,Mus musculus
