In [1]:
import cellxgene_census
import numpy as np
import tiledbsoma as soma

In [2]:


# Open Census and get mouse data
with cellxgene_census.open_soma() as census:
    mouse = census["census_data"]["mus_musculus"]
    
    # Define your query filters here
    with mouse.axis_query(
        measurement_name="RNA",
        obs_query=soma.AxisQuery(value_filter="is_primary_data==True"),
        # Add any other filters you need
    ) as query:
        # Get variable information
        var_df = query.var().concat().to_pandas()
        n_vars = len(var_df)
        n_obs = query.n_obs
        
        # Initialize accumulators for per-gene means
        gene_sum = np.zeros((n_vars,), dtype=np.float64)
        gene_count = np.zeros((n_vars,), dtype=np.int64)
        
        # Get indexer to map soma_joinid to positional indices
        indexer = query.indexer
        
        # Stream through X data in batches
        for chunk_idx, arrow_tbl in enumerate(query.X("raw").tables()):
            print(f"Processing chunk {chunk_idx + 1}...")
            
            # Get positional indices for genes (var dimension)
            var_pos = indexer.by_var(arrow_tbl["soma_dim_1"])
            # Get the data values
            data = arrow_tbl["soma_data"].to_numpy()
            
            # Accumulate sums and counts per gene
            np.add.at(gene_sum, var_pos, data)
            np.add.at(gene_count, var_pos, 1)
            
            print(f"  Chunk {chunk_idx + 1} complete: {len(data)} values processed")
        
        # Compute final means
        gene_mean = np.divide(
            gene_sum, 
            n_obs,
            where=(gene_count > 0),
            out=np.zeros_like(gene_sum)
        )
        
        print(f"Processed {n_obs} cells across {n_vars} genes")
        print(f"Gene means computed: {gene_mean[:10]}")  # Show first 10

The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.


Processing chunk 1...
  Chunk 1 complete: 134217728 values processed
Processing chunk 2...
  Chunk 2 complete: 134217728 values processed
Processing chunk 3...
  Chunk 3 complete: 134217728 values processed
Processing chunk 4...
  Chunk 4 complete: 134217728 values processed
Processing chunk 5...
  Chunk 5 complete: 134217728 values processed
Processing chunk 6...
  Chunk 6 complete: 134217728 values processed
Processing chunk 7...
  Chunk 7 complete: 134217728 values processed
Processing chunk 8...
  Chunk 8 complete: 134217728 values processed
Processing chunk 9...
  Chunk 9 complete: 134217728 values processed
Processing chunk 10...
  Chunk 10 complete: 134217728 values processed
Processing chunk 11...
  Chunk 11 complete: 134217728 values processed
Processing chunk 12...
  Chunk 12 complete: 134217728 values processed
Processing chunk 13...
  Chunk 13 complete: 134217728 values processed
Processing chunk 14...
  Chunk 14 complete: 134217728 values processed
Processing chunk 15...
 

KeyboardInterrupt: 

In [30]:
test = census["census_data"]["mus_musculus"].obs['raw_mean_nnz']
test

TypeError: 'DataFrame' object is not subscriptable

In [12]:
import cellxgene_census
census = cellxgene_census.open_soma()
cell_meta_data = cellxgene_census.get_obs(
    census, "mus_musculus", column_names=["cell_type"]
)
cell_meta_data = cell_meta_data['cell_type']



The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.


In [15]:
import pandas as pd
pd.set_option('display.max_rows', None)
set(cell_meta_data)

{'B cell',
 'B cell zone reticular cell',
 'Bergmann glial cell',
 'CD103-positive dendritic cell',
 'CD141-positive myeloid dendritic cell',
 'CD1c-positive myeloid dendritic cell',
 'CD4-positive, alpha-beta T cell',
 'CD4-positive, alpha-beta memory T cell',
 'CD4-positive, alpha-beta thymocyte',
 'CD8-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta thymocyte',
 'CD8_alpha-positive CD11b-negative dendritic cell',
 'Cajal-Retzius cell',
 'DN3 thymocyte',
 'DN4 thymocyte',
 'GABAergic amacrine cell',
 'GABAergic neuron',
 'IgM plasmablast',
 'Kupffer cell',
 'L2/3 intratelencephalic projecting glutamatergic neuron',
 'L2/3 intratelencephalic projecting glutamatergic neuron of the primary motor cortex',
 'L2/3-6 intratelencephalic projecting glutamatergic neuron',
 'L4/5 intratelencephalic projecting glutamatergic neuron',
 'L4/5 intratelencephalic projecting glutamatergic neuron of the primary motor cortex',
 'L5 extratelencephalic projecting glutamatergic cortical neuron',
 