In [1]:
import scanpy as sc

In [None]:
import scanpy as sc

def subsample_anndata_by_category(adata, category_column, max_cells_per_category,
                                  random_state=42):
    """
    Subsample an AnnData object by retaining at most a user-defined number
    of cells for each category in the specified obs column.

    Parameters
    ----------
    adata : AnnData
        Object whose ``obs`` table is grouped by ``category_column``. The
        subset is materialised with ``.copy()`` (no filename), so the object
        is expected to be loaded in memory rather than backed.
    category_column : str
        Key of ``adata.obs`` that defines the categories.
    max_cells_per_category : int
        Upper bound on the number of cells kept per category. Categories
        with at most this many cells are kept in full.
    random_state : int, optional
        Seed handed to ``DataFrame.sample`` for every category that has to
        be down-sampled. The default (42) reproduces the selection made by
        earlier versions of this function.

    Returns
    -------
    AnnData
        A copy of ``adata`` restricted to the selected cells. Cells are
        ordered category by category (in ``groupby`` order), not in their
        original order.

    Raises
    ------
    ValueError
        If ``max_cells_per_category`` is negative.

    Notes
    -----
    Cells are selected by their ``obs`` index labels. Rows whose category
    value is missing do not belong to any group (pandas ``groupby`` drops
    missing keys by default) and are therefore not retained.
    """
    if max_cells_per_category < 0:
        raise ValueError(
            "max_cells_per_category must be non-negative, got "
            f"{max_cells_per_category!r}"
        )

    cells_to_keep = []
    # observed=True skips categories that are declared on a categorical
    # column but have no cells; it contributes nothing to the result and
    # avoids the pandas FutureWarning about the changing `observed` default.
    for _, group in adata.obs.groupby(category_column, observed=True):
        if len(group) > max_cells_per_category:
            # Too many cells in this category: draw a reproducible sample.
            group = group.sample(n=max_cells_per_category, random_state=random_state)
        cells_to_keep.extend(group.index)

    # Subset by cell label and detach the result from the parent object.
    return adata[cells_to_keep, :].copy()


if __name__ == "__main__":
    # Pipeline: filter the full dataset on disk -> reload the filtered subset
    # in memory -> cap every category at a fixed number of cells -> save.

    # Load dataset in backed mode (read-only, minimal memory usage).
    adata_backed = sc.read_h5ad(
        "E:/All_users/Giacomo/straital_graft/data/Adult_Brain_Neurons.h5ad",
        backed='r'
    )

    try:
        # Create a single combined mask for all filtering criteria.
        values_to_remove = ['leukocyte']
        values_to_retain = ['29-year-old stage']

        mask = (
            (~adata_backed.obs['cell_type'].isin(values_to_remove)) &
            (adata_backed.obs['development_stage'].isin(values_to_retain))
        )

        # Write the filtered subset to a NEW .h5ad file on disk.
        subset_filename = "E:/All_users/Giacomo/straital_graft/data/Adult_Brain_Neurons_FILTERED.h5ad"
        filtered_backed = adata_backed[mask, :].copy(filename=subset_filename)

        # Copying a backed object returns another backed object pointing at
        # the new file; release that handle before the file is read again.
        if filtered_backed.isbacked:
            filtered_backed.file.close()
    finally:
        # Always release the handle on the source file, even if filtering or
        # writing failed. `adata_backed` is not usable for X access after this.
        adata_backed.file.close()

    # Read that filtered subset back in memory
    adata_filtered = sc.read_h5ad(subset_filename)

    # Perform subsampling (in memory)
    category_column = 'supercluster_term'  # or another column
    max_cells_per_category = 8000
    adata_subsampled = subsample_anndata_by_category(
        adata_filtered, category_column, max_cells_per_category
    )

    # Save your final subsampled dataset
    subsampled_filename = "E:/All_users/Giacomo/straital_graft/data/Adult_Brain_Neurons_29yo.h5ad"
    adata_subsampled.write(subsampled_filename)
    print(f"Subsampling complete and saved as '{subsampled_filename}'")

In [3]:
# Reload the subsampled dataset written by the pipeline cell, fully in memory.
adata = sc.read_h5ad(
    "E:/All_users/Giacomo/straital_graft/data/Adult_Brain_Neurons_29yo.h5ad"
)

In [4]:
# Display the AnnData summary (dimensions and the available obs/var/uns keys).
adata

AnnData object with n_obs × n_vars = 152164 × 59236
    obs: 'ROIGroup', 'ROIGroupCoarse', 'ROIGroupFine', 'roi', 'organism_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'suspension_type', 'dissection', 'fraction_mitochondrial', 'fraction_unspliced', 'cell_cycle_score', 'total_genes', 'total_UMIs', 'sample_id', 'supercluster_term', 'cluster_id', 'subcluster_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'Biotype', 'Chromosome', 'End', 'Gene', 'Start', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'batch_condition', 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X

In [5]:
# Sanity check: number of cells per supercluster in the subsampled dataset.
adata.obs["supercluster_term"].value_counts()

supercluster_term
Amygdala excitatory                  8000
Hippocampal dentate gyrus            8000
Upper rhombic lip                    8000
Thalamic excitatory                  8000
Splatter                             8000
Midbrain-derived inhibitory          8000
Medium spiny neuron                  8000
MGE interneuron                      8000
CGE interneuron                      8000
LAMP5-LHX6 and Chandelier            8000
Hippocampal CA1-3                    8000
Eccentric medium spiny neuron        8000
Deep-layer intratelencephalic        8000
Deep-layer corticothalamic and 6b    8000
Upper-layer intratelencephalic       8000
Lower rhombic lip                    6917
Miscellaneous                        6687
Deep-layer near-projecting           5563
Hippocampal CA4                      5374
Cerebellar inhibitory                4968
Mammillary body                      2655
Name: count, dtype: int64

In [6]:
# Sanity check: which development stages are present in the subsampled dataset.
adata.obs["development_stage"].value_counts()

development_stage
29-year-old stage    152164
Name: count, dtype: int64