In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.sparse import issparse

In [None]:
from anndata import read_h5ad

# Replace this with the full path and file name of your dataset.
h5ad_path = "mouse_diabetes.h5ad"

# backed='r' opens the file read-only in backed mode; later cells call
# .to_memory() on subsets to load only the cells they need.
adata = read_h5ad(h5ad_path, backed='r')

## For creating files by cell type

In [None]:
# Collect every distinct label found in the 'cell_type' observation column
# as a plain Python list, then show it.
cell_type_labels = adata.obs['cell_type']
cell_types = cell_type_labels.unique().tolist()
print(f"Found cell types: {cell_types}")

In [None]:
# Write one .h5ad file per cell type.
# The loop header had been commented out, which left the indented body
# orphaned and made this cell fail with an IndentationError on a fresh run.
for cell_type in cell_types:
    print(f"Processing cell type: {cell_type}")

    # Observation names (obs index) of the cells annotated with this cell type
    indices = adata.obs[adata.obs['cell_type'] == cell_type].index

    # Load only this subset into memory (adata is opened in backed mode)
    adata_subset = adata[indices, :].to_memory()

    # Save as a separate .h5ad file named after the cell type.
    # NOTE(review): a label containing a path separator would produce an
    # unexpected path - confirm the cell type labels are filename-safe.
    filename = f"{cell_type}_subset.h5ad"
    adata_subset.write(filename)
    print(f"Saved {filename}")

## For creating files by disease AND cell type

In [None]:
# Collect every distinct label found in the 'disease' observation column
# as a plain Python list, then show it.
disease_labels = adata.obs['disease']
diseases = disease_labels.unique().tolist()
print(f"Diseases: {diseases}")

In [None]:
# Write one .h5ad file per (cell type, disease) combination.
# The outer loop header had been commented out, which left the nested loop at
# an unexpected indent (IndentationError) and `cell_type` undefined.
for cell_type in cell_types:
    # Loop-invariant across diseases: which cells carry this cell type
    is_cell_type = adata.obs['cell_type'] == cell_type

    for disease in diseases:
        print(f"Processing {cell_type} in {disease}")

        # Observation names (obs index) of the cells matching both labels
        mask = is_cell_type & (adata.obs['disease'] == disease)
        indices = adata.obs[mask].index

        # Skip combinations that do not occur so no empty files are written
        if len(indices) == 0:
            print(f"⚠️ No cells found for {cell_type} in {disease}")
            continue

        # Load only this subset into memory (adata is opened in backed mode)
        adata_subset = adata[indices, :].to_memory()

        # Save to a file named after the cell type and disease.
        # NOTE(review): labels containing a path separator would produce an
        # unexpected path - confirm the labels are filename-safe.
        filename = f"{cell_type}_{disease}_subset.h5ad"
        adata_subset.write(filename)
        print(f"✅ Saved {filename} with {adata_subset.n_obs} cells")