In [2]:
# Install necessary packages (if not already installed)
# Uncomment the next line to install
# !pip install anndata numpy scipy

import anndata as ad
import numpy as np
import scipy.sparse as sp

In [12]:
# Read the .h5ad file into an AnnData object.
input_file = "Z:/dmclab/Marta/PD/Combined_snRNAseq/data/seurat_obj_forcellxegene.h5ad"
adata = ad.read_h5ad(input_file)

# List the metadata column names on both axes: adata.obs (rows) first,
# then adata.var (columns).
for heading, annotations in (
    ("Observation-level metadata (adata.obs):", adata.obs),
    ("\nVariable-level metadata (adata.var):", adata.var),
):
    print(heading)
    print(annotations.columns)

Observation-level metadata (adata.obs):
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample_group', 'S.Score',
       'G2M.Score', 'Phase', 'Experiment_ID', 'percent_mt', 'log10GenesPerUMI',
       'pANN_0.25_0.29_1421', 'DF.classifications_0.25_0.29_1421',
       'nCount_SCT', 'nFeature_SCT', 'pANN', 'pANN_0.25_0.3_3439',
       'DF.classifications_0.25_0.3_3439', 'pANN_0.25_0.005_3680',
       'DF.classifications_0.25_0.005_3680', 'pANN_0.25_0.29_1212',
       'DF.classifications_0.25_0.29_1212', 'pANN_0.25_0.3_1151',
       'DF.classifications_0.25_0.3_1151', 'pANN_0.25_0.22_266',
       'DF.classifications_0.25_0.22_266', 'pANN_0.25_0.3_280',
       'DF.classifications_0.25_0.3_280', 'pANN_0.25_0.17_218',
       'DF.classifications_0.25_0.17_218', 'pANN_0.25_0.01_60',
       'DF.classifications_0.25_0.01_60', 'pANN_0.25_0.23_189',
       'DF.classifications_0.25_0.23_189', 'pANN_0.25_0.3_302',
       'DF.classifications_0.25_0.3_302', 'pANN_0.25_0.19_201',
       'DF.classif

In [13]:
# 1. Keep the main matrix in single precision.
# A float64 matrix is recast to float32, which halves the bytes per stored value.
# Any other dtype is left untouched and nothing is printed.
matrix = adata.X
if matrix.dtype == np.float64:
    adata.X = matrix.astype(np.float32)
    print("Conversion to float32 done.")

Conversion to float32 done.


In [14]:
# 2. Downcast 64-bit integer metadata columns (obs and var) to 32-bit.
# This halves the memory used by those columns. A column is only downcast when
# every value fits in int32, so an out-of-range value can never be corrupted by
# the narrower type; columns that do not fit stay int64 and are reported.
int32_limits = np.iinfo(np.int32)
for frame_name in ("obs", "var"):
    int64_columns = getattr(adata, frame_name).select_dtypes(include=[np.int64]).columns
    for col in int64_columns:
        # Look the frame up on adata each time rather than holding one
        # reference across several column assignments.
        frame = getattr(adata, frame_name)
        values = frame[col]
        # An empty column trivially fits (min()/max() are not meaningful there).
        fits_int32 = values.empty or (
            values.min() >= int32_limits.min and values.max() <= int32_limits.max
        )
        if fits_int32:
            frame[col] = values.astype(np.int32)
        else:
            print(f"Left adata.{frame_name}['{col}'] as int64: values exceed the int32 range.")

In [15]:
# 3. Turn object-dtype metadata columns into pandas categoricals.
# Applied to adata.obs first, then adata.var; columns of any other dtype are
# not touched.
for axis_name in ("obs", "var"):
    text_columns = getattr(adata, axis_name).select_dtypes(include=[object]).columns
    for column in text_columns:
        table = getattr(adata, axis_name)
        table[column] = table[column].astype('category')

In [16]:
# 5. Ensure X and raw.X are stored as sparse matrices if >50% zeros
# This saves space by efficiently storing sparse data


def _is_mostly_zero_dense(matrix, threshold=0.5):
    """Return True if `matrix` is a dense ndarray whose zero fraction exceeds `threshold`.

    Anything that is not a NumPy ndarray (for example an already-sparse matrix)
    and any empty array yields False, so callers leave those untouched.
    """
    if not isinstance(matrix, np.ndarray) or matrix.size == 0:
        return False
    # count_nonzero avoids allocating a full-size boolean array for `matrix == 0`.
    zero_count = matrix.size - np.count_nonzero(matrix)
    return zero_count / matrix.size > threshold


# Check and convert X
if _is_mostly_zero_dense(adata.X):
    adata.X = sp.csr_matrix(adata.X)

# Check and convert raw.X if raw is present
if adata.raw is not None and _is_mostly_zero_dense(adata.raw.X):
    # Raw.X is a read-only property in anndata, so it cannot be assigned directly.
    # Rebuild raw from an AnnData copy whose X has been converted instead.
    raw_adata = adata.raw.to_adata()
    raw_adata.X = sp.csr_matrix(raw_adata.X)
    adata.raw = raw_adata
    del raw_adata  # drop the temporary copy so it does not linger in the kernel

In [None]:
# 6. Write the slimmed-down object back to disk as a gzip-compressed .h5ad.
# NOTE(review): "reeduced" in the file name looks like a typo for "reduced";
# it is left unchanged here in case something else already refers to this path.
output_file = "Z:/dmclab/Marta/PD/Combined_snRNAseq/data/seurat_obj_forcellxegene_reeduced.h5ad"
adata.write_h5ad(filename=output_file, compression='gzip')

print("Reduced file saved at:", output_file)

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt

# Plot the stored 'umap_after_harmony' embedding, if the object carries one.
if 'umap_after_harmony' not in adata.obsm:
    print("UMAP coordinates 'umap_after_harmony' not found.")
else:
    # Expose the coordinates under 'X_umap' as well (the original key is kept);
    # that is the obsm key scanpy's UMAP plot reads.
    adata.obsm['X_umap'] = adata.obsm['umap_after_harmony']
    print("UMAP coordinates found and renamed for plotting.")

    # Colour the points by a metadata column; swap 'cell_type' for any other
    # column you want to inspect.
    sc.pl.umap(adata, color='cell_type')
    plt.show()



UMAP not found. Calculating UMAP...
