In [4]:
# Import required libraries
import loompy
import numpy as np
import pandas as pd
import gc  # for garbage collection
import os

# Switch the process working directory to the project folder so that any
# relative paths used later in the notebook resolve against it.
project_dir = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3_dev/Mousebrain'
os.chdir(project_dir)


In [5]:
# Record the NumPy version this notebook was executed with.
print(f"{np.__version__}")

1.24.3


In [6]:
# Absolute path of the loom file inspected by the cells below.
loom_path = (
    "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3_dev/Mousebrain"
    "/DATA/dev_all.loom"
)

In [7]:
# Sanity-check the input before opening it: report whether the path exists
# and how large the file is on disk (in GiB, i.e. bytes / 1024**3).
loom_found = os.path.exists(loom_path)
print(f"File exists: {loom_found}")
loom_size_gb = os.path.getsize(loom_path) / (1024**3)
print(f"File size: {loom_size_gb:.2f} GB")

File exists: True
File size: 9.56 GB


In [8]:
# Connect to the loom file (this doesn't load it entirely into memory).
# The file is opened read-only: loompy.connect defaults to mode='r+', which
# opens the HDF5 file writable. This cell only inspects the data, so mode="r"
# rules out accidental modification of the shared file.
with loompy.connect(loom_path, mode="r") as ds:
    # Basic information about the dataset; ds.shape is (genes, cells)
    n_genes, n_cells = ds.shape
    print("\nDataset shape:", ds.shape)
    print(f"Number of cells: {n_cells:,}")
    print(f"Number of genes: {n_genes:,}")

    # Examine column attributes (cell metadata): one line per attribute with its dtype
    print("\nColumn attributes (cell metadata):")
    for attr in ds.ca.keys():
        print(f"- {attr}: {ds.ca[attr].dtype}")

    # Examine row attributes (gene metadata): one line per attribute with its dtype
    print("\nRow attributes (gene metadata):")
    for attr in ds.ra.keys():
        print(f"- {attr}: {ds.ra[attr].dtype}")

    # Get a small sample of the expression matrix (first 5 genes, first 5 cells)
    print("\nSample of expression matrix (5x5):")
    sample_matrix = ds[:5, :5]
    print(sample_matrix)

    # NOTE: the statistics below are computed on the 5x5 sample only, not on
    # the full matrix, so they are a quick smoke test rather than a summary
    # of the whole dataset.
    print("\nBasic statistics:")
    print(f"Mean expression: {np.mean(sample_matrix):.4f}")
    print(f"Median expression: {np.median(sample_matrix):.4f}")
    # Fraction of entries in the sample that are exactly zero
    print(f"Sparsity: {(sample_matrix == 0).sum() / sample_matrix.size:.2%}")

# Force garbage collection now that the connection is closed; as the last
# expression of the cell, its return value is shown as the cell output.
gc.collect()


Dataset shape: (31053, 292495)
Number of cells: 292,495
Number of genes: 31,053

Column attributes (cell metadata):
- Age: object
- BTSNE: float64
- CellCycle: float64
- CellID: object
- Cell_Conc: int64
- Chemistry: object
- ChipID: object
- Class: object
- ClusterName: object
- Clusters: int64
- Date_Captured: object
- DonorID: object
- DoubletFinderPCA: float64
- HPF: float64
- HPF_LogPP: float64
- HPF_theta: float64
- IsCycling: uint8
- Label: object
- Location_E9_E11: object
- NCellsCluster: int64
- NGenes: float64
- Num_Pooled_Animals: int64
- PCA: float64
- PCR_Cycles: int64
- Plug_Date: object
- Project: object
- PseudoAge: float64
- PseudoTissue: object
- Region: object
- SampleID: object
- SampleName: object
- Sample_Index: object
- Sex: object
- Species: object
- Split: int64
- Strain: object
- Subclass: object
- TSNE: float64
- Target_Num_Cells: float64
- Tissue: object
- TotalUMI: float64
- Transcriptome: object
- UMAP: float32
- UMAP3D: float32
- cDNA_Lib_Ok: object
- ng

474