this scirpt is written to see what i have inside the h5ad files so that i understadn th estrcture and dont work with wrong data

In [5]:
"""
Fixed Deep H5AD Inspector
==========================
Works with any h5ad structure
"""

import scanpy as sc
import numpy as np
from pathlib import Path

h5ad_file = '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_normal_epithelial_improved.h5ad'

print(f"Loading: {Path(h5ad_file).name}\n")

# Use scanpy - it handles the HDF5 structure properly
adata = sc.read_h5ad(h5ad_file)
adata.obs_names_make_unique()

# ============================================================================
# 1. EXPRESSION MATRIX
# ============================================================================

print("="*80)
print("1. GENE EXPRESSION MATRIX (X)")
print("="*80)
print(f"  Matrix shape: {adata.X.shape} (cells × genes)")
print(f"  Matrix type: {type(adata.X)}")
print(f"  Data dtype: {adata.X.dtype}")
print(f"  Sparsity: {1 - adata.X.nnz / (adata.n_obs * adata.n_vars):.1%}")

# Show sample of actual values
if hasattr(adata.X, 'toarray'):
    sample_data = adata.X[:5, :5].toarray()
else:
    sample_data = adata.X[:5, :5]

print(f"\n  First 5 cells × first 5 genes (actual expression values):")
print(f"  {sample_data}")

# ============================================================================
# 2. CELL METADATA (observations)
# ============================================================================

print(f"\n{'='*80}")
print("2. CELL METADATA (.obs)")
print("="*80)

print(f"  Total cells: {adata.n_obs:,}")
print(f"\n  Cell metadata columns available:")
for col in adata.obs.columns:
    print(f"    • {col}")

print(f"\n  FIRST 10 CELLS - ALL METADATA:")
print(adata.obs.head(10))

# ============================================================================
# 3. CELL TYPES - ACTUAL DATA
# ============================================================================

if 'cell_type' in adata.obs.columns:
    print(f"\n{'='*80}")
    print("3. CELL TYPES - ACTUAL VALUES")
    print("="*80)
    
    print(f"\n  First 20 cell type values:")
    for i, ctype in enumerate(adata.obs['cell_type'].head(20), 1):
        print(f"    {i:2d}. {ctype}")
    
    print(f"\n  Unique cell types and counts:")
    cell_type_counts = adata.obs['cell_type'].value_counts()
    for ctype, count in cell_type_counts.items():
        pct = 100 * count / adata.n_obs
        print(f"    • {ctype:30} {count:>7,} cells ({pct:>5.1f}%)")

# ============================================================================
# 4. SAMPLE NAMES - ACTUAL DATA
# ============================================================================

if 'sample_name' in adata.obs.columns:
    print(f"\n{'='*80}")
    print("4. SAMPLE NAMES - ACTUAL VALUES")
    print("="*80)
    
    print(f"\n  First 20 sample name values:")
    for i, sname in enumerate(adata.obs['sample_name'].head(20), 1):
        print(f"    {i:2d}. {sname}")
    
    print(f"\n  Unique samples and cell counts:")
    sample_counts = adata.obs['sample_name'].value_counts()
    for sample, count in sample_counts.items():
        print(f"    • {sample:30} {count:>7,} cells")

# ============================================================================
# 5. SAMPLE TYPE - ACTUAL DATA
# ============================================================================

if 'sample_type' in adata.obs.columns:
    print(f"\n{'='*80}")
    print("5. SAMPLE TYPES - ACTUAL VALUES")
    print("="*80)
    
    print(f"\n  First 20 sample type values:")
    for i, stype in enumerate(adata.obs['sample_type'].head(20), 1):
        print(f"    {i:2d}. {stype}")
    
    print(f"\n  Unique sample types and counts:")
    sample_type_counts = adata.obs['sample_type'].value_counts()
    for stype, count in sample_type_counts.items():
        pct = 100 * count / adata.n_obs
        print(f"    • {stype:30} {count:>7,} cells ({pct:>5.1f}%)")

# ============================================================================
# 6. MOLECULAR SUBTYPE - ACTUAL DATA
# ============================================================================

if 'molecular_subtype' in adata.obs.columns:
    print(f"\n{'='*80}")
    print("6. MOLECULAR SUBTYPES - ACTUAL VALUES")
    print("="*80)
    
    print(f"\n  First 20 molecular subtype values:")
    for i, mtype in enumerate(adata.obs['molecular_subtype'].head(20), 1):
        print(f"    {i:2d}. {mtype}")
    
    print(f"\n  Unique molecular subtypes and counts:")
    mol_type_counts = adata.obs['molecular_subtype'].value_counts()
    for mtype, count in mol_type_counts.items():
        pct = 100 * count / adata.n_obs
        if count > 0:
            print(f"    • {mtype:30} {count:>7,} cells ({pct:>5.1f}%)")

# ============================================================================
# 7. SCORES - ACTUAL DATA
# ============================================================================

if 'epithelial_score' in adata.obs.columns:
    print(f"\n{'='*80}")
    print("7. EPITHELIAL SCORES - ACTUAL VALUES")
    print("="*80)
    
    print(f"\n  First 20 epithelial score values:")
    for i, score in enumerate(adata.obs['epithelial_score'].head(20), 1):
        print(f"    {i:2d}. {score:.4f}")
    
    print(f"\n  Statistics:")
    print(f"    Min: {adata.obs['epithelial_score'].min():.4f}")
    print(f"    Max: {adata.obs['epithelial_score'].max():.4f}")
    print(f"    Mean: {adata.obs['epithelial_score'].mean():.4f}")
    print(f"    Median: {adata.obs['epithelial_score'].median():.4f}")

if 'immune_score' in adata.obs.columns:
    print(f"\n  IMMUNE SCORES - ACTUAL VALUES")
    
    print(f"\n  First 20 immune score values:")
    for i, score in enumerate(adata.obs['immune_score'].head(20), 1):
        print(f"    {i:2d}. {score:.4f}")
    
    print(f"\n  Statistics:")
    print(f"    Min: {adata.obs['immune_score'].min():.4f}")
    print(f"    Max: {adata.obs['immune_score'].max():.4f}")
    print(f"    Mean: {adata.obs['immune_score'].mean():.4f}")
    print(f"    Median: {adata.obs['immune_score'].median():.4f}")

# ============================================================================
# 8. GENE NAMES - ACTUAL DATA
# ============================================================================

print(f"\n{'='*80}")
print("8. GENE NAMES (.var) - ACTUAL VALUES")
print("="*80)

print(f"  Total genes: {adata.n_vars:,}")
print(f"\n  First 30 gene names:")
for i, gene in enumerate(adata.var_names[:30], 1):
    print(f"    {i:2d}. {gene}")

# ============================================================================
# 9. COMPLETE SUMMARY
# ============================================================================

print(f"\n{'='*80}")
print("9. COMPLETE DATASET SUMMARY")
print("="*80)

print(f"\n  DIMENSIONS:")
print(f"    Cells (n_obs): {adata.n_obs:,}")
print(f"    Genes (n_vars): {adata.n_vars:,}")

print(f"\n  CELL METADATA AVAILABLE:")
for col in adata.obs.columns:
    print(f"    ✓ {col}")

print(f"\n  SAMPLES:")
if 'sample_name' in adata.obs.columns:
    print(f"    Total unique samples: {adata.obs['sample_name'].nunique()}")

print(f"\n  DATA QUALITY:")
if hasattr(adata.X, 'toarray'):
    umis = adata.X.toarray().sum(axis=1)
else:
    umis = np.asarray(adata.X.sum(axis=1)).flatten()

genes_per_cell = (adata.X > 0).sum(axis=1)
if hasattr(genes_per_cell, 'A1'):
    genes_per_cell = genes_per_cell.A1
genes_per_cell = np.asarray(genes_per_cell).flatten()

print(f"    UMIs/cell: {umis.mean():.1f} ± {umis.std():.1f}")
print(f"    Genes/cell: {genes_per_cell.mean():.1f} ± {genes_per_cell.std():.1f}")

print(f"\n  STATUS:")
print(f"    ✓ All {adata.n_obs:,} cells are epithelial")
print(f"    ✓ Ready for gene filtering")
print(f"    ✓ Ready for co-expression analysis")

print(f"\n{'='*80}\n")

Loading: adata_normal_epithelial_improved.h5ad



  utils.warn_names_duplicates("obs")


1. GENE EXPRESSION MATRIX (X)
  Matrix shape: (83522, 33514) (cells × genes)
  Matrix type: <class 'scipy.sparse._csr.csr_matrix'>
  Data dtype: int64
  Sparsity: 94.0%

  First 5 cells × first 5 genes (actual expression values):
  [[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]

2. CELL METADATA (.obs)
  Total cells: 83,522

  Cell metadata columns available:
    • barcode
    • sample_name
    • sample_type
    • geo_id
    • cell_type
    • epithelial_score
    • immune_score
    • molecular_subtype

  FIRST 10 CELLS - ALL METADATA:
               barcode   sample_name sample_type      geo_id   cell_type  \
0   AAACCTGAGACTAGGC-1  N-0092-total      Normal  GSM4909253  Epithelial   
1   AAACCTGAGGACAGAA-1  N-0092-total      Normal  GSM4909253  Epithelial   
2   AAACCTGAGGATGTAT-1  N-0092-total      Normal  GSM4909253  Epithelial   
3   AAACCTGCAAGAGGCT-1  N-0092-total      Normal  GSM4909253  Epithelial   
7   AAACGGGAGATGGCGT-1  N-0092-total      Normal  GSM4909253

In [6]:
"""
Loop Inspector: Check All H5AD Files
=====================================
Inspect all 7 datasets in one go
"""

import scanpy as sc
import numpy as np
from pathlib import Path

# List of all datasets
h5ad_files = [
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_normal_epithelial_improved.h5ad',
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_er_positive_epithelial_improved.h5ad',
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_her2_positive_epithelial_improved.h5ad',
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_triplenegative_epithelial_improved.h5ad',
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_triplenegative_brca1_epithelial_improved.h5ad',
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_brca1_preneoplastic_epithelial_improved.h5ad',
    '/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_test_annotated_checkpoint.h5ad',
]

# ============================================================================
# LOOP THROUGH ALL FILES
# ============================================================================

summary_data = []

for h5ad_file in h5ad_files:
    filepath = Path(h5ad_file)
    
    # Check if file exists
    if not filepath.exists():
        print(f"  NOT FOUND: {filepath.name}\n")
        continue
    
    print(f"\n{'='*80}")
    print(f"FILE: {filepath.name}")
    print(f"{'='*80}")
    
    try:
        # Load file
        adata = sc.read_h5ad(h5ad_file)
        adata.obs_names_make_unique()
        
        # ====================================================================
        # 1. BASIC INFO
        # ====================================================================
        
        print(f"\n1. DIMENSIONS")
        print(f"  Cells: {adata.n_obs:,}")
        print(f"  Genes: {adata.n_vars:,}")
        print(f"  Sparsity: {1 - adata.X.nnz / (adata.n_obs * adata.n_vars):.1%}")
        
        # ====================================================================
        # 2. CELL TYPES
        # ====================================================================
        
        if 'cell_type' in adata.obs.columns:
            print(f"\n2. CELL TYPES")
            cell_types = adata.obs['cell_type'].value_counts()
            for ctype, count in cell_types.items():
                pct = 100 * count / adata.n_obs
                print(f"  • {ctype:25} {count:>7,} cells ({pct:>5.1f}%)")
        
        # ====================================================================
        # 3. SAMPLES
        # ====================================================================
        
        if 'sample_name' in adata.obs.columns:
            n_samples = adata.obs['sample_name'].nunique()
            print(f"\n3. SAMPLES")
            print(f"  Unique samples: {n_samples}")
            sample_counts = adata.obs['sample_name'].value_counts()
            print(f"  Top 5 samples by cell count:")
            for sample, count in sample_counts.head(5).items():
                print(f"    • {sample:30} {count:>7,} cells")
        
        # ====================================================================
        # 4. SAMPLE TYPE
        # ====================================================================
        
        if 'sample_type' in adata.obs.columns:
            print(f"\n4. SAMPLE TYPE")
            sample_types = adata.obs['sample_type'].value_counts()
            for stype, count in sample_types.items():
                pct = 100 * count / adata.n_obs
                print(f"  • {stype:30} {count:>7,} cells ({pct:>5.1f}%)")
        
        # ====================================================================
        # 5. MOLECULAR SUBTYPES
        # ====================================================================
        
        if 'molecular_subtype' in adata.obs.columns:
            print(f"\n5. MOLECULAR SUBTYPES")
            mol_types = adata.obs['molecular_subtype'].value_counts()
            for mtype, count in mol_types.items():
                pct = 100 * count / adata.n_obs
                print(f"  • {mtype:25} {count:>7,} cells ({pct:>5.1f}%)")
        
        # ====================================================================
        # 6. METADATA COLUMNS
        # ====================================================================
        
        print(f"\n6. METADATA COLUMNS AVAILABLE")
        for col in adata.obs.columns:
            print(f"  ✓ {col}")
        
        # ====================================================================
        # 7. DATA QUALITY
        # ====================================================================
        
        if hasattr(adata.X, 'toarray'):
            umis = adata.X.toarray().sum(axis=1)
        else:
            umis = np.asarray(adata.X.sum(axis=1)).flatten()
        
        genes_per_cell = (adata.X > 0).sum(axis=1)
        if hasattr(genes_per_cell, 'A1'):
            genes_per_cell = genes_per_cell.A1
        genes_per_cell = np.asarray(genes_per_cell).flatten()
        
        print(f"\n7. DATA QUALITY")
        print(f"  UMIs/cell (mean ± std): {umis.mean():.1f} ± {umis.std():.1f}")
        print(f"  Genes/cell (mean ± std): {genes_per_cell.mean():.1f} ± {genes_per_cell.std():.1f}")
        
        # Store summary
        summary_data.append({
            'File': filepath.name,
            'Cells': adata.n_obs,
            'Genes': adata.n_vars,
            'Samples': adata.obs['sample_name'].nunique() if 'sample_name' in adata.obs.columns else 0,
            'UMIs/cell': f"{umis.mean():.0f}",
            'Genes/cell': f"{genes_per_cell.mean():.0f}",
            'Status': '✓ OK'
        })
        
    except Exception as e:
        print(f"  ✗ ERROR: {str(e)}")
        summary_data.append({
            'File': filepath.name,
            'Cells': 'ERROR',
            'Genes': 'ERROR',
            'Samples': 'ERROR',
            'UMIs/cell': 'ERROR',
            'Genes/cell': 'ERROR',
            'Status': f'✗ {str(e)[:30]}'
        })

# ============================================================================
# SUMMARY TABLE
# ============================================================================

print(f"\n\n{'='*80}")
print("SUMMARY TABLE - ALL DATASETS")
print(f"{'='*80}\n")

import pandas as pd
summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

print(f"\n{'='*80}\n")


FILE: adata_normal_epithelial_improved.h5ad


  utils.warn_names_duplicates("obs")



1. DIMENSIONS
  Cells: 83,522
  Genes: 33,514
  Sparsity: 94.0%

2. CELL TYPES
  • Epithelial                 83,522 cells (100.0%)

3. SAMPLES
  Unique samples: 24
  Top 5 samples by cell count:
    • N-0372-epi                      11,336 cells
    • N-0093-epi                       8,852 cells
    • N-0342-epi                       7,148 cells
    • N-0275-epi                       5,917 cells
    • N-0342-total                     5,016 cells

4. SAMPLE TYPE
  • Normal                          83,522 cells (100.0%)

5. MOLECULAR SUBTYPES
  • Normal                     40,059 cells ( 48.0%)
  • Her2                       32,674 cells ( 39.1%)
  • Basal                       4,917 cells (  5.9%)
  • LumB                        3,260 cells (  3.9%)
  • LumA                        2,612 cells (  3.1%)

6. METADATA COLUMNS AVAILABLE
  ✓ barcode
  ✓ sample_name
  ✓ sample_type
  ✓ geo_id
  ✓ cell_type
  ✓ epithelial_score
  ✓ immune_score
  ✓ molecular_subtype

7. DATA QUALITY
  UMIs/ce

  utils.warn_names_duplicates("obs")



1. DIMENSIONS
  Cells: 91,908
  Genes: 33,514
  Sparsity: 95.9%

2. CELL TYPES
  • Epithelial                 91,908 cells (100.0%)

3. SAMPLES
  Unique samples: 27
  Top 5 samples by cell count:
    • mER-0178                        10,259 cells
    • ER-0056-LN                       6,852 cells
    • ER-0029-9C                       6,698 cells
    • ER-0167-T                        6,695 cells
    • ER-0163                          5,798 cells

4. SAMPLE TYPE
  • ER_Positive                     91,908 cells (100.0%)

5. MOLECULAR SUBTYPES
  • Basal                      30,539 cells ( 33.2%)
  • LumA                       22,327 cells ( 24.3%)
  • Her2                       15,606 cells ( 17.0%)
  • LumB                       14,802 cells ( 16.1%)
  • Normal                      8,634 cells (  9.4%)

6. METADATA COLUMNS AVAILABLE
  ✓ barcode
  ✓ sample_name
  ✓ sample_type
  ✓ geo_id
  ✓ cell_type
  ✓ epithelial_score
  ✓ immune_score
  ✓ molecular_subtype

7. DATA QUALITY
  UMIs/ce

  utils.warn_names_duplicates("obs")



1. DIMENSIONS
  Cells: 19,693
  Genes: 33,514
  Sparsity: 93.2%

2. CELL TYPES
  • Epithelial                 19,693 cells (100.0%)

3. SAMPLES
  Unique samples: 6
  Top 5 samples by cell count:
    • HER2-0176                        6,353 cells
    • HER2-0337                        4,691 cells
    • HER2-0161                        3,941 cells
    • HER2-0308                        3,578 cells
    • HER2-0031                          895 cells

4. SAMPLE TYPE
  • HER2_Positive                   19,693 cells (100.0%)

5. MOLECULAR SUBTYPES
  • Her2                       15,511 cells ( 78.8%)
  • Basal                       2,062 cells ( 10.5%)
  • LumA                          772 cells (  3.9%)
  • Normal                        717 cells (  3.6%)
  • LumB                          631 cells (  3.2%)

6. METADATA COLUMNS AVAILABLE
  ✓ barcode
  ✓ sample_name
  ✓ sample_type
  ✓ geo_id
  ✓ cell_type
  ✓ epithelial_score
  ✓ immune_score
  ✓ molecular_subtype

7. DATA QUALITY
  UMIs/cel

  utils.warn_names_duplicates("obs")



1. DIMENSIONS
  Cells: 7,561
  Genes: 33,514
  Sparsity: 93.5%

2. CELL TYPES
  • Epithelial                  7,561 cells (100.0%)

3. SAMPLES
  Unique samples: 4
  Top 5 samples by cell count:
    • TN-0135                          6,111 cells
    • TN-0126                          1,167 cells
    • TN-0114-T2                         227 cells
    • TN-0106                             56 cells

4. SAMPLE TYPE
  • TripleNegative                   7,561 cells (100.0%)

5. MOLECULAR SUBTYPES
  • Normal                      6,898 cells ( 91.2%)
  • Her2                          233 cells (  3.1%)
  • LumA                          167 cells (  2.2%)
  • LumB                          142 cells (  1.9%)
  • Basal                         121 cells (  1.6%)

6. METADATA COLUMNS AVAILABLE
  ✓ barcode
  ✓ sample_name
  ✓ sample_type
  ✓ geo_id
  ✓ cell_type
  ✓ epithelial_score
  ✓ immune_score
  ✓ molecular_subtype

7. DATA QUALITY
  UMIs/cell (mean ± std): 7570.7 ± 8563.8
  Genes/cell (mean ±

  utils.warn_names_duplicates("obs")



1. DIMENSIONS
  Cells: 14,186
  Genes: 33,514
  Sparsity: 92.8%

2. CELL TYPES
  • Epithelial                 14,186 cells (100.0%)

3. SAMPLES
  Unique samples: 4
  Top 5 samples by cell count:
    • TN-B1-0131                       5,982 cells
    • TN-B1-4031                       5,318 cells
    • TN-B1-0554                       1,566 cells
    • TN-B1-0177                       1,320 cells

4. SAMPLE TYPE
  • TripleNegative_BRCA1            14,186 cells (100.0%)

5. MOLECULAR SUBTYPES
  • Normal                      7,077 cells ( 49.9%)
  • Her2                        4,952 cells ( 34.9%)
  • LumA                          880 cells (  6.2%)
  • Basal                         811 cells (  5.7%)
  • LumB                          466 cells (  3.3%)

6. METADATA COLUMNS AVAILABLE
  ✓ barcode
  ✓ sample_name
  ✓ sample_type
  ✓ geo_id
  ✓ cell_type
  ✓ epithelial_score
  ✓ immune_score
  ✓ molecular_subtype

7. DATA QUALITY
  UMIs/cell (mean ± std): 12324.6 ± 10090.5
  Genes/cell (mea

  utils.warn_names_duplicates("obs")



1. DIMENSIONS
  Cells: 7,644
  Genes: 33,514
  Sparsity: 95.5%

2. CELL TYPES
  • Epithelial                  7,644 cells (100.0%)

3. SAMPLES
  Unique samples: 4
  Top 5 samples by cell count:
    • B1-0894                          3,513 cells
    • B1-0023                          2,096 cells
    • B1-0033                          1,846 cells
    • B1-0090                            189 cells

4. SAMPLE TYPE
  • BRCA1_PreNeoplastic              7,644 cells (100.0%)

5. MOLECULAR SUBTYPES
  • Normal                      3,598 cells ( 47.1%)
  • Her2                        3,035 cells ( 39.7%)
  • Basal                         495 cells (  6.5%)
  • LumB                          305 cells (  4.0%)
  • LumA                          211 cells (  2.8%)

6. METADATA COLUMNS AVAILABLE
  ✓ barcode
  ✓ sample_name
  ✓ sample_type
  ✓ geo_id
  ✓ cell_type
  ✓ epithelial_score
  ✓ immune_score
  ✓ molecular_subtype

7. DATA QUALITY
  UMIs/cell (mean ± std): 6133.2 ± 7256.3
  Genes/cell (mean ±

In [5]:
import scanpy as sc
import numpy as np
import pandas as pd
from pathlib import Path


# Load one dataset to inspect
normal_path = Path('/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_triplenegative_epithelial_improved.h5ad')
adata = sc.read_h5ad(normal_path)
adata.obs_names_make_unique()


print('\n' + '='*100)
print('DATA STRUCTURE INSPECTOR - Understanding Your AnnData Object')
print('='*100 + '\n')


# ============================================================================
# 1. OVERALL STRUCTURE
# ============================================================================

print('1. OVERALL STRUCTURE')
print('-'*100 + '\n')

print(f'AnnData object shape: {adata.X.shape}')
print(f'  Rows (axis 0):    {adata.n_obs:,} cells')
print(f'  Columns (axis 1): {adata.n_vars:,} genes')
print('\nVisualization:')
"""print('''
        Genes (COLUMNS) →  [Gene1, Gene2, Gene3, ..., Gene33514]
    ↓
    C
    e   ┌────────────────────────────────────┐
    l   │  UMI counts (expression matrix)    │
    l   │                                    │
    s   │  83,522 × 33,514 values            │
    (   │  (mostly zeros = sparse!)          │
    R   │                                    │
    O   └────────────────────────────────────┘
    W
    S
    )
    
    Each cell has counts for each gene
    Most cells express most genes at ZERO (sparse)
''')"""


# ============================================================================
# 2. ROWS = CELLS
# ============================================================================

print('\n2. ROWS = CELLS (observations)')
print('-'*100 + '\n')

print('Row names (adata.obs_names) = Cell barcodes/IDs')
print(f'  Total rows: {adata.n_obs:,}')
print('  First 10 cell IDs:\n')

for i, cell_id in enumerate(adata.obs_names[:10], 1):
    print(f'    {i:2d}. {cell_id}')

print(f'\n  ... ({adata.n_obs - 20:,} more cells) ...\n')

for i, cell_id in enumerate(adata.obs_names[-5:], adata.n_obs - 4):
    print(f'    {i:2d}. {cell_id}')


# ============================================================================
# 3. COLUMNS = GENES
# ============================================================================

print('\n\n3. COLUMNS = GENES (variables)')
print('-'*100 + '\n')

print('Column names (adata.var_names) = Gene symbols')
print(f'  Total columns: {adata.n_vars:,}')
print('  First 20 genes:\n')

for i, gene in enumerate(adata.var_names[:20], 1):
    print(f'    {i:2d}. {gene}')

print(f'\n  ... ({adata.n_vars - 40:,} more genes) ...\n')

for i, gene in enumerate(adata.var_names[-20:], adata.n_vars - 19):
    print(f'    {i:2d}. {gene}')


# ============================================================================
# 4. CELL METADATA (obs)
# ============================================================================

print('\n\n4. CELL METADATA (adata.obs = ROWS metadata)')
print('-'*100 + '\n')

print('Cell-level information stored in ROWS:')
print(f'  Columns in adata.obs: {adata.obs.shape[1]}')
print(f'  Shape: {adata.obs.shape[0]:,} cells × {adata.obs.shape[1]} attributes\n')

print('Available cell attributes (metadata columns):\n')
for col in adata.obs.columns:
    n_unique = adata.obs[col].nunique()
    dtype = adata.obs[col].dtype
    print(f'  • {col:25} dtype={str(dtype):15} unique values: {n_unique:>5}')

print('\nExample: First 5 cells with their metadata:')
print(adata.obs.iloc[:5])


# ============================================================================
# 5. GENE METADATA (var)
# ============================================================================

print('\n\n5. GENE METADATA (adata.var = COLUMNS metadata)')
print('-'*100 + '\n')

print('Gene-level information stored in COLUMNS:')
print(f'  Rows in adata.var: {adata.var.shape[0]:,}')
print(f'  Columns: {adata.var.shape[1]}')
print(f'  Shape: {adata.var.shape[0]:,} genes × {adata.var.shape[1]} attributes\n')

if adata.var.shape[1] > 0:
    print('Available gene attributes:\n')
    for col in adata.var.columns:
        print(f'  • {col}')
    
    print('\nExample: First 10 genes with their metadata:')
    print(adata.var.head(10))
else:
    print('(No additional gene metadata)')


# ============================================================================
# 6. EXPRESSION MATRIX EXAMPLE
# ============================================================================

print('\n\n6. EXPRESSION MATRIX - VISUAL EXAMPLE')
print('-'*100 + '\n')

# Get a small subset to show
subset_cells = adata.obs_names[:5]
subset_genes = adata.var_names[:8]
adata_subset = adata[subset_cells, subset_genes].copy()

if hasattr(adata_subset.X, 'toarray'):
    expr_df = pd.DataFrame(adata_subset.X.toarray(), 
                           index=adata_subset.obs_names, 
                           columns=adata_subset.var_names)
else:
    expr_df = pd.DataFrame(adata_subset.X, 
                           index=adata_subset.obs_names, 
                           columns=adata_subset.var_names)

print('Small example: First 5 cells × First 8 genes\n')
print(expr_df.to_string())

print('\n\nInterpretation:')
print('  ROWS = cells (your 5 example cells)')
print('  COLUMNS = genes (your 8 example genes)')
print('  VALUES = UMI counts (gene expression in that cell)')
print('  Many zeros = sparse matrix (most cells don\'t express most genes)')


# ============================================================================
# 7. WHAT YOU'RE ANALYZING
# ============================================================================

print('\n\n7. YOUR ANALYSIS CONTEXT')
print('-'*100 + '\n')

"""print('When you do EDA, you\'re analyzing:')
print('''
✓ CELLS as "samples"
  - Each cell is one biological observation
  - Total ~83k cells in Normal epithelial
  
✓ GENES as "features"
  - Each gene is one measurement/feature
  - Total ~33k genes
  
✓ Expression values
  - How much each gene is expressed in each cell
  - Stored in adata.X (the expression matrix)
  
✓ Cell metadata (adata.obs)
  - What sample each cell comes from
  - Cell type, quality metrics, etc.
  
✓ Gene metadata (adata.var)
  - Gene symbols, annotations, etc.
''')"""


print('\n\n8. HOW THIS RELATES TO SmallSeqFlow')
print('-'*100 + '\n')

print('''
SmallSeqFlow expects: genes × samples (rows = genes, columns = samples)
Your data has:       cells × genes (rows = cells, columns = genes)

Mapping:
  Their "samples" = Your "cells" (treating each cell as a biological replicate)
  Their "genes" = Your "genes" (same concept)

So when you run EDA:
  - Panel 1 histogram: Distribution of total UMIs ACROSS CELLS
  - Panel 2 boxplot: Distribution of expression ACROSS GENES
  - Panel 3 heatmap: Correlations BETWEEN GENES (computed from cells)
  - Panel 4 PCA: Gene space (which genes are similar)
  - Dendrogram: Cell clustering (which cells are similar)
''')

print('='*100 + '\n')


DATA STRUCTURE INSPECTOR - Understanding Your AnnData Object

1. OVERALL STRUCTURE
----------------------------------------------------------------------------------------------------

AnnData object shape: (7561, 33514)
  Rows (axis 0):    7,561 cells
  Columns (axis 1): 33,514 genes

Visualization:

2. ROWS = CELLS (observations)
----------------------------------------------------------------------------------------------------

Row names (adata.obs_names) = Cell barcodes/IDs
  Total rows: 7,561
  First 10 cell IDs:

     1. 0
     2. 2
     3. 4
     4. 9
     5. 15
     6. 20
     7. 23
     8. 26
     9. 30
    10. 34

  ... (7,541 more cells) ...

    7557. 1976
    7558. 1989-1
    7559. 1990-1
    7560. 2003-1
    7561. 2010-1


3. COLUMNS = GENES (variables)
----------------------------------------------------------------------------------------------------

Column names (adata.var_names) = Gene symbols
  Total columns: 33,514
  First 20 genes:

     1. MIR1302-2HG
     2. F

  utils.warn_names_duplicates("obs")


In [3]:
"""
Convert h5ad to h5 format
=========================

Saves to: /triumvirate/home/alexarol/breast_cancer_analysis/results/
"""

import scanpy as sc
import h5py
from pathlib import Path

# ============================================================================
# Convert h5ad to h5
# ============================================================================

print("Converting h5ad to h5...\n")

input_file = Path('/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_normal_epithelial_improved.h5ad')
output_dir = Path('/triumvirate/home/alexarol/breast_cancer_analysis/results')
output_file = output_dir / 'adata_normal_epithelial_improved.h5'

print(f"Input:  {input_file.name}")
print(f"Output: {output_file.name}")
print(f"Save location: {output_dir}\n")

# Load the h5ad file
print("Loading h5ad file...")
adata = sc.read_h5ad(input_file)
adata.obs_names_make_unique()

print(f"Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes\n")

# Create HDF5 file
print("Creating HDF5 file...")
with h5py.File(output_file, 'w') as f:
    
    # Store expression matrix
    if hasattr(adata.X, 'toarray'):
        expr_data = adata.X.toarray()
    else:
        expr_data = adata.X
    
    print(f"  Writing expression matrix ({expr_data.shape[0]:,} × {expr_data.shape[1]:,})...")
    f.create_dataset('X', data=expr_data, compression='gzip', compression_opts=4)
    
    # Store cell names (observations)
    print(f"  Writing cell names ({len(adata.obs_names):,} cells)...")
    f.create_dataset('cell_names', data=adata.obs_names.values.astype('S'), compression='gzip')
    
    # Store gene names (variables)
    print(f"  Writing gene names ({len(adata.var_names):,} genes)...")
    f.create_dataset('gene_names', data=adata.var_names.values.astype('S'), compression='gzip')
    
    # Store cell metadata
    print(f"  Writing cell metadata ({adata.obs.shape[1]} attributes)...")
    obs_group = f.create_group('obs')
    for col in adata.obs.columns:
        obs_data = adata.obs[col].values
        try:
            obs_group.create_dataset(col, data=obs_data, compression='gzip')
        except:
            # For categorical/string data
            obs_group.create_dataset(col, data=obs_data.astype('S'), compression='gzip')
    
    # Store gene metadata (if any)
    if adata.var.shape[1] > 0:
        print(f"  Writing gene metadata ({adata.var.shape[1]} attributes)...")
        var_group = f.create_group('var')
        for col in adata.var.columns:
            var_data = adata.var[col].values
            try:
                var_group.create_dataset(col, data=var_data, compression='gzip')
            except:
                var_group.create_dataset(col, data=var_data.astype('S'), compression='gzip')
    
    # Store metadata
    f.attrs['description'] = 'Normal epithelial cells'
    f.attrs['n_cells'] = adata.n_obs
    f.attrs['n_genes'] = adata.n_vars

print(f"\n✓ Successfully created: {output_file}")
print(f"✓ File size: {output_file.stat().st_size / (1024**3):.2f} GB")

print("\n" + "="*80)
print("HOW TO READ THE H5 FILE BACK")
print("="*80 + "\n")

print("""
# Read it back:
import h5py
import numpy as np

with h5py.File('adata_normal_epithelial_improved.h5', 'r') as f:
    # Get expression matrix
    X = f['X'][:]
    
    # Get cell and gene names
    cell_names = [cell.decode() for cell in f['cell_names'][:]]
    gene_names = [gene.decode() for gene in f['gene_names'][:]]
    
    # Get cell metadata
    obs = {}
    for key in f['obs'].keys():
        obs[key] = f['obs'][key][:]
    
    print(f"Expression matrix shape: {X.shape}")
    print(f"Cell names: {cell_names[:5]}")
    print(f"Gene names: {gene_names[:5]}")
    print(f"Cell metadata columns: {list(obs.keys())}")
""")

print("\n" + "="*80)
print("CONVERSION COMPLETE")
print("="*80 + "\n")

Converting h5ad to h5...

Input:  adata_normal_epithelial_improved.h5ad
Output: adata_normal_epithelial_improved.h5
Save location: /triumvirate/home/alexarol/breast_cancer_analysis/results

Loading h5ad file...


  utils.warn_names_duplicates("obs")


Loaded: 83,522 cells × 33,514 genes

Creating HDF5 file...
  Writing expression matrix (83,522 × 33,514)...
  Writing cell names (83,522 cells)...
  Writing gene names (33,514 genes)...
  Writing cell metadata (8 attributes)...

✓ Successfully created: /triumvirate/home/alexarol/breast_cancer_analysis/results/adata_normal_epithelial_improved.h5
✓ File size: 0.44 GB

HOW TO READ THE H5 FILE BACK


# Read it back:
import h5py
import numpy as np

with h5py.File('adata_normal_epithelial_improved.h5', 'r') as f:
    # Get expression matrix
    X = f['X'][:]
    
    # Get cell and gene names
    cell_names = [cell.decode() for cell in f['cell_names'][:]]
    gene_names = [gene.decode() for gene in f['gene_names'][:]]
    
    # Get cell metadata
    obs = {}
    for key in f['obs'].keys():
        obs[key] = f['obs'][key][:]
    
    print(f"Expression matrix shape: {X.shape}")
    print(f"Cell names: {cell_names[:5]}")
    print(f"Gene names: {gene_names[:5]}")
    print(f"Cell metadata co

In [5]:
import scanpy as sc
import numpy as np
import h5py
from scipy import sparse
from pathlib import Path

print("="*80)
print("BUILDING 10x v3 HDF5 (filtered_feature_bc_matrix.h5 compatible)")
print("="*80)

# ------------------------------------------------------------------
# 1. Load your h5ad
# ------------------------------------------------------------------
base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
tn_path = base / "results" / "adata_triplenegative_epithelial_improved.h5ad"
out_path = base / "results" / "tn_10x_filtered_feature_bc_matrix.h5"

print(f"\n[1/4] Loading h5ad: {tn_path}")
adata = sc.read_h5ad(tn_path)
adata.obs_names_make_unique()
adata.var_names_make_unique()

print(f"✓ Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")

# ------------------------------------------------------------------
# 2. Build CSR with rows = genes, cols = cells  (10x convention)
# ------------------------------------------------------------------
print("\n[2/4] Building CSR matrix (rows=genes, cols=cells)...")

X = adata.X
if not sparse.isspmatrix_csc(X) and not sparse.isspmatrix_csr(X):
    # adata.X is often CSR (cells × genes); we want genes × cells
    X = sparse.csr_matrix(X)

# current shape is cells × genes; transpose to genes × cells
X_gc = X.T.tocsc()  # genes × cells in CSC (10x uses column-major; better match)

# For 10x H5, columns = barcodes, so CSC with shape (n_genes, n_cells)
n_genes, n_cells = X_gc.shape
print(f"✓ Matrix shape (genes × cells): {n_genes} × {n_cells}")
print(f"  nnz: {X_gc.nnz:,}")

# ------------------------------------------------------------------
# 3. Prepare feature (gene) and barcode (cell) metadata
# ------------------------------------------------------------------
print("\n[3/4] Preparing feature and barcode metadata...")

gene_ids   = adata.var_names.astype(str).tolist()
gene_names = adata.var_names.astype(str).tolist()  # you could instead use symbols if separate
feature_type_value = "Gene Expression"
genome_value = "GRCh38"  # or "hg38" or whatever reference you used; Scanpy only needs a string

barcodes = adata.obs_names.astype(str).tolist()

# Fixed-length ASCII arrays
max_gid  = max(len(g) for g in gene_ids)
max_gnm  = max(len(g) for g in gene_names)
max_ft   = len(feature_type_value)
max_geno = len(genome_value)
max_bc   = max(len(b) for b in barcodes)

gene_ids_arr   = np.array([g.encode("ascii").ljust(max_gid)   for g in gene_ids],   dtype=f"S{max_gid}")
gene_names_arr = np.array([g.encode("ascii").ljust(max_gnm)   for g in gene_names], dtype=f"S{max_gnm}")
feature_type_arr = np.array([feature_type_value.encode("ascii").ljust(max_ft)] * n_genes, dtype=f"S{max_ft}")
genome_arr       = np.array([genome_value.encode("ascii").ljust(max_geno)] * n_genes,   dtype=f"S{max_geno}")
barcodes_arr   = np.array([b.encode("ascii").ljust(max_bc)   for b in barcodes],    dtype=f"S{max_bc}")

print(f"✓ Features: {n_genes}, Barcodes: {n_cells}")

# ------------------------------------------------------------------
# 4. Write 10x‑style HDF5 under /matrix
# ------------------------------------------------------------------
print("\n[4/4] Writing 10x HDF5 file...")
with h5py.File(out_path, "w") as f:
    mat_grp = f.create_group("matrix")

    # 10x uses CSC (column-major) with columns = barcodes
    # data / indices / indptr correspond to CSC of shape (n_genes, n_cells)
    mat_grp.create_dataset("data",    data=X_gc.data.astype(np.float32),  compression="gzip")
    mat_grp.create_dataset("indices", data=X_gc.indices.astype(np.int32), compression="gzip")
    mat_grp.create_dataset("indptr",  data=X_gc.indptr.astype(np.int32),  compression="gzip")
    mat_grp.create_dataset("shape",   data=np.array([n_genes, n_cells], dtype=np.int64))

    # barcodes
    mat_grp.create_dataset("barcodes", data=barcodes_arr)

    # features subgroup with required datasets
    feat_grp = mat_grp.create_group("features")
    feat_grp.create_dataset("id",           data=gene_ids_arr)
    feat_grp.create_dataset("name",         data=gene_names_arr)
    feat_grp.create_dataset("feature_type", data=feature_type_arr)
    feat_grp.create_dataset("genome",       data=genome_arr)
    # 10x also has _all_tag_keys listing extra fields
    all_keys = np.array([b"genome"], dtype="S6")  # only "genome" as extra tag
    feat_grp.create_dataset("_all_tag_keys", data=all_keys)

print(f"✓ Written: {out_path}")

# ------------------------------------------------------------------
# Quick sanity check with scanpy.read_10x_h5
# (Run this on the SAME machine where you generated the file)
# ------------------------------------------------------------------
print("\n[TEST] Reading back with scanpy.read_10x_h5...")
adata_test = sc.read_10x_h5(out_path)
print(f"✓ read_10x_h5 OK: {adata_test.n_obs} cells × {adata_test.n_vars} genes")

print("\nDONE. You can now copy this H5 to your Mac and load it in RECODE.")
print(f"Use it as: {out_path.name}")

BUILDING 10x v3 HDF5 (filtered_feature_bc_matrix.h5 compatible)

[1/4] Loading h5ad: /triumvirate/home/alexarol/breast_cancer_analysis/results/adata_triplenegative_epithelial_improved.h5ad
✓ Loaded: 7,561 cells × 33,514 genes

[2/4] Building CSR matrix (rows=genes, cols=cells)...
✓ Matrix shape (genes × cells): 33514 × 7561
  nnz: 16,435,959

[3/4] Preparing feature and barcode metadata...
✓ Features: 33514, Barcodes: 7561

[4/4] Writing 10x HDF5 file...


  utils.warn_names_duplicates("obs")


✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/tn_10x_filtered_feature_bc_matrix.h5

[TEST] Reading back with scanpy.read_10x_h5...
✓ read_10x_h5 OK: 7561 cells × 33514 genes

DONE. You can now copy this H5 to your Mac and load it in RECODE.
Use it as: tn_10x_filtered_feature_bc_matrix.h5


loop for conversion

In [6]:
import scanpy as sc
import numpy as np
import h5py
from scipy import sparse
from pathlib import Path

print("="*80)
print("BUILDING 10x v3 HDF5 FILES FOR ALL EPITHELIAL DATASETS")
print("="*80)

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
results = base / "results"

# Input h5ad → output 10x‑style h5
datasets = [
    ("adata_normal_epithelial_improved.h5ad",                "normal_10x_filtered_feature_bc_matrix.h5"),
    ("adata_er_positive_epithelial_improved.h5ad",           "erpos_10x_filtered_feature_bc_matrix.h5"),
    ("adata_her2_positive_epithelial_improved.h5ad",         "her2_10x_filtered_feature_bc_matrix.h5"),
    ("adata_triplenegative_epithelial_improved.h5ad",        "tn_10x_filtered_feature_bc_matrix.h5"),
    ("adata_triplenegative_brca1_epithelial_improved.h5ad",  "tn_brca1_10x_filtered_feature_bc_matrix.h5"),
    ("adata_brca1_preneoplastic_epithelial_improved.h5ad",   "preneo_10x_filtered_feature_bc_matrix.h5"),
]

def convert_to_10x(in_path: Path, out_path: Path):
    print("\n" + "-"*80)
    print(f"[START] {in_path.name}")
    print("-"*80)

    # 1) Load
    adata = sc.read_h5ad(in_path)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    print(f"  ✓ Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")

    # 2) Matrix: genes × cells in CSC (10x convention)
    X = adata.X
    if not sparse.isspmatrix_csc(X) and not sparse.isspmatrix_csr(X):
        X = sparse.csr_matrix(X)
    X_gc = X.T.tocsc()
    n_genes, n_cells = X_gc.shape
    print(f"  ✓ Matrix (genes × cells): {n_genes} × {n_cells}   nnz={X_gc.nnz:,}")

    # 3) Metadata
    gene_ids   = adata.var_names.astype(str).tolist()
    gene_names = adata.var_names.astype(str).tolist()
    feature_type_value = "Gene Expression"
    genome_value = "GRCh38"
    barcodes = adata.obs_names.astype(str).tolist()

    max_gid  = max(len(g) for g in gene_ids)
    max_gnm  = max(len(g) for g in gene_names)
    max_ft   = len(feature_type_value)
    max_geno = len(genome_value)
    max_bc   = max(len(b) for b in barcodes)

    gene_ids_arr   = np.array([g.encode("ascii").ljust(max_gid)   for g in gene_ids],   dtype=f"S{max_gid}")
    gene_names_arr = np.array([g.encode("ascii").ljust(max_gnm)   for g in gene_names], dtype=f"S{max_gnm}")
    feature_type_arr = np.array([feature_type_value.encode("ascii").ljust(max_ft)] * n_genes, dtype=f"S{max_ft}")
    genome_arr       = np.array([genome_value.encode("ascii").ljust(max_geno)] * n_genes,   dtype=f"S{max_geno}")
    barcodes_arr   = np.array([b.encode("ascii").ljust(max_bc)   for b in barcodes],    dtype=f"S{max_bc}")

    print(f"  ✓ Features: {n_genes}, Barcodes: {n_cells}")

    # 4) Write H5
    with h5py.File(out_path, "w") as f:
        mat_grp = f.create_group("matrix")
        mat_grp.create_dataset("data",    data=X_gc.data.astype(np.float32),  compression="gzip")
        mat_grp.create_dataset("indices", data=X_gc.indices.astype(np.int32), compression="gzip")
        mat_grp.create_dataset("indptr",  data=X_gc.indptr.astype(np.int32),  compression="gzip")
        mat_grp.create_dataset("shape",   data=np.array([n_genes, n_cells], dtype=np.int64))

        mat_grp.create_dataset("barcodes", data=barcodes_arr)

        feat_grp = mat_grp.create_group("features")
        feat_grp.create_dataset("id",           data=gene_ids_arr)
        feat_grp.create_dataset("name",         data=gene_names_arr)
        feat_grp.create_dataset("feature_type", data=feature_type_arr)
        feat_grp.create_dataset("genome",       data=genome_arr)
        all_keys = np.array([b"genome"], dtype="S6")
        feat_grp.create_dataset("_all_tag_keys", data=all_keys)

    print(f"  ✓ Written: {out_path}")

    # 5) Quick check
    adata_test = sc.read_10x_h5(out_path)
    print(f"  ✓ read_10x_h5 OK: {adata_test.n_obs} cells × {adata_test.n_vars} genes")
    print(f"[DONE] {in_path.name}")

# Run conversion for all datasets
for in_name, out_name in datasets:
    in_path = results / in_name
    out_path = results / out_name
    if not in_path.exists():
        print(f"\n[SKIP] {in_name} (file not found)")
        continue
    convert_to_10x(in_path, out_path)

print("\nAll available datasets converted.")

BUILDING 10x v3 HDF5 FILES FOR ALL EPITHELIAL DATASETS

--------------------------------------------------------------------------------
[START] adata_normal_epithelial_improved.h5ad
--------------------------------------------------------------------------------


  utils.warn_names_duplicates("obs")


  ✓ Loaded: 83,522 cells × 33,514 genes
  ✓ Matrix (genes × cells): 33514 × 83522   nnz=168,940,670
  ✓ Features: 33514, Barcodes: 83522
  ✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/normal_10x_filtered_feature_bc_matrix.h5
  ✓ read_10x_h5 OK: 83522 cells × 33514 genes
[DONE] adata_normal_epithelial_improved.h5ad

--------------------------------------------------------------------------------
[START] adata_er_positive_epithelial_improved.h5ad
--------------------------------------------------------------------------------


  utils.warn_names_duplicates("obs")


  ✓ Loaded: 91,908 cells × 33,514 genes
  ✓ Matrix (genes × cells): 33514 × 91908   nnz=127,156,967
  ✓ Features: 33514, Barcodes: 91908
  ✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/erpos_10x_filtered_feature_bc_matrix.h5
  ✓ read_10x_h5 OK: 91908 cells × 33514 genes
[DONE] adata_er_positive_epithelial_improved.h5ad

--------------------------------------------------------------------------------
[START] adata_her2_positive_epithelial_improved.h5ad
--------------------------------------------------------------------------------


  utils.warn_names_duplicates("obs")


  ✓ Loaded: 19,693 cells × 33,514 genes
  ✓ Matrix (genes × cells): 33514 × 19693   nnz=44,937,332
  ✓ Features: 33514, Barcodes: 19693
  ✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/her2_10x_filtered_feature_bc_matrix.h5
  ✓ read_10x_h5 OK: 19693 cells × 33514 genes
[DONE] adata_her2_positive_epithelial_improved.h5ad

--------------------------------------------------------------------------------
[START] adata_triplenegative_epithelial_improved.h5ad
--------------------------------------------------------------------------------
  ✓ Loaded: 7,561 cells × 33,514 genes
  ✓ Matrix (genes × cells): 33514 × 7561   nnz=16,435,959
  ✓ Features: 33514, Barcodes: 7561


  utils.warn_names_duplicates("obs")


  ✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/tn_10x_filtered_feature_bc_matrix.h5
  ✓ read_10x_h5 OK: 7561 cells × 33514 genes
[DONE] adata_triplenegative_epithelial_improved.h5ad

--------------------------------------------------------------------------------
[START] adata_triplenegative_brca1_epithelial_improved.h5ad
--------------------------------------------------------------------------------


  utils.warn_names_duplicates("obs")


  ✓ Loaded: 14,186 cells × 33,514 genes
  ✓ Matrix (genes × cells): 33514 × 14186   nnz=34,351,393
  ✓ Features: 33514, Barcodes: 14186
  ✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/tn_brca1_10x_filtered_feature_bc_matrix.h5
  ✓ read_10x_h5 OK: 14186 cells × 33514 genes
[DONE] adata_triplenegative_brca1_epithelial_improved.h5ad

--------------------------------------------------------------------------------
[START] adata_brca1_preneoplastic_epithelial_improved.h5ad
--------------------------------------------------------------------------------


  utils.warn_names_duplicates("obs")


  ✓ Loaded: 7,644 cells × 33,514 genes
  ✓ Matrix (genes × cells): 33514 × 7644   nnz=11,610,363
  ✓ Features: 33514, Barcodes: 7644
  ✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/preneo_10x_filtered_feature_bc_matrix.h5
  ✓ read_10x_h5 OK: 7644 cells × 33514 genes
[DONE] adata_brca1_preneoplastic_epithelial_improved.h5ad

All available datasets converted.


In [8]:
import scanpy as sc
import numpy as np
import h5py
from scipy import sparse
from pathlib import Path

print("="*80)
print("BUILDING 10x v3 HDF5 (filtered_feature_bc_matrix.h5 compatible)")
print("="*80)

# ------------------------------------------------------------------
# 1. Load your h5ad
# ------------------------------------------------------------------
base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
tn_path = base / "results" / "/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_triplenegative_brca1_epithelial_improved.h5ad"
out_path = base / "results" / "/triumvirate/home/alexarol/breast_cancer_analysis/results/10x_filtered_feature_bc_matrix/tn_brca1_10x_filtered_feature_bc_matrix_2.h5"

print(f"\n[1/4] Loading h5ad: {tn_path}")
adata = sc.read_h5ad(tn_path)
adata.obs_names_make_unique()
adata.var_names_make_unique()

print(f"✓ Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")

# ------------------------------------------------------------------
# 2. Build CSR with rows = genes, cols = cells  (10x convention)
# ------------------------------------------------------------------
print("\n[2/4] Building CSR matrix (rows=genes, cols=cells)...")

X = adata.X
if not sparse.isspmatrix_csc(X) and not sparse.isspmatrix_csr(X):
    # adata.X is often CSR (cells × genes); we want genes × cells
    X = sparse.csr_matrix(X)

# current shape is cells × genes; transpose to genes × cells
X_gc = X.T.tocsc()  # genes × cells in CSC (10x uses column-major; better match)

# For 10x H5, columns = barcodes, so CSC with shape (n_genes, n_cells)
n_genes, n_cells = X_gc.shape
print(f"✓ Matrix shape (genes × cells): {n_genes} × {n_cells}")
print(f"  nnz: {X_gc.nnz:,}")

# ------------------------------------------------------------------
# 3. Prepare feature (gene) and barcode (cell) metadata
# ------------------------------------------------------------------
print("\n[3/4] Preparing feature and barcode metadata...")

gene_ids   = adata.var_names.astype(str).tolist()
gene_names = adata.var_names.astype(str).tolist()  # you could instead use symbols if separate
feature_type_value = "Gene Expression"
genome_value = "GRCh38"  # or "hg38" or whatever reference you used; Scanpy only needs a string

barcodes = adata.obs_names.astype(str).tolist()

# Fixed-length ASCII arrays
max_gid  = max(len(g) for g in gene_ids)
max_gnm  = max(len(g) for g in gene_names)
max_ft   = len(feature_type_value)
max_geno = len(genome_value)
max_bc   = max(len(b) for b in barcodes)

gene_ids_arr   = np.array([g.encode("ascii").ljust(max_gid)   for g in gene_ids],   dtype=f"S{max_gid}")
gene_names_arr = np.array([g.encode("ascii").ljust(max_gnm)   for g in gene_names], dtype=f"S{max_gnm}")
feature_type_arr = np.array([feature_type_value.encode("ascii").ljust(max_ft)] * n_genes, dtype=f"S{max_ft}")
genome_arr       = np.array([genome_value.encode("ascii").ljust(max_geno)] * n_genes,   dtype=f"S{max_geno}")
barcodes_arr   = np.array([b.encode("ascii").ljust(max_bc)   for b in barcodes],    dtype=f"S{max_bc}")

print(f"✓ Features: {n_genes}, Barcodes: {n_cells}")

# ------------------------------------------------------------------
# 4. Write 10x‑style HDF5 under /matrix
# ------------------------------------------------------------------
print("\n[4/4] Writing 10x HDF5 file...")
with h5py.File(out_path, "w") as f:
    mat_grp = f.create_group("matrix")

    # 10x uses CSC (column-major) with columns = barcodes
    # data / indices / indptr correspond to CSC of shape (n_genes, n_cells)
    mat_grp.create_dataset("data",    data=X_gc.data.astype(np.float32),  compression="gzip")
    mat_grp.create_dataset("indices", data=X_gc.indices.astype(np.int32), compression="gzip")
    mat_grp.create_dataset("indptr",  data=X_gc.indptr.astype(np.int32),  compression="gzip")
    mat_grp.create_dataset("shape",   data=np.array([n_genes, n_cells], dtype=np.int64))

    # barcodes
    mat_grp.create_dataset("barcodes", data=barcodes_arr)

    # features subgroup with required datasets
    feat_grp = mat_grp.create_group("features")
    feat_grp.create_dataset("id",           data=gene_ids_arr)
    feat_grp.create_dataset("name",         data=gene_names_arr)
    feat_grp.create_dataset("feature_type", data=feature_type_arr)
    feat_grp.create_dataset("genome",       data=genome_arr)
    # 10x also has _all_tag_keys listing extra fields
    all_keys = np.array([b"genome"], dtype="S6")  # only "genome" as extra tag
    feat_grp.create_dataset("_all_tag_keys", data=all_keys)

print(f"✓ Written: {out_path}")

# ------------------------------------------------------------------
# Quick sanity check with scanpy.read_10x_h5
# (Run this on the SAME machine where you generated the file)
# ------------------------------------------------------------------
print("\n[TEST] Reading back with scanpy.read_10x_h5...")
adata_test = sc.read_10x_h5(out_path)
print(f"✓ read_10x_h5 OK: {adata_test.n_obs} cells × {adata_test.n_vars} genes")

print("\nDONE. You can now copy this H5 to your Mac and load it in RECODE.")
print(f"Use it as: {out_path.name}")

BUILDING 10x v3 HDF5 (filtered_feature_bc_matrix.h5 compatible)

[1/4] Loading h5ad: /triumvirate/home/alexarol/breast_cancer_analysis/results/adata_triplenegative_brca1_epithelial_improved.h5ad
✓ Loaded: 14,186 cells × 33,514 genes

[2/4] Building CSR matrix (rows=genes, cols=cells)...
✓ Matrix shape (genes × cells): 33514 × 14186
  nnz: 34,351,393

[3/4] Preparing feature and barcode metadata...


  utils.warn_names_duplicates("obs")


✓ Features: 33514, Barcodes: 14186

[4/4] Writing 10x HDF5 file...
✓ Written: /triumvirate/home/alexarol/breast_cancer_analysis/results/10x_filtered_feature_bc_matrix/tn_brca1_10x_filtered_feature_bc_matrix_2.h5

[TEST] Reading back with scanpy.read_10x_h5...
✓ read_10x_h5 OK: 14186 cells × 33514 genes

DONE. You can now copy this H5 to your Mac and load it in RECODE.
Use it as: tn_brca1_10x_filtered_feature_bc_matrix_2.h5
