
# SECTION 1: SETUP — Imports & Configuration

WHY:
- centralize imports and paths at the top
- verify all required files exist before proceeding

DATA:
- external libraries (pandas, numpy, scanpy)
- file system paths to data and markers

OUTPUT:
- all imports loaded, all paths verified

NEXT USE:
- paths and libraries used in all sections


In [8]:
print(f'{"SECTION 1: SETUP":^80}\n')

#imported required libraries
import pandas as pd
import numpy as np
import scanpy as sc
from pathlib import Path
from collections import Counter
import warnings
import io
import contextlib
import gzip
from scipy.io import mmread

#suppressed benign warnings
warnings.filterwarnings('ignore', category=UserWarning)

#defined directories
BASE_DIR = Path('/triumvirate/home/alexarol/breast_cancer_analysis')
DATA_DIR = BASE_DIR / 'data'
RESULTS_DIR = BASE_DIR / 'results'
HUMANBREAST_DIR = BASE_DIR / 'HumanBreast10X-main'

#defined output directories for improved version
ANNOTATED_DIR_IMPROVED = RESULTS_DIR / 'annotated_samples_improved'
ANNOTATED_DIR_IMPROVED.mkdir(exist_ok=True)

#defined file paths
THEBIGBOSS = RESULTS_DIR / 'TheBigBoss_enhanced.csv'
IMMUNE_MARKERS = HUMANBREAST_DIR / 'Signatures' / 'ImmuneMarkers2.txt'
PAM50_MARKERS = HUMANBREAST_DIR / 'Signatures' / 'PAM50.txt'
RAW_DATA_DIR = DATA_DIR / 'GSE161529_RAW'
FEATURES_FILE = DATA_DIR / 'GSE161529_features.tsv'

#verified all files exist
print('verifying file accessibility \n')
files_to_check = {
    'TheBigBoss': THEBIGBOSS,
    'ImmuneMarkers': IMMUNE_MARKERS,
    'PAM50': PAM50_MARKERS,
    'Features': FEATURES_FILE,
}

for name, path in files_to_check.items():
    status = 'OK' if path.exists() else 'NOT OK'
    print(f'{status} {name}')

print(f'\n all files accessible')
print(f' output: {ANNOTATED_DIR_IMPROVED}\n')

                                SECTION 1: SETUP                                

verifying file accessibility 

OK TheBigBoss
OK ImmuneMarkers
OK PAM50
OK Features

 all files accessible
 output: /triumvirate/home/alexarol/breast_cancer_analysis/results/annotated_samples_improved



# SECTION 2: Load Markers — Gene Signatures

WHY:
- convert marker files into usable Python dictionaries
- prepare for cell type scoring in Section 5

DATA:
- ImmuneMarkers2.txt (10 immune cell types)
- PAM50.txt (5 epithelial molecular subtypes)

OUTPUT:
- immune_markers_dict: cell_type → gene list
- pam50_markers_dict: subtype → gene list

NEXT USE:
- used in scoring function (Section 3)

In [9]:
print(f'{"SECTION 2: LOAD MARKERS":^80}\n')

#loaded immune markers
immune_df = pd.read_csv(IMMUNE_MARKERS, sep='\t')
print(f'immune markers: {immune_df.shape[0]} genes')
print(f'  cell types: {", ".join(immune_df["CellType"].unique())}')

immune_markers_dict = {}
for cell_type in immune_df['CellType'].unique():
    genes = immune_df[immune_df['CellType'] == cell_type]['Signatures'].tolist()
    immune_markers_dict[cell_type] = genes

#loaded PAM50 markers
pam50_df = pd.read_csv(PAM50_MARKERS, sep='\t')
print(f'\nPAM50 markers: {pam50_df.shape[0]} genes')
print(f'  subtypes: {", ".join(pam50_df["Subtype"].unique())}')

pam50_markers_dict = {}
for subtype in pam50_df['Subtype'].unique():
    genes = pam50_df[pam50_df['Subtype'] == subtype]['Gene'].tolist()
    pam50_markers_dict[subtype] = genes

print(f'\n markers loaded\n')

                            SECTION 2: LOAD MARKERS                             

immune markers: 98 genes
  cell types: BCell, TCell, TCell2, NK, DC, Macro, Endo, Mega, Fibro, Fibro2

PAM50 markers: 50 genes
  subtypes: Basal, Her2, Normal, LumB, LumA

 markers loaded



# SECTION 3: HELPER FUNCTIONS

WHY:
- extract repeated logic into clean, testable functions
- improve code readability and maintainability
- enable reuse across notebooks

DATA:
- AnnData objects, marker dictionaries

OUTPUT:
- three production-ready functions

NEXT USE:
- called in Section 5 main pipeline

In [10]:
print(f'{"SECTION 3: DEFINE FUNCTIONS":^80}\n')

def load_sample(sample_row, features_file=FEATURES_FILE):
    '''load scRNA-seq data for single sample into AnnData format'''
    matrix_file = RAW_DATA_DIR / sample_row['MatrixFile']
    barcodes_file = RAW_DATA_DIR / sample_row['BarcodesFile']
    
    #loaded matrix (sparse format)
    with gzip.open(matrix_file, 'rb') as f:
        matrix = mmread(f).T.tocsr()
    
    #loaded barcodes
    barcodes = pd.read_csv(barcodes_file, header=None, names=['barcode'], compression='gzip')
    
    #loaded features
    features = pd.read_csv(features_file, sep='\t', header=None, names=['gene_id', 'gene_name', 'feature_type'])
    
    #created AnnData with explicit var_names
    adata = sc.AnnData(X=matrix, obs=barcodes.reset_index(drop=True))
    adata.var_names = features['gene_id'].values  #SET EXPLICITLY
    adata.var['gene_name'] = features['gene_name'].values
    adata.var['feature_type'] = features['feature_type'].values
    
    #added metadata
    adata.obs['sample_name'] = sample_row['SampleName']
    adata.obs['sample_type'] = sample_row['SampleType']
    adata.obs['geo_id'] = sample_row['GEO_ID']
    
    return adata


def fix_gene_names(adata, features_file=FEATURES_FILE):
    '''map Ensembl IDs to gene symbols, remove duplicates and NaN values'''
    #loaded mapping
    features = pd.read_csv(features_file, sep='\t', header=None, names=['gene_id', 'gene_name', 'feature_type'])
    gene_id_to_symbol = pd.Series(features['gene_name'].values, index=features['gene_id']).to_dict()
    
    #mapped IDs to symbols
    new_gene_names = [gene_id_to_symbol.get(gid, gid) for gid in adata.var_names]
    
    #removed duplicates and NaN
    keep_idx = []
    seen_genes = set()
    for idx, gene in enumerate(new_gene_names):
        if pd.isna(gene) or gene == 'nan':
            continue
        if gene in seen_genes:
            continue
        seen_genes.add(gene)
        keep_idx.append(idx)
    
    #subset and rename
    adata = adata[:, keep_idx].copy()
    adata.var_names = [new_gene_names[i] for i in keep_idx]
    
    return adata


def score_cell_types_corrected(adata, immune_markers, pam50_markers):
    '''two-tier cell type annotation: epithelial vs immune (tier 1), molecular subtype (tier 2)'''
    n_cells = adata.n_obs
    
    #tier 1: epithelial markers
    epithelial_markers = {'EPCAM': 1.0, 'KRT19': 1.0, 'KRT7': 1.0, 'KRT5': 1.0, 'CDH1': 1.0}
    
    epithelial_score = np.zeros(n_cells)
    epithelial_genes = [g for g in epithelial_markers.keys() if g in adata.var_names]
    if len(epithelial_genes) > 0:
        epithelial_score = np.asarray(adata[:, epithelial_genes].X.mean(axis=1)).flatten()
    
    #tier 1: immune markers
    immune_scores = np.zeros((n_cells, len(immune_markers)))
    for idx, (cell_type, genes) in enumerate(immune_markers.items()):
        present = [g for g in genes if g in adata.var_names]
        if len(present) > 0:
            scores = np.asarray(adata[:, present].X.mean(axis=1)).flatten()
            immune_scores[:, idx] = scores
    
    immune_types = list(immune_markers.keys())
    best_immune_idx = np.argmax(immune_scores, axis=1)
    best_immune_score = immune_scores.max(axis=1)
    best_immune_type = np.array([immune_types[i] for i in best_immune_idx])
    
    #tier 1: assign primary cell type
    cell_types = []
    for i in range(n_cells):
        if best_immune_score[i] > epithelial_score[i] and best_immune_score[i] > 0.05:
            cell_types.append(best_immune_type[i])
        elif epithelial_score[i] > 0.05:
            cell_types.append('Epithelial')
        else:
            cell_types.append('Unclassified')
    
    adata.obs['cell_type'] = cell_types
    adata.obs['epithelial_score'] = epithelial_score
    adata.obs['immune_score'] = best_immune_score
    
    #tier 2: molecular subtype (epithelial cells only)
    molecular_subtype = ['NA'] * n_cells
    pam50_scores = np.zeros((n_cells, len(pam50_markers)))
    
    epithelial_mask = np.array(adata.obs['cell_type'] == 'Epithelial')
    if epithelial_mask.sum() > 0:
        for idx, (subtype, genes) in enumerate(pam50_markers.items()):
            present = [g for g in genes if g in adata.var_names]
            if len(present) > 0:
                scores = np.asarray(adata[epithelial_mask, present].X.mean(axis=1)).flatten()
                pam50_scores[epithelial_mask, idx] = scores
    
    subtype_names = list(pam50_markers.keys())
    for i in np.where(epithelial_mask)[0]:
        best_idx = np.argmax(pam50_scores[i, :])
        molecular_subtype[i] = subtype_names[best_idx]
    
    adata.obs['molecular_subtype'] = molecular_subtype
    
    return adata

def get_notebook_globals(*var_names):
    '''retrieve variables from Jupyter notebook global scope'''
    g = globals()
    result = {}
    for name in var_names:
        if name not in g:
            raise NameError(f'{name} not defined - run earlier sections')
        result[name] = g[name]
    return result

print('OK functions defined\n')

                          SECTION 3: DEFINE FUNCTIONS                           

OK functions defined



In [12]:
print(f'{"SECTION 4: LOAD METADATA":^80}\n')

# Load metadata
thebigboss = pd.read_csv(THEBIGBOSS)
print(f'metadata loaded: {thebigboss.shape[0]} samples')
print(f'\ncolumns: {list(thebigboss.columns)}')
print(f'\nsample types:')
print(thebigboss['SampleType'].value_counts())

print(f'\nOK metadata ready\n')

                            SECTION 4: LOAD METADATA                            

metadata loaded: 69 samples

columns: ['GEO_ID', 'MatrixFile', 'BarcodesFile', 'SampleName', 'Title', 'CellNumAfter', 'GenesDetected', 'SampleType']

sample types:
SampleType
ER_Positive             27
Normal                  24
HER2_Positive            6
BRCA1_PreNeoplastic      4
TripleNegative_BRCA1     4
TripleNegative           4
Name: count, dtype: int64

OK metadata ready



# SECTION 0.5: Verify Cross-Cell Variables

WHY:
- ensure all required variables are in memory
- provides clear error messages if sections skipped

DATA:
- variables from SECTIONS 1-4

OUTPUT:
- verified that all dependencies loaded

NEXT USE:
- ensures diagnostic and pipeline cells work

In [13]:
print(f'{"SECTION 0.5: VERIFY VARIABLES":^80}\n')

#verified all required variables exist
required_vars = {
    'thebigboss': 'metadata (SECTION 4)',
    'load_sample': 'function (SECTION 3)',
    'fix_gene_names': 'function (SECTION 3)',
    'score_cell_types_corrected': 'function (SECTION 3)',
    'immune_markers_dict': 'markers (SECTION 2)',
    'pam50_markers_dict': 'markers (SECTION 2)',
    'ANNOTATED_DIR_IMPROVED': 'path (SECTION 1)',
}

print(f'checking required variables\n')
all_ok = True
for var_name, description in required_vars.items():
    exists = var_name in dir()
    status = 'OK' if exists else 'NOT OK'
    print(f'{status} {var_name:30} ({description})')
    if not exists:
        all_ok = False

if all_ok:
    print(f'\nOK all variables available\n')
else:
    print(f'\nNOT OK missing variables - run earlier sections first\n')
    raise RuntimeError('not all required variables defined')

                         SECTION 0.5: VERIFY VARIABLES                          

checking required variables

OK thebigboss                     (metadata (SECTION 4))
OK load_sample                    (function (SECTION 3))
OK fix_gene_names                 (function (SECTION 3))
OK score_cell_types_corrected     (function (SECTION 3))
OK immune_markers_dict            (markers (SECTION 2))
OK pam50_markers_dict             (markers (SECTION 2))
OK ANNOTATED_DIR_IMPROVED         (path (SECTION 1))

OK all variables available



In [14]:
# DIAGNOSTIC

print(f'{"DIAGNOSTIC: CHECKING GENE NAMES":^80}\n')

#retrieved cross-cell variables
(thebigboss, load_sample, fix_gene_names, score_cell_types_corrected,
 immune_markers_dict, pam50_markers_dict) = get_notebook_globals(
    'thebigboss', 'load_sample', 'fix_gene_names', 'score_cell_types_corrected',
    'immune_markers_dict', 'pam50_markers_dict'
).values()

#loaded one sample for testing
test_sample = thebigboss.iloc[0]
adata_test = load_sample(test_sample)

print(f'BEFORE fix_gene_names:')
print(f'  var_names type: {type(adata_test.var_names)}')
print(f'  first 5: {list(adata_test.var_names[:5])}')
print(f'  shape: {adata_test.shape}')

#fixed gene names
adata_test = fix_gene_names(adata_test)

print(f'\nAFTER fix_gene_names:')
print(f'  var_names type: {type(adata_test.var_names)}')
print(f'  first 5: {list(adata_test.var_names[:5])}')
print(f'  shape: {adata_test.shape}')

#checked if marker genes exist
print(f'\nmarker genes check:')
test_markers = ['EPCAM', 'KRT19', 'KRT7', 'KRT5', 'CDH1']
for gene in test_markers:
    exists = gene in adata_test.var_names
    print(f'  {gene}: {" OK FOUND" if exists else " NOT OK NOT FOUND"}')

#tested scoring
print(f'\nscoring test:')
with contextlib.redirect_stdout(io.StringIO()):
    adata_test = score_cell_types_corrected(
        adata_test, immune_markers_dict, pam50_markers_dict
    )

print(f'cell type distribution:')
print(adata_test.obs['cell_type'].value_counts())

epithelial_count = (adata_test.obs['cell_type'] == 'Epithelial').sum()
print(f'\nepithelial cells: {epithelial_count}')

if epithelial_count == 0:
    print(f'\n PROBLEM: 0 epithelial cells!')
    print(f'debug info:')
    print(f'  immune_score mean: {adata_test.obs["immune_score"].mean():.4f}')
    print(f'  epithelial_score mean: {adata_test.obs["epithelial_score"].mean():.4f}')
else:
    print(f'\n working correctly: {epithelial_count} epithelial cells')

                        DIAGNOSTIC: CHECKING GENE NAMES                         

BEFORE fix_gene_names:
  var_names type: <class 'pandas.core.indexes.base.Index'>
  first 5: ['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092', 'ENSG00000238009', 'ENSG00000239945']
  shape: (4966, 33538)

AFTER fix_gene_names:
  var_names type: <class 'pandas.core.indexes.base.Index'>
  first 5: ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3']
  shape: (4966, 33514)

marker genes check:
  EPCAM:  OK FOUND
  KRT19:  OK FOUND
  KRT7:  OK FOUND
  KRT5:  OK FOUND
  CDH1:  OK FOUND

scoring test:
cell type distribution:
cell_type
Epithelial      3335
Fibro2          1185
Fibro            139
DC                92
TCell2            66
TCell             60
Endo              38
Macro             17
NK                15
Unclassified      10
BCell              9
Name: count, dtype: int64

epithelial cells: 3335

 working correctly: 3335 epithelial cells


# SECTION 4: LOAD METADATA — Sample Information

WHY:
- load sample information to iterate through all 69 samples

DATA:
- TheBigBoss_enhanced.csv

OUTPUT:
- metadata dataframe

NEXT USE:
- iterate in Section 5 pipeline


In [15]:
print(f'{"SECTION 4: LOAD METADATA":^80}\n')

thebigboss = pd.read_csv(THEBIGBOSS)
print(f'total samples: {len(thebigboss)}')
print(f'sample types:')
for stype, count in thebigboss['SampleType'].value_counts().items():
    print(f'  {stype:25} {count:3d}')
print(f'\n OK metadata loaded\n')

                            SECTION 4: LOAD METADATA                            

total samples: 69
sample types:
  ER_Positive                27
  Normal                     24
  HER2_Positive               6
  BRCA1_PreNeoplastic         4
  TripleNegative_BRCA1        4
  TripleNegative              4

 OK metadata loaded



# SECTION 5: MAIN PIPELINE — Annotation & Merging

## 5.1 TESTING MODE

WHY:
- test annotation logic on small sample before full 69-sample run
- verify code works before time-consuming batch processing

DATA:
- 100 cells from first sample

OUTPUT:
- cell type distribution (should match expected biology)

NEXT USE:
- proceed to 5.2 full pipeline

In [16]:

print(f'{"SECTION 5.1: TESTING MODE":^80}\n')

TEST_MODE = False  #set to True if you want to test first

if TEST_MODE:
    print(f'testing annotation on 100 cells from first sample.\n')
    
    test_sample = thebigboss.iloc[0]
    adata_test = load_sample(test_sample)
    adata_test = adata_test[:100].copy()
    
    print(f'test data: {adata_test.shape}')
    
    #fixed gene names
    adata_test = fix_gene_names(adata_test)
    print(f'after gene fix: {adata_test.shape}')
    
    #annotated
    with contextlib.redirect_stdout(io.StringIO()):
        adata_test = score_cell_types_corrected(
            adata_test, immune_markers_dict, pam50_markers_dict
        )
    
    print(f'\ncell type distribution:')
    print(adata_test.obs['cell_type'].value_counts())
    print(f'\n✓ test passed!\n')

                           SECTION 5.1: TESTING MODE                            



## 5.2 ANNOTATION PIPELINE (All Samples)

WHY:
- main processing: load → fix genes → annotate → merge
- runs automatically through all 69 samples

DATA:
- all 69 samples from metadata

OUTPUT:
- 69 annotated files + 5 merged epithelial datasets

NEXT USE:
- Phase 3 network analysis

In [17]:
#between 2-3 minutes
print(f'{"SECTION 5.2: ANNOTATION PIPELINE":^80}\n')
print(f'processing {len(thebigboss)} samples...\n')

#initialized tracking
results = []
failed_samples = []
sample_groups = {}

#main loop
for idx, (_, sample_row) in enumerate(thebigboss.iterrows(), start=1):
    sample_name = sample_row['SampleName']
    sample_type = sample_row['SampleType']
    
    try:
        print(f'[{idx:2d}/{len(thebigboss)}] {sample_name:20} ({sample_type:20})', end=' ', flush=True)
        
        #loaded sample
        adata = load_sample(sample_row)
        
        #fixed gene names
        adata = fix_gene_names(adata)
        
        #annotated (suppressed diagnostics)
        with contextlib.redirect_stdout(io.StringIO()):
            adata = score_cell_types_corrected(
                adata, immune_markers_dict, pam50_markers_dict
            )
        
        #calculated statistics
        n_epi = (adata.obs['cell_type'] == 'Epithelial').sum()
        n_immune = (adata.obs['cell_type'].isin(['TCell', 'TCell2', 'BCell', 'DC', 'Macro', 'NK'])).sum()
        n_stromal = (adata.obs['cell_type'].isin(['Fibro', 'Fibro2', 'Endo'])).sum()
        
        #saved to improved directory
        out_file = ANNOTATED_DIR_IMPROVED / f'{sample_name}_annotated.h5ad'
        adata.write(out_file)
        
        #logged results
        results.append({
            'sample_name': sample_name,
            'sample_type': sample_type,
            'total_cells': adata.n_obs,
            'epithelial': n_epi,
            'immune': n_immune,
            'stromal': n_stromal,
        })
        
        #subset epithelial for merging
        epithelial_mask = adata.obs['cell_type'] == 'Epithelial'
        adata_epi = adata[epithelial_mask].copy()
        
        if len(adata_epi) > 0:
            if sample_type not in sample_groups:
                sample_groups[sample_type] = []
            sample_groups[sample_type].append(adata_epi)
        
        print(f'OK ({n_epi:5,} epi)')
        
    except Exception as e:
        print(f'NOT OK {str(e)[:30]}')
        failed_samples.append(sample_name)

                        SECTION 5.2: ANNOTATION PIPELINE                        

processing 69 samples...

[ 1/69] N-0092-total         (Normal              ) OK (3,335 epi)
[ 2/69] N-0019-total         (Normal              ) OK (4,518 epi)
[ 3/69] N-0280-epi           (Normal              ) OK (  891 epi)
[ 4/69] N-0093-epi           (Normal              ) OK (8,852 epi)
[ 5/69] N-0093-total         (Normal              ) OK (3,421 epi)
[ 6/69] N-1469-epi           (Normal              ) OK (2,509 epi)
[ 7/69] N-0408-epi           (Normal              ) OK (2,799 epi)
[ 8/69] N-1105-epi           (Normal              ) OK (4,692 epi)
[ 9/69] N-0230.17-total      (Normal              ) OK (  794 epi)
[10/69] N-0064-epi           (Normal              ) OK (4,011 epi)
[11/69] N-0064-total         (Normal              ) OK (  814 epi)
[12/69] N-0230.16-epi        (Normal              ) OK (1,923 epi)
[13/69] N-0233-total         (Normal              ) OK (1,911 epi)
[14/69] N-0169-total 

## 5.3 MERGE BY SAMPLE TYPE

In [18]:
print(f'\n{"SECTION 5.3: MERGE BY SAMPLE TYPE":^80}\n')

#merged epithelial cells by group
merged_datasets = {}

for group_type in sorted(sample_groups.keys()):
    adata_list = sample_groups[group_type]
    n_samples = len(adata_list)
    
    print(f'merging {group_type:25} ({n_samples:2d} samples)...', end=' ', flush=True)
    
    #concatenated
    adata_merged = sc.concat(adata_list, axis=0, join='inner')
    
    #saved to improved directory
    out_file = RESULTS_DIR / f'adata_{group_type.lower()}_epithelial_improved.h5ad'
    adata_merged.write(out_file)
    
    merged_datasets[group_type] = {
        'cells': adata_merged.n_obs,
        'genes': adata_merged.n_vars,
        'samples': n_samples,
        'file': out_file.name,
    }
    
    print(f'OK {adata_merged.n_obs:,} cells')


                       SECTION 5.3: MERGE BY SAMPLE TYPE                        

merging BRCA1_PreNeoplastic       ( 4 samples)... OK 7,644 cells
merging ER_Positive               (27 samples)... OK 91,908 cells
merging HER2_Positive             ( 6 samples)... OK 19,693 cells
merging Normal                    (24 samples)... OK 83,522 cells
merging TripleNegative            ( 4 samples)... OK 7,561 cells
merging TripleNegative_BRCA1      ( 4 samples)... OK 14,186 cells


# SECTION 6: Summary Table

WHY:
- display results in readable format
- verify all groups created successfully

DATA:
- merged_datasets

OUTPUT:
- formatted summary

NEXT USE:
- reference for Phase 3

In [19]:
print(f'{"SECTION 6: SUMMARY TABLE":^80}')

print(f'EPITHELIAL-ONLY DATASETS (IMPROVED VERSION):\n')

for group_type in sorted(merged_datasets.keys()):
    info = merged_datasets[group_type]
    print(f'{group_type:25}')
    print(f'  cells:   {info["cells"]:,}')
    print(f'  genes:   {info["genes"]:,}')
    print(f'  samples: {info["samples"]}')
    print(f'  file:    {info["file"]}\n')

total_cells = sum([info['cells'] for info in merged_datasets.values()])
print(f'TOTAL EPITHELIAL CELLS: {total_cells:,}\n')

#saved summary
results_df = pd.DataFrame(results)
summary_file = RESULTS_DIR / 'annotation_summary_all_samples_improved.csv'
results_df.to_csv(summary_file, index=False)

print(f'✓ summary saved: {summary_file.name}\n')

                            SECTION 6: SUMMARY TABLE                            
EPITHELIAL-ONLY DATASETS (IMPROVED VERSION):

BRCA1_PreNeoplastic      
  cells:   7,644
  genes:   33,514
  samples: 4
  file:    adata_brca1_preneoplastic_epithelial_improved.h5ad

ER_Positive              
  cells:   91,908
  genes:   33,514
  samples: 27
  file:    adata_er_positive_epithelial_improved.h5ad

HER2_Positive            
  cells:   19,693
  genes:   33,514
  samples: 6
  file:    adata_her2_positive_epithelial_improved.h5ad

Normal                   
  cells:   83,522
  genes:   33,514
  samples: 24
  file:    adata_normal_epithelial_improved.h5ad

TripleNegative           
  cells:   7,561
  genes:   33,514
  samples: 4
  file:    adata_triplenegative_epithelial_improved.h5ad

TripleNegative_BRCA1     
  cells:   14,186
  genes:   33,514
  samples: 4
  file:    adata_triplenegative_brca1_epithelial_improved.h5ad

TOTAL EPITHELIAL CELLS: 224,514

✓ summary saved: annotation_summary_all_sam

# SECTION 7: Report
WHY:
- document Phase 2 completion
- create record for thesis Methods

DATA:
- all pipeline outputs

OUTPUT:
- markdown report

In [20]:
print(f'{"SECTION 7: COMPLETION REPORT":^80}\n')

report = f'''

PHASE 2: CELL TYPE ANNOTATION (IMPROVED VERSION)

INPUT:
  - 69 samples from GSE161529 (HumanBreast10X)
  - {results_df['total_cells'].sum():,} total cells

PROCESSING:
  - two-tier annotation (epithelial vs. immune)
  - molecular subtype classification
  - {len(results_df)} samples processed
  - {len(failed_samples)} failures

OUTPUT:
  - 69 annotated h5ad files (improved directory)
  - {len(merged_datasets)} epithelial-only merged datasets
  - Summary statistics (CSV)

EPITHELIAL DATASETS READY FOR PHASE 3:
'''

for group_type in sorted(merged_datasets.keys()):
    info = merged_datasets[group_type]
    report += f'  • {group_type:25} {info["cells"]:,} cells from {info["samples"]} samples\n'

report += f'''
FILES SAVED (with _improved suffix):
  - annotated_samples_improved/
  - adata_*_epithelial_improved.h5ad
  - annotation_summary_all_samples_improved.csv

PHASE 2 COMPLETE - READY FOR PHASE 3

'''

#saved report
report_file = RESULTS_DIR / 'phase2_completion_report_improved.txt'
with open(report_file, 'w') as f:
    f.write(report)

print(report)
print(f'OK report saved: {report_file.name}\n')

                          SECTION 7: COMPLETION REPORT                          



PHASE 2: CELL TYPE ANNOTATION (IMPROVED VERSION)

INPUT:
  - 69 samples from GSE161529 (HumanBreast10X)
  - 428,024 total cells

PROCESSING:
  - two-tier annotation (epithelial vs. immune)
  - molecular subtype classification
  - 69 samples processed
  - 0 failures

OUTPUT:
  - 69 annotated h5ad files (improved directory)
  - 6 epithelial-only merged datasets
  - Summary statistics (CSV)

EPITHELIAL DATASETS READY FOR PHASE 3:
  • BRCA1_PreNeoplastic       7,644 cells from 4 samples
  • ER_Positive               91,908 cells from 27 samples
  • HER2_Positive             19,693 cells from 6 samples
  • Normal                    83,522 cells from 24 samples
  • TripleNegative            7,561 cells from 4 samples
  • TripleNegative_BRCA1      14,186 cells from 4 samples

FILES SAVED (with _improved suffix):
  - annotated_samples_improved/
  - adata_*_epithelial_improved.h5ad
  - annotation_summary_all_sam

Each file has:

X: Gene expression matrix (cells × genes)

obs: Cell metadata (cell_type, sample_name, etc.)

var: Gene metadata (gene_id, gene_name, etc.)

In [21]:
import scanpy as sc

adata = sc.read_h5ad('/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_normal_epithelial_improved.h5ad')

print(f"Cells: {adata.n_obs:,}")
print(f"Genes: {adata.n_vars:,}")
print(f"\nMetadata columns: {list(adata.obs.columns)}")
print(f"\nCell types:\n{adata.obs['cell_type'].value_counts()}")
print(f"\nGene columns: {list(adata.var.columns)}")

Cells: 83,522
Genes: 33,514

Metadata columns: ['barcode', 'sample_name', 'sample_type', 'geo_id', 'cell_type', 'epithelial_score', 'immune_score', 'molecular_subtype']

Cell types:
cell_type
Epithelial    83522
Name: count, dtype: int64

Gene columns: []


In [24]:
import h5py

with h5py.File('/triumvirate/home/alexarol/breast_cancer_analysis/results/adata_normal_epithelial_improved.h5ad', 'r') as f:
    print("File structure:")
    for key in f.keys():
        print(f"  {key}")
        if isinstance(f[key], h5py.Group):
            for subkey in f[key].keys():
                print(f"    └─ {subkey}")

File structure:
  X
    └─ data
    └─ indices
    └─ indptr
  layers
  obs
    └─ _index
    └─ barcode
    └─ cell_type
    └─ epithelial_score
    └─ geo_id
    └─ immune_score
    └─ molecular_subtype
    └─ sample_name
    └─ sample_type
  obsm
  obsp
  uns
  var
    └─ _index
  varm
  varp
