# LOGS

In [1]:
print(f'{"SECTION 1: SETUP":^80}\n')

#imported required libraries
!pip install mygene
!pip install gseapy
!pip install networkx
!pip install screcode
import pandas as pd
import numpy as np
import scanpy as sc
from pathlib import Path
import warnings
import io
import contextlib
from scipy import stats
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from collections import defaultdict
import mygene
import gseapy as gp
import seaborn as sns
from scipy.stats import spearmanr
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from scipy.stats import mannwhitneyu
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.integrate import solve_ivp
import os
import subprocess
import gzip
import requests
import networkx as nx 
from io import StringIO
from sklearn.decomposition import PCA
import time

#suppressed benign warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

#defined directories
BASE_DIR = Path('/triumvirate/home/alexarol/breast_cancer_analysis')
DATA_DIR = BASE_DIR / 'data'
RESULTS_DIR = BASE_DIR / 'results'

#defined phase 3 output directory
PHASE3_DIR = RESULTS_DIR / 'phase3_networx_2026'
PHASE3_DIR.mkdir(exist_ok=True)

#defined input files (from Phase 2)
epithelial_files = {
    'Normal': RESULTS_DIR / 'adata_normal_epithelial_improved.h5ad',
    'ER_Positive': RESULTS_DIR / 'adata_er_positive_epithelial_improved.h5ad',
    'HER2_Positive': RESULTS_DIR / 'adata_her2_positive_epithelial_improved.h5ad',
    'TripleNegative': RESULTS_DIR / 'adata_triplenegative_epithelial_improved.h5ad',
    'TripleNegative_BRCA1': RESULTS_DIR / 'adata_triplenegative_brca1_epithelial_improved.h5ad',
    'Preneoplastic': RESULTS_DIR / 'adata_brca1_preneoplastic_epithelial_improved.h5ad'
}

#verified input files exist
print(f'verifying Phase 2 output files\n')
for group_name, file_path in epithelial_files.items():
    status = 'OK' if file_path.exists() else 'NOT OK'
    print(f'{status} {group_name:25} {file_path.name}')

print(f'\n output directory: {PHASE3_DIR}\n')

"""#defined WGCNA parameters (explained in notebook markdown)
WGCNA_PARAMS = {
    'soft_power_range': range(1, 31),          #test β from 1 to 30
    'min_module_size': 30,                     #minimum genes per module
    'deep_split': 2,                           #moderate aggressiveness
    'correlation_method': 'pearson',           #linear relationship
    'max_block_size': 20000,                   #memory efficiency
    'r_squared_threshold': 0.85,               #scale-free fit criterion
}
print(f'WGCNA parameters configured:')
for param, value in WGCNA_PARAMS.items():
    if param != 'soft_power_range':
        print(f'  {param:25} {value}')
print(f'\nOK setup complete\n')"""

                                SECTION 1: SETUP                                

verifying Phase 2 output files

OK Normal                    adata_normal_epithelial_improved.h5ad
OK ER_Positive               adata_er_positive_epithelial_improved.h5ad
OK HER2_Positive             adata_her2_positive_epithelial_improved.h5ad
OK TripleNegative            adata_triplenegative_epithelial_improved.h5ad
OK TripleNegative_BRCA1      adata_triplenegative_brca1_epithelial_improved.h5ad
OK Preneoplastic             adata_brca1_preneoplastic_epithelial_improved.h5ad

 output directory: /triumvirate/home/alexarol/breast_cancer_analysis/results/phase3_networx_2026



"#defined WGCNA parameters (explained in notebook markdown)\nWGCNA_PARAMS = {\n    'soft_power_range': range(1, 31),          #test β from 1 to 30\n    'min_module_size': 30,                     #minimum genes per module\n    'deep_split': 2,                           #moderate aggressiveness\n    'correlation_method': 'pearson',           #linear relationship\n    'max_block_size': 20000,                   #memory efficiency\n    'r_squared_threshold': 0.85,               #scale-free fit criterion\n}\nprint(f'WGCNA parameters configured:')\nfor param, value in WGCNA_PARAMS.items():\n    if param != 'soft_power_range':\n        print(f'  {param:25} {value}')\nprint(f'\nOK setup complete\n')"

In [2]:
# ==== MAIN RECODE SETUP CELL (no analysis starts here) ========================
import scanpy as sc
import screcode
import numpy as np
import pandas as pd
from pathlib import Path

# Base paths (adapt if needed)
base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
matrix_dir = base / "results" / "10x_filtered_feature_bc_matrix"
recode_out_dir = base / "results" / "recode_outputs"
recode_out_dir.mkdir(exist_ok=True, parents=True)

# ---------------------------------------------------------------------
# Helper: stratified subsampling by a column in .obs
# ---------------------------------------------------------------------
def stratified_subsample_adata(
    adata,
    max_cells: int = 30000,
    by: str | None = None,
    random_state: int = 0,
):
    """
    Return a subsampled AnnData with at most max_cells cells.
    If by is given (e.g. 'molecular_subtype' or 'sample_name'),
    subsampling is done separately per group.

    If adata.n_obs <= max_cells, returns adata unchanged.
    """
    if adata.n_obs <= max_cells or by is None or by not in adata.obs.columns:
        # Plain random subsample (or no subsampling needed)
        if adata.n_obs <= max_cells:
            return adata.copy()
        np.random.seed(random_state)
        keep = np.random.choice(adata.n_obs, max_cells, replace=False)
        keep = np.sort(keep)
        return adata[keep].copy()

    # Stratified: cap each group, then concatenate
    np.random.seed(random_state)
    groups = adata.obs[by].astype(str)
    idxs = []

    # Simple rule: if group has <= max_per_group, keep all;
    # otherwise sample max_per_group from that group.
    n_groups = groups.nunique()
    max_per_group = max_cells // n_groups if n_groups > 0 else max_cells
    max_per_group = max(max_per_group, 2000)  # avoid too few per group

    for g, group_idx in groups.groupby(groups).groups.items():
        g_idx = np.array(list(group_idx))
        if len(g_idx) <= max_per_group:
            idxs.append(g_idx)
        else:
            sel = np.random.choice(g_idx, max_per_group, replace=False)
            idxs.append(sel)

    idxs = np.concatenate(idxs)
    idxs = np.unique(idxs)
    # If still too many, do a final trim to max_cells
    if len(idxs) > max_cells:
        idxs = np.random.choice(idxs, max_cells, replace=False)
        idxs = np.sort(idxs)
    return adata[idxs].copy()

# ---------------------------------------------------------------------
# Helper: run RECODE on an AnnData and extract outputs
# ---------------------------------------------------------------------
def run_recode_on_adata(
    adata,
    label: str,
    out_dir: Path,
    max_cells_for_learning: int = 30000,
    subsample_by: str | None = "molecular_subtype",  # or "sample_name", or None
    random_state: int = 0,
):
    """
    Run RECODE on the given AnnData, with optional subsampling when n_cells > max_cells_for_learning.
    Saves:
      - *_RECODE_gene_stats.csv (per-gene significance + variances)
      - returns (adata_recode, sig_genes_list, gene_stats_df)
    """
    print("=" * 80)
    print(f"[{label}] RECODE setup")
    print(f"  Cells: {adata.n_obs:,}, Genes: {adata.n_vars:,}")

    # Decide whether to subsample for learning
    if adata.n_obs > max_cells_for_learning:
        print(f"  → Subsampling to at most {max_cells_for_learning} cells for RECODE learning.")
        if subsample_by is not None and subsample_by in adata.obs.columns:
            print(f"    Stratified by '{subsample_by}'.")
        adata_learn = stratified_subsample_adata(
            adata,
            max_cells=max_cells_for_learning,
            by=subsample_by,
            random_state=random_state,
        )
        print(f"    Subsampled: {adata_learn.n_obs:,} cells.")
    else:
        print("  → Using all cells for RECODE learning (no subsampling).")
        adata_learn = adata

    # Run RECODE
    recode = screcode.RECODE(
        assay="RNA",
        version=2,
        solver="auto",          # uses randomized + downsampling when n > 20k
        downsampling_rate=0.2,  # default from the paper
        fast_algorithm=True,
        log_normalize=True,
        target_sum=1e5,
        verbose=True,
    )
    print("  Starting RECODE.")
    adata_recode = recode.fit_transform(adata_learn)
    print("  RECODE finished.")

    # Extract per-gene significance + normalized variance (these are gene-level!)
    sig = pd.Series(recode.significance_, index=adata_recode.var_names, name="significance_RECODE")
    norm_var = pd.Series(
        recode.normalized_variance_,
        index=adata_recode.var_names,
        name="normalized_variance_RECODE",
    )

    gene_stats = pd.concat([sig, norm_var], axis=1)
    gene_stats.insert(0, "gene", gene_stats.index)

    # Define significant genes as HVGs
    sig_mask = gene_stats["significance_RECODE"].astype(str) == "significant"
    sig_genes = gene_stats.loc[sig_mask, "gene"].tolist()
    print(f"  Significant genes (HVGs): {sig_mask.sum()}")

    # Save only gene stats here; full matrices can be handled per-dataset in separate cells
    out_dir.mkdir(exist_ok=True, parents=True)
    stats_path = out_dir / f"{label}_RECODE_gene_stats.csv"
    gene_stats.to_csv(stats_path, index=False)
    print(f"  Saved gene stats: {stats_path}")

    return adata_recode, sig_genes, gene_stats, recode

In [3]:
def split_adata_into_chunks(adata, chunk_size: int = 10000, random_state: int = 0):
    """
    Shuffle cells and split AnnData into disjoint chunks of size up to chunk_size.
    Returns a list of AnnData objects [chunk_1, ..., chunk_K].
    """
    np.random.seed(random_state)
    idx = np.arange(adata.n_obs)
    np.random.shuffle(idx)

    chunks = []
    for start in range(0, adata.n_obs, chunk_size):
        end = min(start + chunk_size, adata.n_obs)
        sel = idx[start:end]
        sel = np.sort(sel)
        chunks.append(adata[sel].copy())
    return chunks

# Normal 

In [5]:
# ==== CREATE AND SAVE NORMAL CHUNKS (ONE-TIME) ==========================
normal_path = base / "results" / "adata_normal_epithelial_improved.h5ad"
adata_normal = sc.read_h5ad(normal_path)
adata_normal.obs_names_make_unique()
print("Normal:", adata_normal.n_obs, "cells,", adata_normal.n_vars, "genes")

def save_adata_chunks(adata, label: str, chunk_size: int = 10000, out_dir: Path = recode_out_dir):
    """
    Shuffle cells, split into disjoint chunks, save each as .h5ad.
    """
    np.random.seed(0)
    idx = np.arange(adata.n_obs)
    np.random.shuffle(idx)

    out_dir.mkdir(exist_ok=True, parents=True)
    n_chunks = 0
    for start in range(0, adata.n_obs, chunk_size):
        end = min(start + chunk_size, adata.n_obs)
        sel = np.sort(idx[start:end])
        chunk = adata[sel].copy()
        n_chunks += 1
        chunk_label = f"{label}_chunk{n_chunks}"
        chunk_path = out_dir / f"{chunk_label}.h5ad"
        chunk.write(chunk_path)
        print(f"Saved {chunk_label}: {chunk.n_obs} cells → {chunk_path}")
        del chunk
    print(f"Total chunks for {label}: {n_chunks}")

save_adata_chunks(adata_normal, label="Normal", chunk_size=10000, out_dir=recode_out_dir)

Normal: 83522 cells, 33514 genes
Saved Normal_chunk1: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk1.h5ad
Saved Normal_chunk2: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk2.h5ad
Saved Normal_chunk3: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk3.h5ad
Saved Normal_chunk4: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk4.h5ad
Saved Normal_chunk5: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk5.h5ad
Saved Normal_chunk6: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk6.h5ad
Saved Normal_chunk7: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk7.h5ad
Saved Normal_chunk8: 10000 cells → /triumvirate/home/alexarol/breast_cancer_an

In [6]:
# ==== RECODE ON Normal_chunk1 ==========================================
chunk_path = recode_out_dir / "Normal_chunk1.h5ad"
adata_chunk1 = sc.read_h5ad(chunk_path)
print("Chunk1:", adata_chunk1.n_obs, "cells,", adata_chunk1.n_vars, "genes")

_, normal_chunk1_sig_genes, normal_chunk1_stats = run_recode_on_adata(
    adata=adata_chunk1,
    label="Normal_chunk1",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk1.n_obs,  # no subsampling
    subsample_by=None,
    random_state=0,
)

Chunk1: 10000 cells, 33514 genes
[Normal_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11564, '#non-significant genes': 8604, '#silent genes': 9802, 'ell': 97, 'Elapsed time': '0h 0m 24s 665ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11564
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk1_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk2 ==========================================
chunk_path = recode_out_dir / "Normal_chunk2.h5ad"
adata_chunk2 = sc.read_h5ad(chunk_path)
print("Chunk2:", adata_chunk2.n_obs, "cells,", adata_chunk2.n_vars, "genes")

_, normal_chunk2_sig_genes, normal_chunk2_stats = run_recode_on_adata(
    adata=adata_chunk2,
    label="Normal_chunk2",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk2.n_obs,
    subsample_by=None,
    random_state=0,
)

Chunk2: 10000 cells, 33514 genes
[Normal_chunk2] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11607, '#non-significant genes': 8861, '#silent genes': 9703, 'ell': 95, 'Elapsed time': '0h 0m 24s 629ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11607
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk2_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk3 ==========================================
chunk_path = recode_out_dir / "Normal_chunk3.h5ad"
adata_chunk3 = sc.read_h5ad(chunk_path)
print("Chunk3:", adata_chunk3.n_obs, "cells,", adata_chunk3.n_vars, "genes")

_, normal_chunk3_sig_genes, normal_chunk3_stats = run_recode_on_adata(
    adata=adata_chunk3,
    label="Normal_chunk3",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk3.n_obs,
    subsample_by=None,
    random_state=0,
)

Chunk3: 10000 cells, 33514 genes
[Normal_chunk3] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11565, '#non-significant genes': 8707, '#silent genes': 9581, 'ell': 89, 'Elapsed time': '0h 0m 23s 771ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11565
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk3_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk4 ==========================================
chunk_path = recode_out_dir / "Normal_chunk4.h5ad"
adata_chunk4 = sc.read_h5ad(chunk_path)
print("Chunk4:", adata_chunk4.n_obs, "cells,", adata_chunk4.n_vars, "genes")

_, normal_chunk4_sig_genes, normal_chunk4_stats = run_recode_on_adata(
    adata=adata_chunk4,
    label="Normal_chunk4",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk4.n_obs,
    subsample_by=None,
    random_state=0,
)

Chunk4: 10000 cells, 33514 genes
[Normal_chunk4] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11536, '#non-significant genes': 8716, '#silent genes': 9666, 'ell': 92, 'Elapsed time': '0h 0m 23s 747ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11536
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk4_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk5 ==========================================
chunk_path = recode_out_dir / "Normal_chunk5.h5ad"
adata_chunk5 = sc.read_h5ad(chunk_path)
print("Chunk5:", adata_chunk5.n_obs, "cells,", adata_chunk5.n_vars, "genes")

_, normal_chunk5_sig_genes, normal_chunk5_stats = run_recode_on_adata(
    adata=adata_chunk5,
    label="Normal_chunk5",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk5.n_obs,
    subsample_by=None,
    random_state=0,
)

Chunk5: 10000 cells, 33514 genes
[Normal_chunk5] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11525, '#non-significant genes': 8833, '#silent genes': 9682, 'ell': 94, 'Elapsed time': '0h 0m 23s 798ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11525
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk5_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk6 ==========================================
chunk_path = recode_out_dir / "Normal_chunk6.h5ad"
adata_chunk6 = sc.read_h5ad(chunk_path)
print("Chunk6:", adata_chunk6.n_obs, "cells,", adata_chunk6.n_vars, "genes")

_, normal_chunk6_sig_genes, normal_chunk6_stats = run_recode_on_adata(
    adata=adata_chunk6,
    label="Normal_chunk6",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk6.n_obs,
    subsample_by=None,
    random_state=0,
)

Chunk6: 10000 cells, 33514 genes
[Normal_chunk6] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11529, '#non-significant genes': 8690, '#silent genes': 9725, 'ell': 92, 'Elapsed time': '0h 0m 23s 720ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11529
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk6_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk7 ==========================================
chunk_path = recode_out_dir / "Normal_chunk7.h5ad"
adata_chunk7 = sc.read_h5ad(chunk_path)
print("Chunk7:", adata_chunk7.n_obs, "cells,", adata_chunk7.n_vars, "genes")

_, normal_chunk7_sig_genes, normal_chunk7_stats = run_recode_on_adata(
    adata=adata_chunk7,
    label="Normal_chunk7",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk7.n_obs,
    subsample_by=None,
    random_state=0,
)


Chunk7: 10000 cells, 33514 genes
[Normal_chunk7] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11708, '#non-significant genes': 8649, '#silent genes': 9718, 'ell': 96, 'Elapsed time': '0h 0m 24s 648ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11708
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk7_RECODE_gene_stats.csv


In [4]:
# ==== RECODE ON Normal_chunk8 ==========================================
chunk_path = recode_out_dir / "Normal_chunk8.h5ad"
adata_chunk8 = sc.read_h5ad(chunk_path)
print("Chunk8:", adata_chunk8.n_obs, "cells,", adata_chunk8.n_vars, "genes")

_, normal_chunk8_sig_genes, normal_chunk8_stats = run_recode_on_adata(
    adata=adata_chunk8,
    label="Normal_chunk8",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk8.n_obs,
    subsample_by=None,
    random_state=0,
)


Chunk8: 10000 cells, 33514 genes
[Normal_chunk8] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11605, '#non-significant genes': 8606, '#silent genes': 9671, 'ell': 93, 'Elapsed time': '0h 0m 23s 619ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11605
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk8_RECODE_gene_stats.csv


In [5]:
# ==== RECODE ON Normal_chunk9 ==========================================
chunk_path = recode_out_dir / "Normal_chunk9.h5ad"
adata_chunk9 = sc.read_h5ad(chunk_path)
print("Chunk9:", adata_chunk9.n_obs, "cells,", adata_chunk9.n_vars, "genes")

_, normal_chunk9_sig_genes, normal_chunk9_stats = run_recode_on_adata(
    adata=adata_chunk9,
    label="Normal_chunk9",
    out_dir=recode_out_dir,
    max_cells_for_learning=adata_chunk9.n_obs,
    subsample_by=None,
    random_state=0,
)


Chunk9: 3522 cells, 33514 genes
[Normal_chunk9] RECODE setup
  Cells: 3,522, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 13371, '#non-significant genes': 8174, '#silent genes': 11969, 'ell': 128, 'Elapsed time': '0h 0m 19s 062ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 13371
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_chunk9_RECODE_gene_stats.csv


now i will merge everything: 
- taking the per‑chunk HVG calls and turning them into one final HVG list for the whole Normal dataset
- For each chunk, it loads the *_RECODE_gene_stats.csv file and grabs all genes where significance_RECODE == "significant" → one list per chunk
- Counter counts how many chunks each gene was called significant in
- normal_merged_union: all genes that were significant in at least one chunk (the union of HVGs)
- normal_merged_atleast2: only genes that were significant in 2 or more chunks (more robust HVGs)
- Both lists are saved to .txt so you can later subset the full Normal AnnData to those genes for WGCNA

In [6]:
from collections import Counter

# list all per-chunk stats you have
chunk_labels = ["Normal_chunk1", "Normal_chunk2", "Normal_chunk3", "Normal_chunk4", "Normal_chunk5", "Normal_chunk6", "Normal_chunk7", "Normal_chunk8", "Normal_chunk9"]  # extend as needed

all_sig_lists = []

for lab in chunk_labels:
    stats_path = recode_out_dir / f"{lab}_RECODE_gene_stats.csv"
    df = pd.read_csv(stats_path)
    sig_genes = df.loc[df["significance_RECODE"].astype(str) == "significant", "gene"].tolist()
    all_sig_lists.append(sig_genes)
    print(lab, "significant genes:", len(sig_genes))

gene_counter = Counter()
for sig_list in all_sig_lists:
    gene_counter.update(sig_list)

normal_merged_union = sorted(gene_counter.keys())
normal_merged_atleast2 = sorted([g for g, c in gene_counter.items() if c >= 2])

print("Normal union HVGs:", len(normal_merged_union))
print("Normal HVGs significant in ≥2 chunks:", len(normal_merged_atleast2))

# save merged lists
pd.Series(normal_merged_union, name="gene").to_csv(
    recode_out_dir / "Normal_RECODE_sig_genes_union.txt", index=False
)
pd.Series(normal_merged_atleast2, name="gene").to_csv(
    recode_out_dir / "Normal_RECODE_sig_genes_atleast2.txt", index=False
)

Normal_chunk1 significant genes: 11564
Normal_chunk2 significant genes: 11607
Normal_chunk3 significant genes: 11565
Normal_chunk4 significant genes: 11536
Normal_chunk5 significant genes: 11525
Normal_chunk6 significant genes: 11529
Normal_chunk7 significant genes: 11708
Normal_chunk8 significant genes: 11605
Normal_chunk9 significant genes: 13371
Normal union HVGs: 18198
Normal HVGs significant in ≥2 chunks: 15334


what i see?
- Each chunk: ~11.5k–13.3k significant genes
- → consistent per‑chunk HVG load, no crazy outlier.

- Union across all 9 chunks: 18,198 genes
- → many genes are repeatedly called significant, but each chunk also contributes some unique ones.

- Significant in ≥ 2 chunks: 15,334 genes
- → the vast majority of HVGs are reproducible across multiple chunks, which is exactly what you want for stability.

now i will build the final WGCNA matrix for Normal sample

In [7]:
normal_path = base / "results" / "adata_normal_epithelial_improved.h5ad"
adata_normal = sc.read_h5ad(normal_path)
adata_normal.obs_names_make_unique()

merged_genes = pd.read_csv(
    recode_out_dir / "Normal_RECODE_sig_genes_atleast2.txt"
)["gene"].tolist()

adata_normal_hvg = adata_normal[:, merged_genes].copy()

normal_wgcna_path = recode_out_dir / "Normal_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_normal_hvg.X.toarray() if hasattr(adata_normal_hvg.X, "toarray") else adata_normal_hvg.X,
    index=adata_normal_hvg.obs_names,
    columns=adata_normal_hvg.var_names,
).to_csv(normal_wgcna_path)
print("Saved Normal WGCNA input:", normal_wgcna_path)

Saved Normal WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/Normal_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


seems alright as for now, now i will repeat the same procedure with er_postive sample since it is the second big dataset with 91908 cells

# ER_Positive

In [8]:
# ==== CREATE AND SAVE ER_Positive CHUNKS (ONE-TIME) ====================
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique()
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

def save_adata_chunks(adata, label: str, chunk_size: int = 10000, out_dir: Path = er_chunk_dir):
    """
    Shuffle cells, split into disjoint chunks, save each as .h5ad.
    """
    np.random.seed(0)
    idx = np.arange(adata.n_obs)
    np.random.shuffle(idx)

    out_dir.mkdir(exist_ok=True, parents=True)
    n_chunks = 0
    for start in range(0, adata.n_obs, chunk_size):
        end = min(start + chunk_size, adata.n_obs)
        sel = np.sort(idx[start:end])
        chunk = adata[sel].copy()
        n_chunks += 1
        chunk_label = f"{label}_chunk{n_chunks}"
        chunk_path = out_dir / f"{chunk_label}.h5ad"
        chunk.write(chunk_path)
        print(f"Saved {chunk_label}: {chunk.n_obs} cells → {chunk_path}")
        del chunk
    print(f"Total chunks for {label}: {n_chunks}")

save_adata_chunks(adata_er, label="ER_Positive", chunk_size=10000, out_dir=er_chunk_dir)

ER_Positive: 91908 cells, 33514 genes
Saved ER_Positive_chunk1: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1.h5ad
Saved ER_Positive_chunk2: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk2.h5ad
Saved ER_Positive_chunk3: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk3.h5ad
Saved ER_Positive_chunk4: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk4.h5ad
Saved ER_Positive_chunk5: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk5.h5ad
Saved ER_Positive_chunk6: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk6.h5ad
Saved ER_Positive_chunk7: 10000 cells → /triumvirate/home/alexarol/b

In [9]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique()
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)
# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)
# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk1 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk1.h5ad"
adata_er_chunk1 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk1:", adata_er_chunk1.n_obs, "cells,", adata_er_chunk1.n_vars, "genes")

_, er_chunk1_sig_genes, er_chunk1_stats = run_recode_on_adata(
    adata=adata_er_chunk1,
    label="ER_Positive_chunk1",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk1.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk1: 10000 cells, 33514 genes
[ER_Positive_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11390, '#non-significant genes': 9381, '#silent genes': 9446, 'ell': 75, 'Elapsed time': '0h 0m 28s 578ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11390
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1_RECODE_gene_stats.csv


In [4]:
# ==== CREATE AND SAVE ER_Positive CHUNKS (ONE-TIME) ====================
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique()
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)
# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk2 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk2.h5ad"
adata_er_chunk2 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk2:", adata_er_chunk2.n_obs, "cells,", adata_er_chunk2.n_vars, "genes")

_, er_chunk2_sig_genes, er_chunk2_stats = run_recode_on_adata(
    adata=adata_er_chunk2,
    label="ER_Positive_chunk2",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk2.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk2: 10000 cells, 33514 genes
[ER_Positive_chunk2] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11321, '#non-significant genes': 9494, '#silent genes': 9425, 'ell': 78, 'Elapsed time': '0h 0m 27s 827ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11321
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk2_RECODE_gene_stats.csv


In [4]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk3 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk3.h5ad"
adata_er_chunk3 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk3:", adata_er_chunk3.n_obs, "cells,", adata_er_chunk3.n_vars, "genes")

_, er_chunk3_sig_genes, er_chunk3_stats = run_recode_on_adata(
    adata=adata_er_chunk3,
    label="ER_Positive_chunk3",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk3.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk3: 10000 cells, 33514 genes
[ER_Positive_chunk3] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11387, '#non-significant genes': 9396, '#silent genes': 9442, 'ell': 71, 'Elapsed time': '0h 0m 26s 814ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11387
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk3_RECODE_gene_stats.csv


In [4]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk4 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk4.h5ad"
adata_er_chunk4 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk4:", adata_er_chunk4.n_obs, "cells,", adata_er_chunk4.n_vars, "genes")

_, er_chunk4_sig_genes, er_chunk4_stats = run_recode_on_adata(
    adata=adata_er_chunk4,
    label="ER_Positive_chunk4",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk4.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk4: 10000 cells, 33514 genes
[ER_Positive_chunk4] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11438, '#non-significant genes': 9269, '#silent genes': 9406, 'ell': 72, 'Elapsed time': '0h 0m 27s 421ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11438
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk4_RECODE_gene_stats.csv


In [4]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk5 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk5.h5ad"
adata_er_chunk5 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk5:", adata_er_chunk5.n_obs, "cells,", adata_er_chunk5.n_vars, "genes")

_, er_chunk5_sig_genes, er_chunk5_stats = run_recode_on_adata(
    adata=adata_er_chunk5,
    label="ER_Positive_chunk5",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk5.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk5: 10000 cells, 33514 genes
[ER_Positive_chunk5] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11355, '#non-significant genes': 9465, '#silent genes': 9380, 'ell': 77, 'Elapsed time': '0h 0m 25s 551ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11355
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk5_RECODE_gene_stats.csv


In [4]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk6 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk6.h5ad"
adata_er_chunk6 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk6:", adata_er_chunk6.n_obs, "cells,", adata_er_chunk6.n_vars, "genes")

_, er_chunk6_sig_genes, er_chunk6_stats = run_recode_on_adata(
    adata=adata_er_chunk6,
    label="ER_Positive_chunk6",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk6.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk6: 10000 cells, 33514 genes
[ER_Positive_chunk6] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11485, '#non-significant genes': 9437, '#silent genes': 9385, 'ell': 75, 'Elapsed time': '0h 0m 25s 335ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11485
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk6_RECODE_gene_stats.csv


In [5]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk7 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk7.h5ad"
adata_er_chunk7 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk7:", adata_er_chunk7.n_obs, "cells,", adata_er_chunk7.n_vars, "genes")

_, er_chunk7_sig_genes, er_chunk7_stats = run_recode_on_adata(
    adata=adata_er_chunk7,
    label="ER_Positive_chunk7",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk7.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk7: 10000 cells, 33514 genes
[ER_Positive_chunk7] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11503, '#non-significant genes': 9405, '#silent genes': 9387, 'ell': 76, 'Elapsed time': '0h 0m 52s 192ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11503
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk7_RECODE_gene_stats.csv


In [6]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk8 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk8.h5ad"
adata_er_chunk8 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk8:", adata_er_chunk8.n_obs, "cells,", adata_er_chunk8.n_vars, "genes")

_, er_chunk8_sig_genes, er_chunk8_stats = run_recode_on_adata(
    adata=adata_er_chunk8,
    label="ER_Positive_chunk8",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk8.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk8: 10000 cells, 33514 genes
[ER_Positive_chunk8] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11260, '#non-significant genes': 9536, '#silent genes': 9326, 'ell': 71, 'Elapsed time': '0h 0m 41s 752ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11260
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk8_RECODE_gene_stats.csv


In [7]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk9 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk9.h5ad"
adata_er_chunk9 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk9:", adata_er_chunk9.n_obs, "cells,", adata_er_chunk9.n_vars, "genes")

_, er_chunk9_sig_genes, er_chunk9_stats = run_recode_on_adata(
    adata=adata_er_chunk9,
    label="ER_Positive_chunk9",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk9.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk9: 10000 cells, 33514 genes
[ER_Positive_chunk9] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11233, '#non-significant genes': 9393, '#silent genes': 9444, 'ell': 75, 'Elapsed time': '0h 0m 39s 131ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11233
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk9_RECODE_gene_stats.csv


In [8]:
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique() 
print("ER_Positive:", adata_er.n_obs, "cells,", adata_er.n_vars, "genes")

# define subfolder for ER_Positive chunks
er_chunk_dir = recode_out_dir / "er_positive"
er_chunk_dir.mkdir(exist_ok=True, parents=True)

# folder where ER_Positive chunks and outputs live
er_chunk_dir = recode_out_dir / "er_positive"

# ==== RECODE ON ER_Positive_chunk10 =====================================
chunk_path = er_chunk_dir / "ER_Positive_chunk10.h5ad"
adata_er_chunk10 = sc.read_h5ad(chunk_path)
print("ER_Positive_chunk10:", adata_er_chunk10.n_obs, "cells,", adata_er_chunk10.n_vars, "genes")

_, er_chunk10_sig_genes, er_chunk10_stats = run_recode_on_adata(
    adata=adata_er_chunk10,
    label="ER_Positive_chunk10",
    out_dir=er_chunk_dir,              # save stats into er_positive subfolder
    max_cells_for_learning=adata_er_chunk10.n_obs,
    subsample_by=None,
    random_state=0,
)

ER_Positive: 91908 cells, 33514 genes
ER_Positive_chunk10: 1908 cells, 33514 genes
[ER_Positive_chunk10] RECODE setup
  Cells: 1,908, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11134, '#non-significant genes': 9477, '#silent genes': 12903, 'ell': 73, 'Elapsed time': '0h 0m 8s 796ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 11134
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk10_RECODE_gene_stats.csv


In [9]:
from collections import Counter
import pandas as pd
from pathlib import Path

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"
er_chunk_dir = recode_out_dir / "er_positive"

# list all ER_Positive chunks you processed
chunk_labels = [
    "ER_Positive_chunk1",
    "ER_Positive_chunk2",
    "ER_Positive_chunk3",
    "ER_Positive_chunk4",
    "ER_Positive_chunk5",
    "ER_Positive_chunk6",
    "ER_Positive_chunk7",
    "ER_Positive_chunk8",
    "ER_Positive_chunk9",
    "ER_Positive_chunk10",
]  

all_sig_lists = []

for lab in chunk_labels:
    stats_path = er_chunk_dir / f"{lab}_RECODE_gene_stats.csv"
    df = pd.read_csv(stats_path)
    sig_genes = df.loc[df["significance_RECODE"].astype(str) == "significant", "gene"].tolist()
    all_sig_lists.append(sig_genes)
    print(lab, "significant genes:", len(sig_genes))

gene_counter = Counter()
for sig_list in all_sig_lists:
    gene_counter.update(sig_list)

er_merged_union = sorted(gene_counter.keys())
er_merged_atleast2 = sorted([g for g, c in gene_counter.items() if c >= 2])

print("ER_Positive union HVGs:", len(er_merged_union))
print("ER_Positive HVGs significant in ≥2 chunks:", len(er_merged_atleast2))

# save merged lists into er_positive folder
pd.Series(er_merged_union, name="gene").to_csv(
    er_chunk_dir / "ER_Positive_RECODE_sig_genes_union.txt", index=False
)
pd.Series(er_merged_atleast2, name="gene").to_csv(
    er_chunk_dir / "ER_Positive_RECODE_sig_genes_atleast2.txt", index=False
)

ER_Positive_chunk1 significant genes: 11390
ER_Positive_chunk2 significant genes: 11321
ER_Positive_chunk3 significant genes: 11387
ER_Positive_chunk4 significant genes: 11438
ER_Positive_chunk5 significant genes: 11355
ER_Positive_chunk6 significant genes: 11485
ER_Positive_chunk7 significant genes: 11503
ER_Positive_chunk8 significant genes: 11260
ER_Positive_chunk9 significant genes: 11233
ER_Positive_chunk10 significant genes: 11134
ER_Positive union HVGs: 20368
ER_Positive HVGs significant in ≥2 chunks: 17344


In [10]:
from pathlib import Path
import scanpy as sc
import pandas as pd

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"
er_chunk_dir = recode_out_dir / "er_positive"

# load full ER_Positive AnnData
er_path = base / "results" / "adata_er_positive_epithelial_improved.h5ad"
adata_er = sc.read_h5ad(er_path)
adata_er.obs_names_make_unique()

# load merged HVG list (≥2 chunks) from er_positive subfolder
merged_genes_er = pd.read_csv(
    er_chunk_dir / "ER_Positive_RECODE_sig_genes_atleast2.txt"
)["gene"].tolist()

# subset full ER_Positive to these HVGs
adata_er_hvg = adata_er[:, merged_genes_er].copy()

# export WGCNA input: cells × genes
er_wgcna_path = er_chunk_dir / "ER_Positive_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_er_hvg.X.toarray() if hasattr(adata_er_hvg.X, "toarray") else adata_er_hvg.X,
    index=adata_er_hvg.obs_names,
    columns=adata_er_hvg.var_names,
).to_csv(er_wgcna_path)

print("Saved ER_Positive WGCNA input:", er_wgcna_path)

Saved ER_Positive WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


# HER2_Positive

In [4]:
base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

sample_label = "HER2_Positive"
sample_path = base / "results" / "adata_her2_positive_epithelial_improved.h5ad"

adata_her2 = sc.read_h5ad(sample_path)
adata_her2.obs_names_make_unique()
print(sample_label, ":", adata_her2.n_obs, "cells,", adata_her2.n_vars, "genes")

her2_chunk_dir = recode_out_dir / "her2_positive"
her2_chunk_dir.mkdir(exist_ok=True, parents=True)

def save_adata_chunks(adata, label: str, chunk_size: int, out_dir: Path):
    np.random.seed(0)
    idx = np.arange(adata.n_obs)
    np.random.shuffle(idx)

    n_chunks = 0
    for start in range(0, adata.n_obs, chunk_size):
        end = min(start + chunk_size, adata.n_obs)
        sel = np.sort(idx[start:end])
        chunk = adata[sel].copy()
        n_chunks += 1
        chunk_label = f"{label}_chunk{n_chunks}"
        chunk_path = out_dir / f"{chunk_label}.h5ad"
        chunk.write(chunk_path)
        print(f"Saved {chunk_label}: {chunk.n_obs} cells → {chunk_path}")
        del chunk
    print(f"Total chunks for {label}: {n_chunks}")

# choose chunk_size so you get ~2 chunks
save_adata_chunks(adata_her2, label=sample_label, chunk_size=10000, out_dir=her2_chunk_dir)

HER2_Positive : 19693 cells, 33514 genes
Saved HER2_Positive_chunk1: 10000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1.h5ad
Saved HER2_Positive_chunk2: 9693 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk2.h5ad
Total chunks for HER2_Positive: 2


In [5]:
chunk_path = her2_chunk_dir / "HER2_Positive_chunk1.h5ad"
adata_her2_c1 = sc.read_h5ad(chunk_path)
print("HER2_Positive_chunk1:", adata_her2_c1.n_obs, "cells,", adata_her2_c1.n_vars, "genes")

_, her2_c1_sig_genes, her2_c1_stats = run_recode_on_adata(
    adata=adata_her2_c1,
    label="HER2_Positive_chunk1",
    out_dir=her2_chunk_dir,
    max_cells_for_learning=adata_her2_c1.n_obs,
    subsample_by=None,
    random_state=0,
)

HER2_Positive_chunk1: 10000 cells, 33514 genes
[HER2_Positive_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 12519, '#non-significant genes': 9261, '#silent genes': 8431, 'ell': 85, 'Elapsed time': '0h 0m 25s 113ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 12519
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1_RECODE_gene_stats.csv


In [5]:
base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

sample_label = "HER2_Positive"
sample_path = base / "results" / "adata_her2_positive_epithelial_improved.h5ad"

adata_her2 = sc.read_h5ad(sample_path)
adata_her2.obs_names_make_unique()
print(sample_label, ":", adata_her2.n_obs, "cells,", adata_her2.n_vars, "genes")

her2_chunk_dir = recode_out_dir / "her2_positive"
her2_chunk_dir.mkdir(exist_ok=True, parents=True)

chunk_path = her2_chunk_dir / "HER2_Positive_chunk2.h5ad"
adata_her2_c2 = sc.read_h5ad(chunk_path)
print("HER2_Positive_chunk2:", adata_her2_c2.n_obs, "cells,", adata_her2_c2.n_vars, "genes")

_, her2_c2_sig_genes, her2_c2_stats = run_recode_on_adata(
    adata=adata_her2_c2,
    label="HER2_Positive_chunk2",
    out_dir=her2_chunk_dir,
    max_cells_for_learning=adata_her2_c2.n_obs,
    subsample_by=None,
    random_state=0,
)

HER2_Positive : 19693 cells, 33514 genes
HER2_Positive_chunk2: 9693 cells, 33514 genes
[HER2_Positive_chunk2] RECODE setup
  Cells: 9,693, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 17974, '#non-significant genes': 6973, '#silent genes': 8567, 'ell': 210, 'Elapsed time': '0h 1m 33s 688ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 17974
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk2_RECODE_gene_stats.csv


In [7]:
from collections import Counter
import pandas as pd

chunk_labels = ["HER2_Positive_chunk1", "HER2_Positive_chunk2"]

all_sig_lists = []
for lab in chunk_labels:
    stats_path = her2_chunk_dir / f"{lab}_RECODE_gene_stats.csv"
    df = pd.read_csv(stats_path)
    sig_genes = df.loc[df["significance_RECODE"].astype(str) == "significant", "gene"].tolist()
    all_sig_lists.append(sig_genes)
    print(lab, "significant genes:", len(sig_genes))

gene_counter = Counter()
for sig_list in all_sig_lists:
    gene_counter.update(sig_list)

her2_union = sorted(gene_counter.keys())
her2_atleast2 = sorted([g for g, c in gene_counter.items() if c >= 2])

print("HER2_Positive union HVGs:", len(her2_union))
print("HER2_Positive HVGs significant in ≥2 chunks:", len(her2_atleast2))

pd.Series(her2_union, name="gene").to_csv(
    her2_chunk_dir / "HER2_Positive_RECODE_sig_genes_union.txt", index=False
)
pd.Series(her2_atleast2, name="gene").to_csv(
    her2_chunk_dir / "HER2_Positive_RECODE_sig_genes_atleast2.txt", index=False
)

HER2_Positive_chunk1 significant genes: 12519
HER2_Positive_chunk2 significant genes: 17974
HER2_Positive union HVGs: 18892
HER2_Positive HVGs significant in ≥2 chunks: 11601


In [4]:
from pathlib import Path
import scanpy as sc
import pandas as pd

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

her2_chunk_dir = recode_out_dir / "her2_positive"

her2_path = base / "results" / "adata_her2_positive_epithelial_improved.h5ad"
adata_her2 = sc.read_h5ad(her2_path)
adata_her2.obs_names_make_unique()

# 2) Load merged HVG list for HER2 (≥2 chunks)
merged_genes_her2 = pd.read_csv(
    her2_chunk_dir / "HER2_Positive_RECODE_sig_genes_atleast2.txt"
)["gene"].tolist()

# 3) Subset full HER2 to these HVGs
adata_her2_hvg = adata_her2[:, merged_genes_her2].copy()

# 4) Export WGCNA input: cells × genes
her2_wgcna_path = her2_chunk_dir / "HER2_Positive_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"

pd.DataFrame(
    adata_her2_hvg.X.toarray() if hasattr(adata_her2_hvg.X, "toarray") else adata_her2_hvg.X,
    index=adata_her2_hvg.obs_names,
    columns=adata_her2_hvg.var_names,
).to_csv(her2_wgcna_path)

print("Saved HER2_Positive WGCNA input:", her2_wgcna_path)

Saved HER2_Positive WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


# Triple_Negative

In [5]:
from pathlib import Path
import scanpy as sc
import pandas as pd

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

tn_out_dir = recode_out_dir / "tn"
tn_out_dir.mkdir(exist_ok=True, parents=True)

sample_label = "TN"
sample_path = base / "results" / "adata_triplenegative_epithelial_improved.h5ad"

adata_tn = sc.read_h5ad(sample_path)
adata_tn.obs_names_make_unique()
print(sample_label, ":", adata_tn.n_obs, "cells,", adata_tn.n_vars, "genes")

# Run RECODE on all cells (no chunking)
_, tn_sig_genes, tn_gene_stats = run_recode_on_adata(
    adata=adata_tn,
    label=sample_label,
    out_dir=tn_out_dir,
    max_cells_for_learning=adata_tn.n_obs,
    subsample_by=None,
    random_state=0,
)

TN : 7561 cells, 33514 genes
[TN] RECODE setup
  Cells: 7,561, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 16380, '#non-significant genes': 7234, '#silent genes': 9900, 'ell': 133, 'Elapsed time': '0h 0m 59s 796ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 16380
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_gene_stats.csv


ValueError: too many values to unpack (expected 3)

In [8]:
result = run_recode_on_adata(
    adata=adata_tn,
    label=sample_label,
    out_dir=tn_out_dir,
    max_cells_for_learning=adata_tn.n_obs,
    subsample_by=None,
    random_state=0,
)
print(type(result))
print(result)

[TN] RECODE setup
  Cells: 7,561, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 16380, '#non-significant genes': 7234, '#silent genes': 9900, 'ell': 133, 'Elapsed time': '0h 1m 2s 506ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 16380
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_gene_stats.csv
<class 'tuple'>
(AnnData object with n_obs × n_vars = 7561 × 33514
    obs: 'barcode', 'sample_name', 'sample_type', 'geo_id', 'cell_type', 'epithelial_score', 'immune_score', 'molecular_subtype'
    var: 'RECODE_noise_variance', 'RECODE_NVSN_variance', 'RECODE_significance', 'RECODE_denoised_variance', 'RECODE_means'
    uns: 'RECODE_essen

In [6]:
adata_tn_recode, tn_sig_genes = run_recode_on_adata(
    adata=adata_tn,
    label=sample_label,
    out_dir=tn_out_dir,
    max_cells_for_learning=adata_tn.n_obs,
    subsample_by=None,
    random_state=0,
)

[TN] RECODE setup
  Cells: 7,561, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 16380, '#non-significant genes': 7234, '#silent genes': 9900, 'ell': 133, 'Elapsed time': '0h 0m 58s 993ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 16380
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_gene_stats.csv


ValueError: too many values to unpack (expected 2)

In [6]:
from collections import Counter

# tn_gene_stats is the per-gene stats DataFrame returned by run_recode_on_adata
# Make sure 'significance_RECODE' and 'gene' columns are present
print(tn_gene_stats.columns)

# All significant genes (no chunking, so one list)
tn_sig_list = tn_gene_stats.loc[
    tn_gene_stats["significance_RECODE"].astype(str) == "significant",
    "gene"
].tolist()

print("TN significant genes:", len(tn_sig_list))

# Mimic the 'atleast2' logic:
# since there is no chunking, count==1 for all significant genes,
# but this still produces the correctly named files and keeps your workflow parallel.

gene_counter = Counter(tn_sig_list)

tn_union = sorted(gene_counter.keys())
tn_atleast2 = sorted([g for g, c in gene_counter.items() if c >= 2])

print("TN union HVGs:", len(tn_union))
print("TN HVGs significant in ≥2 chunks (will usually be 0 without chunking):", len(tn_atleast2))

# Save to the same directory and naming scheme as HER2
pd.Series(tn_union, name="gene").to_csv(
    tn_out_dir / "TN_RECODE_sig_genes_union.txt", index=False
)
pd.Series(tn_atleast2, name="gene").to_csv(
    tn_out_dir / "TN_RECODE_sig_genes_atleast2.txt", index=False
)
print("Saved TN_RECODE_sig_genes_union.txt and TN_RECODE_sig_genes_atleast2.txt")

NameError: name 'tn_gene_stats' is not defined

In [11]:
adata_tn_hvg = adata_tn[:, tn_sig_genes].copy()

tn_wgcna_path = tn_out_dir / "TN_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_tn_hvg.X.toarray() if hasattr(adata_tn_hvg.X, "toarray") else adata_tn_hvg.X,
    index=adata_tn_hvg.obs_names,
    columns=adata_tn_hvg.var_names,
).to_csv(tn_wgcna_path)

print("Saved TN WGCNA input:", tn_wgcna_path)

Saved TN WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


# Triple_Negative_BRCA1

In [14]:
from pathlib import Path
import numpy as np
import scanpy as sc

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

sample_label = "TN_BRCA1"
sample_path = base / "results" / "adata_triplenegative_brca1_epithelial_improved.h5ad"

adata_tn_brca1 = sc.read_h5ad(sample_path)
adata_tn_brca1.obs_names_make_unique()
print(sample_label, ":", adata_tn_brca1.n_obs, "cells,", adata_tn_brca1.n_vars, "genes")

tn_brca1_dir = recode_out_dir / "tn_brca1"
tn_brca1_dir.mkdir(exist_ok=True, parents=True)

def save_adata_chunks(adata, label: str, chunk_size: int, out_dir: Path):
    np.random.seed(0)
    idx = np.arange(adata.n_obs)
    np.random.shuffle(idx)

    n_chunks = 0
    for start in range(0, adata.n_obs, chunk_size):
        end = min(start + chunk_size, adata.n_obs)
        sel = np.sort(idx[start:end])
        chunk = adata[sel].copy()
        n_chunks += 1
        chunk_label = f"{label}_chunk{n_chunks}"
        chunk_path = out_dir / f"{chunk_label}.h5ad"
        chunk.write(chunk_path)
        print(f"Saved {chunk_label}: {chunk.n_obs} cells → {chunk_path}")
        del chunk
    print(f"Total chunks for {label}: {n_chunks}")

# ~2 chunks of about 7k cells
save_adata_chunks(adata_tn_brca1, label=sample_label, chunk_size=7000, out_dir=tn_brca1_dir)

TN_BRCA1 : 14186 cells, 33514 genes
Saved TN_BRCA1_chunk1: 7000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1.h5ad
Saved TN_BRCA1_chunk2: 7000 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk2.h5ad
Saved TN_BRCA1_chunk3: 186 cells → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk3.h5ad
Total chunks for TN_BRCA1: 3


In [15]:
chunk_path = tn_brca1_dir / "TN_BRCA1_chunk1.h5ad"
adata_tn_brca1_c1 = sc.read_h5ad(chunk_path)
print("TN_BRCA1_chunk1:", adata_tn_brca1_c1.n_obs, "cells,", adata_tn_brca1_c1.n_vars, "genes")

_, tn_brca1_c1_sig, tn_brca1_c1_stats = run_recode_on_adata(
    adata=adata_tn_brca1_c1,
    label="TN_BRCA1_chunk1",
    out_dir=tn_brca1_dir,
    max_cells_for_learning=adata_tn_brca1_c1.n_obs,
    subsample_by=None,
    random_state=0,
)

TN_BRCA1_chunk1: 7000 cells, 33514 genes
[TN_BRCA1_chunk1] RECODE setup
  Cells: 7,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15877, '#non-significant genes': 8406, '#silent genes': 9231, 'ell': 113, 'Elapsed time': '0h 1m 2s 002ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15877
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1_RECODE_gene_stats.csv


In [16]:
chunk_path = tn_brca1_dir / "TN_BRCA1_chunk2.h5ad"
adata_tn_brca1_c2 = sc.read_h5ad(chunk_path)
print("TN_BRCA1_chunk2:", adata_tn_brca1_c2.n_obs, "cells,", adata_tn_brca1_c2.n_vars, "genes")

_, tn_brca1_c2_sig, tn_brca1_c2_stats = run_recode_on_adata(
    adata=adata_tn_brca1_c2,
    label="TN_BRCA1_chunk2",
    out_dir=tn_brca1_dir,
    max_cells_for_learning=adata_tn_brca1_c2.n_obs,
    subsample_by=None,
    random_state=0,
)

TN_BRCA1_chunk2: 7000 cells, 33514 genes
[TN_BRCA1_chunk2] RECODE setup
  Cells: 7,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15884, '#non-significant genes': 8369, '#silent genes': 9261, 'ell': 119, 'Elapsed time': '0h 0m 57s 576ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15884
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk2_RECODE_gene_stats.csv


In [17]:
chunk_path = tn_brca1_dir / "TN_BRCA1_chunk3.h5ad"
adata_tn_brca1_c3 = sc.read_h5ad(chunk_path)
print("TN_BRCA1_chunk3:", adata_tn_brca1_c3.n_obs, "cells,", adata_tn_brca1_c3.n_vars, "genes")

_, tn_brca1_c3_sig, tn_brca1_c3_stats = run_recode_on_adata(
    adata=adata_tn_brca1_c3,
    label="TN_BRCA1_chunk3",
    out_dir=tn_brca1_dir,
    max_cells_for_learning=adata_tn_brca1_c3.n_obs,
    subsample_by=None,
    random_state=0,
)

TN_BRCA1_chunk3: 186 cells, 33514 genes
[TN_BRCA1_chunk3] RECODE setup
  Cells: 186, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 7134, '#non-significant genes': 9429, '#silent genes': 16951, 'ell': 12, 'Elapsed time': '0h 0m 0s 565ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 7134
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk3_RECODE_gene_stats.csv


In [18]:
from collections import Counter
import pandas as pd

chunk_labels = ["TN_BRCA1_chunk1", "TN_BRCA1_chunk2", "TN_BRCA1_chunk3"]

all_sig_lists = []
for lab in chunk_labels:
    stats_path = tn_brca1_dir / f"{lab}_RECODE_gene_stats.csv"
    df = pd.read_csv(stats_path)
    sig_genes = df.loc[df["significance_RECODE"].astype(str) == "significant", "gene"].tolist()
    all_sig_lists.append(sig_genes)
    print(lab, "significant genes:", len(sig_genes))

gene_counter = Counter()
for sig_list in all_sig_lists:
    gene_counter.update(sig_list)

tn_brca1_union = sorted(gene_counter.keys())
tn_brca1_atleast2 = sorted([g for g, c in gene_counter.items() if c >= 2])

print("TN_BRCA1 union HVGs:", len(tn_brca1_union))
print("TN_BRCA1 HVGs significant in ≥2 chunks:", len(tn_brca1_atleast2))

pd.Series(tn_brca1_union, name="gene").to_csv(
    tn_brca1_dir / "TN_BRCA1_RECODE_sig_genes_union.txt", index=False
)
pd.Series(tn_brca1_atleast2, name="gene").to_csv(
    tn_brca1_dir / "TN_BRCA1_RECODE_sig_genes_atleast2.txt", index=False
)

TN_BRCA1_chunk1 significant genes: 15877
TN_BRCA1_chunk2 significant genes: 15884
TN_BRCA1_chunk3 significant genes: 7134
TN_BRCA1 union HVGs: 18596
TN_BRCA1 HVGs significant in ≥2 chunks: 13766


In [19]:
adata_tn_brca1_hvg = adata_tn_brca1[:, tn_brca1_atleast2].copy()

tn_brca1_wgcna_path = tn_brca1_dir / "TN_BRCA1_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_tn_brca1_hvg.X.toarray() if hasattr(adata_tn_brca1_hvg.X, "toarray") else adata_tn_brca1_hvg.X,
    index=adata_tn_brca1_hvg.obs_names,
    columns=adata_tn_brca1_hvg.var_names,
).to_csv(tn_brca1_wgcna_path)

print("Saved TN_BRCA1 WGCNA input:", tn_brca1_wgcna_path)

Saved TN_BRCA1 WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


# BRCA1_PreNeoplastic

In [20]:
from pathlib import Path
import scanpy as sc

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

preneo_out_dir = recode_out_dir / "preneoplastic"
preneo_out_dir.mkdir(exist_ok=True, parents=True)

sample_label = "BRCA1_PreNeoplastic"
sample_path = base / "results" / "adata_brca1_preneoplastic_epithelial_improved.h5ad"

adata_preneo = sc.read_h5ad(sample_path)
adata_preneo.obs_names_make_unique()
print(sample_label, ":", adata_preneo.n_obs, "cells,", adata_preneo.n_vars, "genes")

# Run RECODE on all cells (no chunking)
_, preneo_sig_genes, preneo_gene_stats = run_recode_on_adata(
    adata=adata_preneo,
    label=sample_label,
    out_dir=preneo_out_dir,
    max_cells_for_learning=adata_preneo.n_obs,
    subsample_by=None,
    random_state=0,
)

BRCA1_PreNeoplastic : 7644 cells, 33514 genes
[BRCA1_PreNeoplastic] RECODE setup
  Cells: 7,644, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15864, '#non-significant genes': 6287, '#silent genes': 11363, 'ell': 173, 'Elapsed time': '0h 1m 0s 036ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15864
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_gene_stats.csv


In [21]:
import pandas as pd

# adata_preneo and preneo_sig_genes are already in memory from the previous cell

adata_preneo_hvg = adata_preneo[:, preneo_sig_genes].copy()

preneo_wgcna_path = preneo_out_dir / f"{sample_label}_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_preneo_hvg.X.toarray() if hasattr(adata_preneo_hvg.X, "toarray") else adata_preneo_hvg.X,
    index=adata_preneo_hvg.obs_names,
    columns=adata_preneo_hvg.var_names,
).to_csv(preneo_wgcna_path)

print("Saved", sample_label, "WGCNA input:", preneo_wgcna_path)

Saved BRCA1_PreNeoplastic WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


now that all the samples are ready i would like to create plots as in the RECODE software to analyse the data even more

similarly to recode analysis i will use the source code and modify it for my data

I cannot call it directly now because it expects the internal RECODE object (self.X_temp, _noise_variance_stabilizing_normalization, etc.), but I can copy the logic into a standalone function that uses my AnnData and the RECODE object I already have (from when you ran RECODE on each chunk).

For this specific RECODE “Applicability” plot, using one representative chunk per sample  probably the most practical choice (kernel wont crash)

- The plot is about how well the RECODE model fits that data matrix, based on the distribution of normalized variances and mean expression.
​- I already ran RECODE on chunks because the full datasets don’t fit in memory; reproducing the exact full‑sample plot would require re‑running RECODE on the full matrices, which you cannot do without crashing.
- A well‑chosen chunk (e.g. first chunk, randomly drawn) will have very similar variance–mean structure to the full sample, so the applicability class (A/B/C) and overall pattern will be the same in practice.
- For documentation / QC: plotting one chunk per sample (e.g. Normal_chunk1, ER_Positive_chunk1, HER2_Positive_chunk1, TN, TN_BRCA1_chunk1, BRCA1_PreNeoplastic) is enough and reflects the sample well.
- For memory safety: plotting per chunk avoids ever having to construct the full RECODE internal matrices for 80–90k cells again.

# Plots

## Applicability

In [4]:
# Helper that uses an existing RECODE object and chunk AnnData
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import matplotlib
from pathlib import Path

def plot_recode_applicability_from_recode(
    recode,
    adata_chunk,
    sample_label: str,
    out_png: Path,
    figsize=(10, 5),
    ps=2,
):
    """
    Reproduce RECODE's check_applicability() for a given fitted RECODE object
    and the AnnData used to fit it (one chunk).
    """
    # This follows the RECODE source code you pasted.
    X_temp = recode.X_temp  # matrix used internally during fit
    X_scaled = X_temp / np.sum(X_temp, axis=1)[:, np.newaxis]
    X_norm = recode._noise_variance_stabilizing_normalization(X_temp)
    norm_var = np.var(X_norm, axis=0, ddof=1)
    x = np.mean(X_scaled, axis=0)
    y = norm_var
    idx_nonsig, idx_sig = y <= 1, y > 1

    fig = plt.figure(figsize=figsize)
    plt.rcParams["xtick.direction"] = "in"
    plt.rcParams["ytick.direction"] = "in"
    spec = matplotlib.gridspec.GridSpec(
        ncols=2, nrows=1, width_ratios=[4, 1], wspace=0.0
    )
    ax0 = fig.add_subplot(spec[0])
    ax0.grid(False)

    # RNA case (not Multiome)
    ax0.scatter(
        x[idx_sig],
        y[idx_sig],
        color="b",
        s=ps,
        label=f"significant {recode.unit}",
        zorder=2,
    )
    ax0.scatter(
        x[idx_nonsig],
        y[idx_nonsig],
        color="r",
        s=ps,
        label=f"non-significant {recode.unit}",
        zorder=3,
    )

    ax0.axhline(1, color="gray", ls="--", lw=2, zorder=1)
    ax0.set_xscale("log")
    ax0.set_yscale("log")
    ax0.set_title(f"Applicability – {sample_label}", fontsize=14)
    ax0.set_xlabel("Mean of scaled data", fontsize=14)
    ax0.set_ylabel("NVSN variance", fontsize=14)
    ax0.legend(loc="upper left", borderaxespad=0, fontsize=14, markerscale=5).get_frame().set_alpha(0)
    ylim = ax0.set_ylim()

    # right density panel
    ax1 = fig.add_subplot(spec[1])
    ax1.grid(False)
    sns.kdeplot(y=np.log10(norm_var[norm_var > 0]), color="k", fill=True, ax=ax1)
    ax1.axhline(0, c="gray", ls="--", lw=2, zorder=1)
    ax1.axvline(0, c="k", ls="-", lw=1, zorder=1)
    ax1.set_ylim(np.log10(ax0.set_ylim()))
    ax1.tick_params(labelbottom=True, labelleft=False, bottom=True)
    ax1.set_xlabel("Density", fontsize=14)
    ax1.spines["right"].set_visible(False)
    ax1.spines["top"].set_visible(False)
    ax1.tick_params(left=False)
    ax1.patch.set_alpha(0)

    # applicability classification (optional, same as RECODE)
    x_d = np.linspace(ax1.set_ylim()[0], ax1.set_ylim()[1], 1000)
    dens = scipy.stats.gaussian_kde(np.log10(norm_var[norm_var > 0]))(x_d)
    peak_val = x_d[np.argmax(dens)]
    rate_low_var = np.sum(norm_var[norm_var > 0] < 0.90) / len(norm_var[norm_var > 0])

    applicability = "Unknown"
    backcolor = "w"
    if (rate_low_var < 0.01) and (np.abs(peak_val) < 0.1):
        applicability = "Class A (strongly applicable)"
        backcolor = "lightgreen"
    elif rate_low_var < 0.01:
        applicability = "Class B (weakly applicable)"
        backcolor = "yellow"
    else:
        applicability = "Class C (inapplicabile)"
        backcolor = "tomato"

    ax0.text(
        0.99,
        0.982,
        applicability,
        va="top",
        ha="right",
        transform=ax0.transAxes,
        fontsize=14,
        backgroundcolor=backcolor,
    )

    plt.tight_layout()
    fig.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved applicability plot for {sample_label} → {out_png}")


In [6]:
# Example: Normal_chunk1
from pathlib import Path
import screcode

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
results_dir = base / "results"
recode_out_dir = results_dir / "recode_outputs"

# load the chunk AnnData
chunk_path = recode_out_dir / "normal" / "Normal_chunk1.h5ad"
adata_chunk = sc.read_h5ad(chunk_path)

# re-fit RECODE on this chunk to get a recode object with X_temp etc.
recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_chunk)   # fit, not fit_transform, to keep X_temp consistent

out_png = recode_out_dir / "normal" / "Normal_chunk1_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_chunk,
    sample_label="Normal_chunk1",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

Saved applicability plot for Normal_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/normal/Normal_chunk1_RECODE_applicability_sourceLike.png


In [None]:
# Example: Normal_chunk1
from pathlib import Path
import screcode

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
results_dir = base / "results"
recode_out_dir = results_dir / "recode_outputs"

# load the chunk AnnData
chunk_path = recode_out_dir / "normal" / "Normal_chunk1.h5ad"
adata_chunk = sc.read_h5ad(chunk_path)

# re-fit RECODE on this chunk to get a recode object with X_temp etc.
recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_chunk)   # fit, not fit_transform, to keep X_temp consistent

out_png = recode_out_dir / "normal" / "Normal_chunk1_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_chunk,
    sample_label="Normal_chunk1",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

In [7]:
chunk_path = recode_out_dir / "er_positive" / "ER_Positive_chunk1.h5ad"
adata_chunk = sc.read_h5ad(chunk_path)

recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_chunk)

out_png = recode_out_dir / "er_positive" / "ER_Positive_chunk1_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_chunk,
    sample_label="ER_Positive_chunk1",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

Saved applicability plot for ER_Positive_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1_RECODE_applicability_sourceLike.png


In [8]:
chunk_path = recode_out_dir / "her2_positive" / "HER2_Positive_chunk1.h5ad"
adata_chunk = sc.read_h5ad(chunk_path)

recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_chunk)

out_png = recode_out_dir / "her2_positive" / "HER2_Positive_chunk1_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_chunk,
    sample_label="HER2_Positive_chunk1",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

Saved applicability plot for HER2_Positive_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1_RECODE_applicability_sourceLike.png


In [9]:
tn_dir = recode_out_dir / "tn"
adata_path = results_dir / "adata_triplenegative_epithelial_improved.h5ad"
adata_tn = sc.read_h5ad(adata_path)

recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_tn)

out_png = tn_dir / "TN_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_tn,
    sample_label="TN",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

  utils.warn_names_duplicates("obs")


Saved applicability plot for TN → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_applicability_sourceLike.png


In [10]:
tn_brca1_dir = recode_out_dir / "tn_brca1"
chunk_path = tn_brca1_dir / "TN_BRCA1_chunk1.h5ad"
adata_chunk = sc.read_h5ad(chunk_path)

recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_chunk)

out_png = tn_brca1_dir / "TN_BRCA1_chunk1_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_chunk,
    sample_label="TN_BRCA1_chunk1",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

Saved applicability plot for TN_BRCA1_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1_RECODE_applicability_sourceLike.png


In [11]:
preneo_dir = recode_out_dir / "preneoplastic"
adata_path = results_dir / "adata_brca1_preneoplastic_epithelial_improved.h5ad"
adata_preneo = sc.read_h5ad(adata_path)

recode = screcode.RECODE(
    assay="RNA",
    version=2,
    solver="auto",
    downsampling_rate=0.2,
    fast_algorithm=True,
    log_normalize=True,
    target_sum=1e5,
    verbose=True,
)
recode.fit(adata_preneo)

out_png = preneo_dir / "BRCA1_PreNeoplastic_RECODE_applicability_sourceLike.png"
plot_recode_applicability_from_recode(
    recode=recode,
    adata_chunk=adata_preneo,
    sample_label="BRCA1_PreNeoplastic",
    out_png=out_png,
    figsize=(10, 5),
    ps=2,
)

  utils.warn_names_duplicates("obs")


Saved applicability plot for BRCA1_PreNeoplastic → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_applicability_sourceLike.png


## PC varience modification/elimination

now i would like to recrete the PC variance modification/elimination plot from teh RECODE software, i will use RECODE source code and modify it for my data

This is a slightly customized version of plot_variance_modified_data that:
- Uses the same PCA_Ev, PCA_Ev_NRM, and ell from recode.recode_
- Adds the “PC variance elimination” line at 0 with orange points to match your screenshot.

In [5]:
def plot_pc_variance_from_recode(
    recode,
    sample_label: str,
    out_png: Path,
    figsize=(7, 3.0),
):
    """
    RECODE-like 'PC variance modification/elimination' plot
    with styling close to the original figures.
    """
    ps = 18
    fs_title = 16
    fs_label = 14

    # original eigenvalues (positive)
    plot_EV = recode.recode_.PCA_Ev[recode.recode_.PCA_Ev > 0]
    n_EV = len(plot_EV)
    x_pc = np.arange(n_EV) + 1

    # modified eigenvalues for first ell PCs
    ell = recode.recode_.ell
    plot_EV_mod = np.zeros(n_EV)
    plot_EV_mod[:ell] = recode.recode_.PCA_Ev_NRM[:ell]

    fig, ax = plt.subplots(figsize=figsize)
    plt.rcParams["xtick.direction"] = "in"
    plt.rcParams["ytick.direction"] = "in"

    # clean background
    ax.grid(False)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    # Original (light blue triangles)
    ax.scatter(
        x_pc,
        plot_EV,
        color="lightblue",
        label="Original",
        marker="^",
        s=ps,
        zorder=1,
    )

    # PC variance modification (green dots) only up to ell
    ax.scatter(
        x_pc[:ell],
        plot_EV_mod[:ell],
        color="tab:green",
        label="PC variance modification",
        marker="o",
        s=ps,
        zorder=2,
    )

    # PC variance elimination (orange) from ell to end at a small positive value
    # choose a tiny positive constant so the line is just above the axis, like theirs
    elim_val = 1.0        # try 1.0 first; adjust later if needed
    elim_y = np.full(n_EV - ell, elim_val)
    ax.scatter(
        x_pc[ell:],
        elim_y,
        color="orange",
        label="PC variance elimination",
        marker="o",
        s=ps,
        zorder=1,
    )

    # vertical dashed line and ℓ label
    ax.set_yscale("symlog")
    ax.set_ylim(0.1, max(plot_EV) * 1.5)
    ax.axvline(ell, color="gray", ls="--")
    ax.axhline(elim_val, color="gray", ls="--") 

    # place ℓ label visibly above the axis (independent of elim_val)
    ax.text(
    ell * 1.02,          # a tiny step to the right of the vertical line
    elim_val,            # EXACTLY the same height as the orange line
    r"$\ell$=%d" % ell,
    color="k",
    fontsize=14,
    ha="left",
    va="bottom",
)

    ax.set_xlabel("PC", fontsize=fs_label)
    ax.set_ylabel("PC variance (eigenvalue)", fontsize=fs_label)
    ax.set_yscale("symlog")
    ax.set_ylim(0.1, max(plot_EV) * 1.5)

    ax.legend(
        loc="upper right",
        borderaxespad=0,
        fontsize=12,
        markerscale=1.8,
        handletextpad=0.4,
    ).get_frame().set_alpha(0)

    ax.set_title(f"PC variance modification/elimination – {sample_label}", fontsize=fs_title)

    plt.tight_layout()
    fig.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved PC variance plot for {sample_label} → {out_png}")

In [52]:
normal_chunk1_path = recode_out_dir / "normal" / "Normal_chunk1.h5ad"
adata_normal_chunk1 = sc.read_h5ad(normal_chunk1_path)

adata_normal_recode1, normal_sig_genes1, normal_gene_stats1, recode_normal1 = (
    run_recode_on_adata(
        adata_normal_chunk1,
        label="Normal_chunk1",
        out_dir=recode_out_dir / "normal",
    )
)

out_png_pc_normal = (
    recode_out_dir / "normal" / "Normal_chunk1_RECODE_PCvariance_sourceLike.png"
)
plot_pc_variance_from_recode(
    recode=recode_normal1,
    sample_label="Normal_chunk1",
    out_png=out_png_pc_normal,
    figsize=(7, 3.5),
)

[Normal_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11564, '#non-significant genes': 8604, '#silent genes': 9802, 'ell': 97, 'Elapsed time': '0h 0m 57s 859ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11564
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/normal/Normal_chunk1_RECODE_gene_stats.csv
Saved PC variance plot for Normal_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/normal/Normal_chunk1_RECODE_PCvariance_sourceLike.png


In [9]:
from pathlib import Path

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

# 1) Load ER_Positive chunk 1
er_chunk1_path = recode_out_dir / "er_positive" / "ER_Positive_chunk1.h5ad"
adata_erpos_chunk1 = sc.read_h5ad(er_chunk1_path)

# 2) Run RECODE with the same settings as for Normal_chunk1
adata_erpos_recode1, erpos_sig_genes1, erpos_gene_stats1, recode_erpos1 = (
    run_recode_on_adata(
        adata=adata_erpos_chunk1,
        label="ER_Positive_chunk1",
        out_dir=recode_out_dir / "er_positive",
        max_cells_for_learning=adata_erpos_chunk1.n_obs,  # no subsampling
        subsample_by=None,
        random_state=0,
    )
)

# 3) PC variance modification/elimination plot
out_png_pc_erpos = (
    recode_out_dir / "er_positive" / "ER_Positive_chunk1_RECODE_PCvariance_sourceLike.png"
)

plot_pc_variance_from_recode(
    recode=recode_erpos1,
    sample_label="ER_Positive_chunk1",
    out_png=out_png_pc_erpos,
    figsize=(7, 3.5),
)

[ER_Positive_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11390, '#non-significant genes': 9381, '#silent genes': 9446, 'ell': 75, 'Elapsed time': '0h 0m 24s 096ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11390
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1_RECODE_gene_stats.csv
Saved PC variance plot for ER_Positive_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1_RECODE_PCvariance_sourceLike.png


In [6]:
from pathlib import Path
import scanpy as sc

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

# 1) Load HER2_Positive chunk 1
her2_chunk1_path = recode_out_dir / "her2_positive" / "HER2_Positive_chunk1.h5ad"
adata_her2_chunk1 = sc.read_h5ad(her2_chunk1_path)

# 2) Run RECODE with the same settings as for Normal_chunk1
adata_her2_recode1, her2_sig_genes1, her2_gene_stats1, recode_her2_1 = (
    run_recode_on_adata(
        adata=adata_her2_chunk1,
        label="HER2_Positive_chunk1",
        out_dir=recode_out_dir / "her2_positive",
        max_cells_for_learning=adata_her2_chunk1.n_obs,  # no subsampling
        subsample_by=None,
        random_state=0,
    )
)

# 3) PC variance modification/elimination plot
out_png_pc_her2 = (
    recode_out_dir / "her2_positive" / "HER2_Positive_chunk1_RECODE_PCvariance_sourceLike.png"
)

plot_pc_variance_from_recode(
    recode=recode_her2_1,
    sample_label="HER2_Positive_chunk1",
    out_png=out_png_pc_her2,
    figsize=(7, 3.5),
)

[HER2_Positive_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 12519, '#non-significant genes': 9261, '#silent genes': 8431, 'ell': 85, 'Elapsed time': '0h 0m 24s 905ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 12519
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1_RECODE_gene_stats.csv
Saved PC variance plot for HER2_Positive_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1_RECODE_PCvariance_sourceLike.png


In [7]:
from pathlib import Path
import scanpy as sc
import pandas as pd

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

tn_out_dir = recode_out_dir / "tn"
tn_out_dir.mkdir(exist_ok=True, parents=True)

sample_label = "TN"
sample_path = base / "results" / "adata_triplenegative_epithelial_improved.h5ad"

adata_tn = sc.read_h5ad(sample_path)
adata_tn.obs_names_make_unique()
print(sample_label, ":", adata_tn.n_obs, "cells,", adata_tn.n_vars, "genes")

# Run RECODE on all cells (no chunking), now capturing 'recode_tn'
adata_tn_recode, tn_sig_genes, tn_gene_stats, recode_tn = run_recode_on_adata(
    adata=adata_tn,
    label=sample_label,
    out_dir=tn_out_dir,
    max_cells_for_learning=adata_tn.n_obs,
    subsample_by=None,
    random_state=0,
)

adata_tn_hvg = adata_tn[:, tn_sig_genes].copy()

tn_wgcna_path = tn_out_dir / "TN_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_tn_hvg.X.toarray() if hasattr(adata_tn_hvg.X, "toarray") else adata_tn_hvg.X,
    index=adata_tn_hvg.obs_names,
    columns=adata_tn_hvg.var_names,
).to_csv(tn_wgcna_path)
print("Saved TN WGCNA input:", tn_wgcna_path)

# PC variance modification/elimination plot for TN
out_png_pc_tn = tn_out_dir / "TN_RECODE_PCvariance_sourceLike.png"
plot_pc_variance_from_recode(
    recode=recode_tn,
    sample_label="TN",
    out_png=out_png_pc_tn,
    figsize=(7, 3.5),
)

TN : 7561 cells, 33514 genes
[TN] RECODE setup
  Cells: 7,561, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 16380, '#non-significant genes': 7234, '#silent genes': 9900, 'ell': 133, 'Elapsed time': '0h 1m 40s 291ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 16380
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_gene_stats.csv
Saved TN WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_sig_genes_cellsxgenes_forWGCNA.csv
Saved PC variance plot for TN → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_PCvariance_sourceLike.png


In [6]:
import scanpy as sc
from pathlib import Path

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

# 1) Load TN_BRCA1 chunk 1
tnbrca1_chunk1_path = recode_out_dir / "tn_brca1" / "TN_BRCA1_chunk1.h5ad"
adata_tnbrca1_chunk1 = sc.read_h5ad(tnbrca1_chunk1_path)

# 2) Run RECODE with same settings (no subsampling)
adata_tnbrca1_recode1, tnbrca1_sig_genes1, tnbrca1_gene_stats1, recode_tnbrca1_1 = (
    run_recode_on_adata(
        adata=adata_tnbrca1_chunk1,
        label="TN_BRCA1_chunk1",
        out_dir=recode_out_dir / "tn_brca1",
        max_cells_for_learning=adata_tnbrca1_chunk1.n_obs,  # use all cells
        subsample_by=None,
        random_state=0,
    )
)

# 3) PC variance modification/elimination plot
out_png_pc_tnbrca1 = (
    recode_out_dir / "tn_brca1" / "TN_BRCA1_chunk1_RECODE_PCvariance_sourceLike.png"
)

plot_pc_variance_from_recode(
    recode=recode_tnbrca1_1,
    sample_label="TN_BRCA1_chunk1",
    out_png=out_png_pc_tnbrca1,
    figsize=(7, 3.5),
)

[TN_BRCA1_chunk1] RECODE setup
  Cells: 7,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15877, '#non-significant genes': 8406, '#silent genes': 9231, 'ell': 113, 'Elapsed time': '0h 0m 53s 802ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15877
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1_RECODE_gene_stats.csv
Saved PC variance plot for TN_BRCA1_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1_RECODE_PCvariance_sourceLike.png


In [7]:
import scanpy as sc
from pathlib import Path

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

pre_out_dir = recode_out_dir / "preneoplastic"
pre_out_dir.mkdir(exist_ok=True, parents=True)

sample_label = "BRCA1_PreNeoplastic"
sample_path = base / "results" / "adata_brca1_preneoplastic_epithelial_improved.h5ad"  # <- check filename

# 1) Load full pre-neoplastic dataset
adata_pre = sc.read_h5ad(sample_path)
adata_pre.obs_names_make_unique()
print(sample_label, ":", adata_pre.n_obs, "cells,", adata_pre.n_vars, "genes")

# 2) Run RECODE on all cells (no subsampling), capturing 'recode_pre'
adata_pre_recode, pre_sig_genes, pre_gene_stats, recode_pre = run_recode_on_adata(
    adata=adata_pre,
    label=sample_label,
    out_dir=pre_out_dir,
    max_cells_for_learning=adata_pre.n_obs,
    subsample_by=None,
    random_state=0,
)

# 3) PC variance modification/elimination plot
out_png_pc_pre = pre_out_dir / "BRCA1_PreNeoplastic_RECODE_PCvariance_sourceLike.png"

plot_pc_variance_from_recode(
    recode=recode_pre,
    sample_label=sample_label,
    out_png=out_png_pc_pre,
    figsize=(7, 3.5),
)

BRCA1_PreNeoplastic : 7644 cells, 33514 genes
[BRCA1_PreNeoplastic] RECODE setup
  Cells: 7,644, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15864, '#non-significant genes': 6287, '#silent genes': 11363, 'ell': 173, 'Elapsed time': '0h 1m 13s 801ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15864
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_gene_stats.csv
Saved PC variance plot for BRCA1_PreNeoplastic → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_PCvariance_sourceLike.png


## Mean-variance plot (log-normalized data)

In [6]:
# Helper for a fitted RECODE object recode (the same one used for the PC‑variance plots)

def plot_mean_variance_from_recode(
    recode,
    sample_label: str,
    out_png: Path,
    figsize=(7, 5),
    ps=2,
    target_sum="median",
):
    """
    RECODE-like 'Mean-variance plot (log-normalized data)'.

    Left: original log-normalized data, right: RECODE-denoised log-normalized data.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    # --- this logic follows RECODE's plot_mean_variance ---
    # X_temp is the raw count matrix stored in the RECODE object
    X = recode.X_temp  # cells x genes

    # log-normalize original
    if target_sum == "median":
        size_factor = np.median(np.sum(X, axis=1))
    elif target_sum == "mean":
        size_factor = np.mean(np.sum(X, axis=1))
    else:
        size_factor = float(target_sum)

    X_scaled = X / np.sum(X, axis=1)[:, np.newaxis] * size_factor
    X_log = np.log1p(X_scaled)

    mean_orig = np.mean(X_log, axis=0)
    var_orig = np.var(X_log, axis=0, ddof=1)

    # RECODE-denoised counts (X_RECODE already on same cells × nonsilent genes)
    X_rec = recode.X_RECODE[:, recode.idx_nonsilent]
    X_rec_scaled = X_rec / np.sum(X_rec, axis=1)[:, np.newaxis] * size_factor
    X_rec_log = np.log1p(X_rec_scaled)

    mean_rec = np.mean(X_rec_log, axis=0)
    var_rec = np.var(X_rec_log, axis=0, ddof=1)

    # --- plotting ---
    fig, axes = plt.subplots(1, 2, figsize=figsize, sharey=True)
    (ax0, ax1) = axes

    ax0.scatter(mean_orig, var_orig, s=ps, color="b")
    ax0.axhline(0, color="gray", ls="--", lw=2)
    ax0.set_title("Original")
    ax0.set_xlabel("Mean")
    ax0.set_ylabel("Variance")   # y-axis label

    # RECODE
    ax1.scatter(mean_rec, var_rec, s=ps, color="b")
    ax1.axhline(0, color="gray", ls="--", lw=2)
    ax1.set_title("RECODE")
    ax1.set_xlabel("Mean")
    ax1.set_ylabel("Variance")   # y-axis label

    # make sure right subplot still shows y tick *labels*
    ax1.yaxis.set_label_position("left")       # but keep the label only on the left
    ax1.tick_params(axis="y", which="both", labelleft=True, labelright=False)
    
    fig.suptitle(f"Mean-variance plot (log-normalized data) – {sample_label}")

    for ax in axes:
        ax.grid(False)
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)

    plt.tight_layout()
    fig.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved mean-variance plot for {sample_label} → {out_png}")

In [9]:
normal_chunk1_path = recode_out_dir / "normal" / "Normal_chunk1.h5ad"
adata_normal_chunk1 = sc.read_h5ad(normal_chunk1_path)

adata_normal_recode1, normal_sig_genes1, normal_gene_stats1, recode_normal1 = (
    run_recode_on_adata(
        adata=adata_normal_chunk1,
        label="Normal_chunk1",
        out_dir=recode_out_dir / "normal",
        max_cells_for_learning=adata_normal_chunk1.n_obs,
        subsample_by=None,
        random_state=0,
    )
)

[Normal_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11564, '#non-significant genes': 8604, '#silent genes': 9802, 'ell': 97, 'Elapsed time': '0h 0m 23s 883ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11564
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/normal/Normal_chunk1_RECODE_gene_stats.csv


In [36]:
out_png_mv_normal = (
    recode_out_dir / "normal" / "Normal_chunk1_RECODE_mean_variance_sourceLike.png"
)

plot_mean_variance_from_recode(
    recode=recode_normal1,
    sample_label="Normal_chunk1",
    out_png=out_png_mv_normal,
    figsize=(7, 3.0),
    ps=2,
)

Saved mean-variance plot for Normal_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/normal/Normal_chunk1_RECODE_mean_variance_sourceLike.png


In [5]:
er_chunk1_path = recode_out_dir / "er_positive" / "ER_Positive_chunk1.h5ad"
adata_erpos_chunk1 = sc.read_h5ad(er_chunk1_path)

adata_erpos_recode1, erpos_sig_genes1, erpos_gene_stats1, recode_erpos1 = (
    run_recode_on_adata(
        adata=adata_erpos_chunk1,
        label="ER_Positive_chunk1",
        out_dir=recode_out_dir / "er_positive",
        max_cells_for_learning=adata_erpos_chunk1.n_obs,  # no subsampling
        subsample_by=None,
        random_state=0,
    )
)

[ER_Positive_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 11390, '#non-significant genes': 9381, '#silent genes': 9446, 'ell': 75, 'Elapsed time': '0h 0m 23s 902ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 11390
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1_RECODE_gene_stats.csv


In [6]:
out_png_mv_erpos = (
    recode_out_dir / "er_positive" / "ER_Positive_chunk1_RECODE_mean_variance_sourceLike.png"
)

plot_mean_variance_from_recode(
    recode=recode_erpos1,             # ER+ RECODE object
    sample_label="ER_Positive_chunk1",
    out_png=out_png_mv_erpos,
    figsize=(7, 3.0),
    ps=2,
)

Saved mean-variance plot for ER_Positive_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/er_positive/ER_Positive_chunk1_RECODE_mean_variance_sourceLike.png


In [5]:
her2_chunk1_path = recode_out_dir / "her2_positive" / "HER2_Positive_chunk1.h5ad"
adata_her2_chunk1 = sc.read_h5ad(her2_chunk1_path)

adata_her2_recode1, her2_sig_genes1, her2_gene_stats1, recode_her2_1 = (
    run_recode_on_adata(
        adata=adata_her2_chunk1,
        label="HER2_Positive_chunk1",
        out_dir=recode_out_dir / "her2_positive",
        max_cells_for_learning=adata_her2_chunk1.n_obs,
        subsample_by=None,
        random_state=0,
    )
)

[HER2_Positive_chunk1] RECODE setup
  Cells: 10,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 12519, '#non-significant genes': 9261, '#silent genes': 8431, 'ell': 85, 'Elapsed time': '0h 0m 25s 228ms', 'solver': 'randomized', '#train_data': 2000}
  RECODE finished.
  Significant genes (HVGs): 12519
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1_RECODE_gene_stats.csv


In [6]:
out_png_mv_her2 = (
    recode_out_dir
    / "her2_positive"
    / "HER2_Positive_chunk1_RECODE_mean_variance_sourceLike.png"
)

plot_mean_variance_from_recode(
    recode=recode_her2_1,              # HER2+ RECODE object
    sample_label="HER2_Positive_chunk1",
    out_png=out_png_mv_her2,
    figsize=(7, 3.0),
    ps=2,
)

Saved mean-variance plot for HER2_Positive_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/her2_positive/HER2_Positive_chunk1_RECODE_mean_variance_sourceLike.png


In [8]:
tn_out_dir = recode_out_dir / "tn"
tn_out_dir.mkdir(exist_ok=True, parents=True)

sample_label = "TN"
sample_path = base / "results" / "adata_triplenegative_epithelial_improved.h5ad"

adata_tn = sc.read_h5ad(sample_path)
adata_tn.obs_names_make_unique()
print(sample_label, ":", adata_tn.n_obs, "cells,", adata_tn.n_vars, "genes")

adata_tn_recode, tn_sig_genes, tn_gene_stats, recode_tn = run_recode_on_adata(
    adata=adata_tn,
    label=sample_label,
    out_dir=tn_out_dir,
    max_cells_for_learning=adata_tn.n_obs,  # use all cells
    subsample_by=None,
    random_state=0,
)

TN : 7561 cells, 33514 genes
[TN] RECODE setup
  Cells: 7,561, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 16380, '#non-significant genes': 7234, '#silent genes': 9900, 'ell': 133, 'Elapsed time': '0h 0m 59s 217ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 16380
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_gene_stats.csv


In [9]:
out_png_mv_tn = tn_out_dir / "TN_RECODE_mean_variance_sourceLike.png"

plot_mean_variance_from_recode(
    recode=recode_tn,
    sample_label="TN",
    out_png=out_png_mv_tn,
    figsize=(7, 3.0),
    ps=2,
)

Saved mean-variance plot for TN → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_mean_variance_sourceLike.png


In [10]:
tnbrca1_out_dir = recode_out_dir / "tn_brca1"
tnbrca1_out_dir.mkdir(exist_ok=True, parents=True)

tnbrca1_chunk1_path = tnbrca1_out_dir / "TN_BRCA1_chunk1.h5ad"
adata_tnbrca1_chunk1 = sc.read_h5ad(tnbrca1_chunk1_path)
adata_tnbrca1_chunk1.obs_names_make_unique()

adata_tnbrca1_recode1, tnbrca1_sig_genes1, tnbrca1_gene_stats1, recode_tnbrca1_1 = (
    run_recode_on_adata(
        adata=adata_tnbrca1_chunk1,
        label="TN_BRCA1_chunk1",
        out_dir=tnbrca1_out_dir,
        max_cells_for_learning=adata_tnbrca1_chunk1.n_obs,
        subsample_by=None,
        random_state=0,
    )
)

[TN_BRCA1_chunk1] RECODE setup
  Cells: 7,000, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15877, '#non-significant genes': 8406, '#silent genes': 9231, 'ell': 113, 'Elapsed time': '0h 1m 12s 260ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15877
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1_RECODE_gene_stats.csv


In [11]:
out_png_mv_tnbrca1 = (
    tnbrca1_out_dir / "TN_BRCA1_chunk1_RECODE_mean_variance_sourceLike.png"
)

plot_mean_variance_from_recode(
    recode=recode_tnbrca1_1,
    sample_label="TN_BRCA1_chunk1",
    out_png=out_png_mv_tnbrca1,
    figsize=(7, 3.0),
    ps=2,
)

Saved mean-variance plot for TN_BRCA1_chunk1 → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn_brca1/TN_BRCA1_chunk1_RECODE_mean_variance_sourceLike.png


In [7]:
pre_out_dir = recode_out_dir / "preneoplastic"
pre_out_dir.mkdir(exist_ok=True, parents=True)

sample_label = "BRCA1_PreNeoplastic"
pre_path = base / "results" / "adata_brca1_preneoplastic_epithelial_improved.h5ad"  # adjust if filename differs
adata_pre = sc.read_h5ad(pre_path)
adata_pre.obs_names_make_unique()

adata_pre_recode, pre_sig_genes, pre_gene_stats, recode_pre = run_recode_on_adata(
    adata=adata_pre,
    label=sample_label,
    out_dir=pre_out_dir,
    max_cells_for_learning=adata_pre.n_obs,
    subsample_by=None,
    random_state=0,
)

[BRCA1_PreNeoplastic] RECODE setup
  Cells: 7,644, Genes: 33,514
  → Using all cells for RECODE learning (no subsampling).
  Starting RECODE.
start RECODE for scRNA-seq data
Normalized data are stored as "RECODE" in adata.layers
Normalized data are stored as "RECODE_norm" and "RECODE_log" in adata.layers
end RECODE for scRNA-seq
log: {'assay': 'RNA', '#significant genes': 15864, '#non-significant genes': 6287, '#silent genes': 11363, 'ell': 173, 'Elapsed time': '0h 0m 59s 322ms', 'solver': 'full'}
  RECODE finished.
  Significant genes (HVGs): 15864
  Saved gene stats: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_gene_stats.csv


In [8]:
out_png_mv_pre = (
    pre_out_dir / "BRCA1_PreNeoplastic_RECODE_mean_variance_sourceLike.png"
)

plot_mean_variance_from_recode(
    recode=recode_pre,
    sample_label=sample_label,
    out_png=out_png_mv_pre,
    figsize=(7, 3.0),
    ps=2,
)

Saved mean-variance plot for BRCA1_PreNeoplastic → /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_mean_variance_sourceLike.png


In [7]:
from pathlib import Path
import scanpy as sc
import pandas as pd
import numpy as np

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

tn_out_dir = recode_out_dir / "tn"
tn_out_dir.mkdir(exist_ok=True, parents=True)

# 1) Read TN gene stats produced by RECODE
tn_stats_path = tn_out_dir / "TN_RECODE_gene_stats.csv"
tn_gene_stats = pd.read_csv(tn_stats_path)

# 2) Get all significant genes from this (single) run
tn_sig_genes = tn_gene_stats.loc[
    tn_gene_stats["significance_RECODE"].astype(str) == "significant",
    "gene"
].tolist()
print("TN significant genes in stats:", len(tn_sig_genes))

# 3) Create union and 'atleast2' lists
#    With no chunking, we treat 'atleast2' == union so downstream code works.
tn_union = sorted(set(tn_sig_genes))
tn_atleast2 = tn_union

print("TN union HVGs:", len(tn_union))
print("TN HVGs significant in ≥2 chunks (using union here):", len(tn_atleast2))

# 4) Save union and atleast2
pd.Series(tn_union, name="gene").to_csv(
    tn_out_dir / "TN_RECODE_sig_genes_union.txt", index=False
)
pd.Series(tn_atleast2, name="gene").to_csv(
    tn_out_dir / "TN_RECODE_sig_genes_atleast2.txt", index=False
)
print("Saved TN_RECODE_sig_genes_union.txt and TN_RECODE_sig_genes_atleast2.txt")

TN significant genes in stats: 16380
TN union HVGs: 16380
TN HVGs significant in ≥2 chunks (using union here): 16380
Saved TN_RECODE_sig_genes_union.txt and TN_RECODE_sig_genes_atleast2.txt


In [8]:
# 5) Build WGCNA matrix using TN_RECODE_sig_genes_atleast2.txt

# Reload original TN AnnData (not chunked)
tn_path = base / "results" / "adata_triplenegative_epithelial_improved.h5ad"
adata_tn = sc.read_h5ad(tn_path)
adata_tn.obs_names_make_unique()

# Read merged genes
merged_genes_tn = pd.read_csv(
    tn_out_dir / "TN_RECODE_sig_genes_atleast2.txt"
)["gene"].tolist()

# Subset
adata_tn_hvg = adata_tn[:, merged_genes_tn].copy()

# Save cells×genes matrix for WGCNA
tn_wgcna_path = tn_out_dir / "TN_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_tn_hvg.X.toarray() if hasattr(adata_tn_hvg.X, "toarray") else adata_tn_hvg.X,
    index=adata_tn_hvg.obs_names,
    columns=adata_tn_hvg.var_names,
).to_csv(tn_wgcna_path)

print("Saved TN WGCNA input:", tn_wgcna_path)

Saved TN WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/tn/TN_RECODE_sig_genes_cellsxgenes_forWGCNA.csv


In [11]:
from pathlib import Path
import scanpy as sc
import pandas as pd
import numpy as np

base = Path("/triumvirate/home/alexarol/breast_cancer_analysis")
recode_out_dir = base / "results" / "recode_outputs"

pre_out_dir = recode_out_dir / "preneoplastic"   # this is your directory

# 1) Read preneoplastic gene stats (actual filename)
pre_stats_path = pre_out_dir / "BRCA1_PreNeoplastic_RECODE_gene_stats.csv"
pre_gene_stats = pd.read_csv(pre_stats_path)

# 2) Significant genes
pre_sig_genes = pre_gene_stats.loc[
    pre_gene_stats["significance_RECODE"].astype(str) == "significant",
    "gene"
].tolist()
print("Preneoplastic significant genes:", len(pre_sig_genes))

# 3) Union and atleast2 (no chunking → same list)
pre_union = sorted(set(pre_sig_genes))
pre_atleast2 = pre_union

print("Preneoplastic union HVGs:", len(pre_union))
print("Preneoplastic HVGs significant in ≥2 chunks (using union):", len(pre_atleast2))

# 4) Save with consistent naming
pd.Series(pre_union, name="gene").to_csv(
    pre_out_dir / "BRCA1_PreNeoplastic_RECODE_sig_genes_union.txt", index=False
)
pd.Series(pre_atleast2, name="gene").to_csv(
    pre_out_dir / "BRCA1_PreNeoplastic_RECODE_sig_genes_atleast2.txt", index=False
)
print("Saved BRCA1_PreNeoplastic_RECODE_sig_genes_union.txt and BRCA1_PreNeoplastic_RECODE_sig_genes_atleast2.txt")

Preneoplastic significant genes: 15864
Preneoplastic union HVGs: 15864
Preneoplastic HVGs significant in ≥2 chunks (using union): 15864
Saved BRCA1_PreNeoplastic_RECODE_sig_genes_union.txt and BRCA1_PreNeoplastic_RECODE_sig_genes_atleast2.txt


In [13]:
pre_path = base / "results" / "adata_brca1_preneoplastic_epithelial_improved.h5ad"  # adjust if needed
adata_pre = sc.read_h5ad(pre_path)
adata_pre.obs_names_make_unique()

merged_genes_pre = pd.read_csv(
    pre_out_dir / "BRCA1_PreNeoplastic_RECODE_sig_genes_atleast2.txt"
)["gene"].tolist()

adata_pre_hvg = adata_pre[:, merged_genes_pre].copy()

pre_wgcna_path = pre_out_dir / "BRCA1_PreNeoplastic_RECODE_sig_genes_cellsxgenes_forWGCNA.csv"
pd.DataFrame(
    adata_pre_hvg.X.toarray() if hasattr(adata_pre_hvg.X, "toarray") else adata_pre_hvg.X,
    index=adata_pre_hvg.obs_names,
    columns=adata_pre_hvg.var_names,
).to_csv(pre_wgcna_path)

print("Saved Preneoplastic WGCNA input:", pre_wgcna_path)

Saved Preneoplastic WGCNA input: /triumvirate/home/alexarol/breast_cancer_analysis/results/recode_outputs/preneoplastic/BRCA1_PreNeoplastic_RECODE_sig_genes_cellsxgenes_forWGCNA.csv
