COMPREHENSIVE GENE CO-EXPRESSION NETWORK ANALYSIS

Refactored for: Robust metrics, Pre-neoplastic integration, Fold-change, Enrichment

WHY THIS APPROACH:
- Multiple correlation metrics (Spearman, Kendall, Blomqvist, Pearson, Hoeffding, MI)
- Weighted robust averaging (not single metric)
- Deep exploratory analysis before networks
- Domain-specific biological signatures
- Expression directionality (up/down regulation)
- Therapeutic target identification

# SECTION 1: SETUP & DATA INTEGRATION

WHY:
- Load all 6 datasets (5 cancer groups + pre-neoplastic)
- Verify data integrity
- Prepare for downstream analysis

DATA:
- 6 adata_*_epithelial_improved.h5ad files from Phase 2

OUTPUT:
- All 6 datasets loaded in memory
- Data structure verified

NEXT USE:
- All downstream analyses

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
from pathlib import Path
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr, kendalltau, rankdata    # ← UPDATED
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import networkx as nx
from itertools import combinations
import subprocess
import sys
import time                                     # ← ADDED
subprocess.check_call([sys.executable, "-m", "pip", "install", "gseapy", "-q"])
import gseapy as gp
import io
import contextlib
import pickle
import psutil
import os
from datetime import datetime
import shutil
import gc

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

#defined directories
BASE_DIR = Path('/triumvirate/home/alexarol/breast_cancer_analysis')
RESULTS_DIR = BASE_DIR / 'results'
PHASE3_DIR = RESULTS_DIR / 'phase3_networks_refactored'
PHASE3_DIR.mkdir(exist_ok=True)

#defined subfolders
(PHASE3_DIR / 'exploratory_analysis').mkdir(exist_ok=True)
(PHASE3_DIR / 'correlation_metrics').mkdir(exist_ok=True)
(PHASE3_DIR / 'networks').mkdir(exist_ok=True)
(PHASE3_DIR / 'enrichment').mkdir(exist_ok=True)
(PHASE3_DIR / 'expression_directionality').mkdir(exist_ok=True)
(PHASE3_DIR / 'hub_analysis').mkdir(exist_ok=True)
(PHASE3_DIR / 'comparison').mkdir(exist_ok=True)
(PHASE3_DIR / 'therapeutic_targets').mkdir(exist_ok=True)

#defined group names and file paths
GROUPS = {
    'Normal': RESULTS_DIR / 'adata_normal_epithelial_improved.h5ad',
    'ER_Positive': RESULTS_DIR / 'adata_er_positive_epithelial_improved.h5ad',
    'HER2_Positive': RESULTS_DIR / 'adata_her2_positive_epithelial_improved.h5ad',
    'TripleNegative': RESULTS_DIR / 'adata_triplenegative_epithelial_improved.h5ad',
    'TripleNegative_BRCA1': RESULTS_DIR / 'adata_triplenegative_brca1_epithelial_improved.h5ad',
    'BRCA1_PreNeoplastic': RESULTS_DIR / 'adata_brca1_preneoplastic_epithelial_improved.h5ad',
}

#loaded all datasets
print(f'loading {len(GROUPS)} epithelial datasets...\n')

datasets = {}
for group_name, file_path in GROUPS.items():
    if file_path.exists():
        adata = sc.read_h5ad(file_path)
        datasets[group_name] = adata
        print(f'✓ {group_name:25} {adata.n_obs:,} cells × {adata.n_vars:,} genes')
    else:
        print(f'✗ {group_name:25} FILE NOT FOUND: {file_path}')

print(f'\nOK all datasets loaded\n')

#verified data structure
print(f'verifying data structure:\n')
for group_name, adata in datasets.items():
    print(f'{group_name}:')
    print(f'  adata.X type: {type(adata.X)} (should be sparse matrix)')
    print(f'  X format: {"raw UMI counts" if adata.X.max() < 1000 else "possibly normalized"}')
    print(f'  adata.obs columns: {list(adata.obs.columns)}')
    print(f'  sample_types: {adata.obs["sample_type"].unique() if "sample_type" in adata.obs else "NOT FOUND"}')
    print()

print(f'OK data structure verified\n')

loading 6 epithelial datasets...

✓ Normal                    83,522 cells × 33,514 genes
✓ ER_Positive               91,908 cells × 33,514 genes
✓ HER2_Positive             19,693 cells × 33,514 genes
✓ TripleNegative            7,561 cells × 33,514 genes
✓ TripleNegative_BRCA1      14,186 cells × 33,514 genes
✓ BRCA1_PreNeoplastic       7,644 cells × 33,514 genes

OK all datasets loaded

verifying data structure:

Normal:
  adata.X type: <class 'scipy.sparse._csr.csr_matrix'> (should be sparse matrix)
  X format: possibly normalized
  adata.obs columns: ['barcode', 'sample_name', 'sample_type', 'geo_id', 'cell_type', 'epithelial_score', 'immune_score', 'molecular_subtype']
  sample_types: ['Normal']
Categories (1, object): ['Normal']

ER_Positive:
  adata.X type: <class 'scipy.sparse._csr.csr_matrix'> (should be sparse matrix)
  X format: possibly normalized
  adata.obs columns: ['barcode', 'sample_name', 'sample_type', 'geo_id', 'cell_type', 'epithelial_score', 'immune_score', 'mole

# SECTION 1.5: DATA PREPROCESSING

WHY:
- Normalize raw UMI counts to log-scale
- Filter highly variable genes (HVGs)
- Prepare for network analysis

DATA:
- Raw UMI count matrices

OUTPUT:
- Log-normalized, HVG-filtered datasets

NEXT USE:
- Co-expression metric calculation

In [5]:
#processing parameters
N_HVG = 3000  #number of highly variable genes to keep
MIN_GENES = 200  #minimum genes per cell
MIN_CELLS = 3  #minimum cells per gene

processed_datasets = {}

for group_name, adata in datasets.items():
    print(f'processing {group_name}...', end=' ', flush=True)
    
    #made copy to avoid modifying original
    adata = adata.copy()
    
    #basic QC: filter by gene detection
    sc.pp.filter_cells(adata, min_genes=MIN_GENES)
    sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
    
    #normalized to library size (counts per 10,000)
    sc.pp.normalize_total(adata, target_sum=1e4)
    
    #log-transformed (log1p)
    sc.pp.log1p(adata)
    
    #identified highly variable genes
    sc.pp.highly_variable_genes(adata, n_top_genes=N_HVG, batch_key=None)
    
    #subset to HVGs
    adata_hvg = adata[:, adata.var['highly_variable']].copy()
    
    #stored in processed dict
    processed_datasets[group_name] = adata_hvg
    
    print(f'OK {adata_hvg.n_obs:,} cells × {adata_hvg.n_vars:,} HVGs')

print(f'\nOK preprocessing complete\n')

#saved processed datasets for future reference
print(f'saving processed datasets...\n')
for group_name, adata in processed_datasets.items():
    out_file = PHASE3_DIR / f'adata_{group_name.lower()}_hvg_processed.h5ad'
    adata.write(out_file)
    print(f'✓ {out_file.name}')

print(f'\nOK all processed datasets saved\n')

processing Normal... OK 82,380 cells × 3,000 HVGs
processing ER_Positive... OK 91,206 cells × 3,000 HVGs
processing HER2_Positive... OK 19,554 cells × 3,000 HVGs
processing TripleNegative... OK 7,514 cells × 3,000 HVGs
processing TripleNegative_BRCA1... OK 14,100 cells × 3,000 HVGs
processing BRCA1_PreNeoplastic... OK 7,616 cells × 3,000 HVGs

OK preprocessing complete

saving processed datasets...

✓ adata_normal_hvg_processed.h5ad
✓ adata_er_positive_hvg_processed.h5ad
✓ adata_her2_positive_hvg_processed.h5ad
✓ adata_triplenegative_hvg_processed.h5ad
✓ adata_triplenegative_brca1_hvg_processed.h5ad
✓ adata_brca1_preneoplastic_hvg_processed.h5ad

OK all processed datasets saved



# SECTION 2: COMPREHENSIVE EXPLORATORY ANALYSIS

WHY:
- Deeply understand data before network analysis
- Identify technical artifacts
- Characterize biological signals
- Validate group differences

DATA:
- All 6 processed datasets

OUTPUT:
- 6+ exploratory figures
- Summary statistics

NEXT USE:
- Context for network interpretation


## SUBSECTION 2.1: Sample Quality Assessment

In [6]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

#library size distribution
for idx, (group_name, adata) in enumerate(processed_datasets.items()):
    ax = axes[idx // 3, idx % 3]
    
    library_size = adata.X.sum(axis=1)
    ax.hist(library_size, bins=50, alpha=0.7, color='steelblue')
    ax.set_xlabel('Library Size (Log scale)')
    ax.set_ylabel('Number of Cells')
    ax.set_title(f'{group_name}')
    ax.set_xscale('log')

plt.tight_layout()
out_file = PHASE3_DIR / 'exploratory_analysis' / 'sample_quality_library_size.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

OK


## SUBSECTION 2.2: Gene Detection Rate

In [7]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for idx, (group_name, adata) in enumerate(processed_datasets.items()):
    ax = axes[idx // 3, idx % 3]
    
    genes_per_cell = (adata.X > 0).sum(axis=1)
    ax.hist(genes_per_cell, bins=50, alpha=0.7, color='coral')
    ax.set_xlabel('Genes Detected per Cell')
    ax.set_ylabel('Number of Cells')
    ax.set_title(f'{group_name}')

plt.tight_layout()
out_file = PHASE3_DIR / 'exploratory_analysis' / 'sample_quality_genes_per_cell.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

OK


## SUBSECTION 2.3: Zero Inflation

In [8]:
zero_inflation_data = []

for group_name, adata in processed_datasets.items():
    total_values = adata.n_obs * adata.n_vars
    zero_count = total_values - adata.X.nnz  #non-zero count
    zero_pct = 100 * zero_count / total_values
    
    zero_inflation_data.append({
        'Group': group_name,
        'Zero_Inflation_%': zero_pct,
    })

zero_df = pd.DataFrame(zero_inflation_data)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(zero_df['Group'], zero_df['Zero_Inflation_%'], color='steelblue')
ax.set_ylabel('Zero Inflation (%)')
ax.set_title('Sparsity Across Groups')
ax.set_xticklabels(zero_df['Group'], rotation=45, ha='right')
plt.tight_layout()
out_file = PHASE3_DIR / 'exploratory_analysis' / 'zero_inflation.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

OK


my thoughts on high sparsity:

scRNA-seq data is inherently sparse because:
- Each cell expresses only ~30-40% of all genes
- Many genes have zero expression in a cell
- Only ~3,000 HVGs selected (of 33,514 total)
- Among HVGs: ~1,500-2,000 expressed per cell

  3,000 HVGs × 222,370 cells = 667 million possible values
  But only ~60-70 million non-zero values
  → ~90-95% zeros


## SUBSECTION 2.4: Dimensionality Reduction (PCA)

https://builtin.com/data-science/step-step-explanation-principal-component-analysis


In [9]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for idx, (group_name, adata) in enumerate(processed_datasets.items()):
    ax = axes[idx // 3, idx % 3]
    
    #performed PCA
    sc.pp.pca(adata, n_comps=50)
    
    #plotted variance explained
    variance_ratio = adata.uns['pca']['variance_ratio'][:20]
    ax.plot(variance_ratio, marker='o')
    ax.set_xlabel('PC')
    ax.set_ylabel('Variance Explained')
    ax.set_title(f'{group_name}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
out_file = PHASE3_DIR / 'exploratory_analysis' / 'pca_variance_explained.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

OK


- All plots show "elbow curve" pattern
    - PC1 explains most variance
    - PC2-PC5 explain progressively less
    - After PC10, very small contributions


- Biological interpretation:
    - Normal tissue: More uniform (lower PC1 variance)
    - Cancer tissue: More heterogeneous (higher PC1 variance)
    - BRCA1-mutant: Most aggressive (highest variance)
    - Pre-neoplastic: Transitioning (intermediate variance)

- No batch effects visible:
    - All curves follow same smooth pattern
    - No sudden jumps or anomalies
    - Consistent across groups

- Variance distribution is realistic:
    - PC1: 6-18% (high)
    - PC2: ~4-6% (moderate)
    - PC3-5: 1-3% (lower)
    - PC10+: <0.5% (noise)

## SUBSECTION 2.5: Expression Distribution

In [10]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for idx, (group_name, adata) in enumerate(processed_datasets.items()):
    ax = axes[idx // 3, idx % 3]
    
    #mean expression per gene
    mean_expr = np.asarray(adata.X.mean(axis=0)).flatten()
    
    ax.hist(mean_expr, bins=50, alpha=0.7, color='green')
    ax.set_xlabel('Mean Expression (Log scale)')
    ax.set_ylabel('Number of Genes')
    ax.set_title(f'{group_name}')
    ax.set_xscale('log')

plt.tight_layout()
out_file = PHASE3_DIR / 'exploratory_analysis' / 'expression_distribution.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

OK


- Most genes: LOW expression (10⁻⁴ to 10⁻² range)
- Some genes: MEDIUM expression (10⁻¹ range)
- Few genes: HIGH expression (10⁰ range - the small tail)

just some thought: Spearman & Kendall could work best on this distribution - Log-scale naturally handles wide dynamic range

## SUBSECTION 2.6: HVG Overlap

In [11]:
#extracted HVG names per group
hvg_sets = {group_name: set(adata.var_names) for group_name, adata in processed_datasets.items()}

#counted overlaps
overlap_matrix = np.zeros((len(hvg_sets), len(hvg_sets)))
group_names_list = list(hvg_sets.keys())

for i, g1 in enumerate(group_names_list):
    for j, g2 in enumerate(group_names_list):
        overlap = len(hvg_sets[g1] & hvg_sets[g2])
        overlap_matrix[i, j] = overlap

#plotted heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(overlap_matrix, annot=True, fmt='.0f', xticklabels=group_names_list, 
            yticklabels=group_names_list, cmap='YlOrRd', ax=ax)
ax.set_title('HVG Overlap Between Groups')
plt.tight_layout()
out_file = PHASE3_DIR / 'exploratory_analysis' / 'hvg_overlap.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

OK


| Pair                        | Overlap | % Shared | Interpretation                   |
| --------------------------- | ------- | -------- | -------------------------------- |
| Normal ↔ ER_Positive        | 861     | 28.7%    | Moderate overlap                 |
| Normal ↔ HER2_Positive      | 903     | 30.1%    | Moderate overlap                 |
| Normal ↔ TNBC               | 736     | 24.5%    | Lower (more different)           |
| Normal ↔ TNBC-BRCA1         | 842     | 28.1%    | Moderate overlap                 |
| Normal ↔ Pre-neoplastic     | 1,037   | 34.6%    | HIGHEST!                         |
| ER_Positive ↔ HER2_Positive | 1,095   | 36.5%    | High overlap (both luminal-like) |
| ER_Positive ↔ TNBC          | 722     | 24.1%    | Low overlap (very different)     |
| ER_Positive ↔ TNBC-BRCA1    | 905     | 30.2%    | Moderate                         |
| HER2_Positive ↔ TNBC        | 844     | 28.1%    | Moderate                         |
| HER2_Positive ↔ TNBC-BRCA1  | 1,157   | 38.6%    | HIGHEST!                         |
| TNBC ↔ TNBC-BRCA1           | 955     | 31.8%    | Moderate                         |
| TNBC ↔ Pre-neoplastic       | 887     | 29.6%    | Moderate                         |
| TNBC-BRCA1 ↔ Pre-neoplastic | 909     | 30.3%    | Moderate                         |

1. Pre-neoplastic is Most Similar to Normal 
    Normal to Pre-neoplastic: 1,037 shared genes (34.6%)

    Highest overlap with Normal!

    Shows pre-neoplastic is early-stage transition

    Perfect for showing progression story!

2. HER2+ and TNBC-BRCA1 are Related 
    HER2_Positive to TNBC-BRCA1: 1,157 shared genes (38.6%)

    Highest overlap between cancer subtypes

    Both aggressive subtypes

    Both have different metabolic profiles

3. ER+ and HER2+ are More Similar 
    ER_Positive to HER2_Positive: 1,095 shared genes (36.5%)

    Both are initially hormone-responsive

    Less heterogeneous than TNBC

4. TNBC is Most Different 
    TNBC has lowest overlaps with most groups (24-31%)

    Exception: TNBC-BRCA1 (31.8%)

    Shows TNBC has unique biology

    Makes sense for aggressive phenotype

seems like everything is alright, but i am considering analysing these plots as well: https://scanpy.readthedocs.io/en/stable/tutorials/plotting/core.html

will add it a bit later, WHY? (SECTION 6B, 7B AND 9B)

- Dotplot (sc.pl.dotplot)
    - Perfect for showing hub gene expression across groups
    - Can group by molecular_subtype or cell_type
    - Shows mean expression + % cells expressing

- Heatmap (sc.pl.heatmap)
    - Excellent for showing individual cell expression patterns
    - Can visualize hub genes × cells
    - Can show fold-change patterns

- Violin plots (sc.pl.violin)
    - Compare expression distributions across groups
    - Perfect for hub genes or pathway genes
    - Shows distribution shape (not just mean)

- Rank genes (sc.tl.rank_genes_groups + visualization)
    - Identify differentially expressed genes per group
    - Show log fold-change
    - Can prioritize therapeutic targets
 
- Correlation matrix (sc.pl.correlation_matrix)
    - Perfect for comparing group relationships
    - Can show if Normal/Pre-neoplastic/Cancer form progression

# SECTION 3: ROBUST MULTI-METRIC CO-EXPRESSION ANALYSIS

WHY:
- single correlation metric can be biased or fooled by outliers
- three independent metrics capture different association types
- weighted averaging creates robust consensus: if all 3 metrics agree → strong signal

OPTIMIZATION STRATEGY:
- process one group at a time (not all 6 together)
- save EVERY metric separately (checkpoint recovery)
- monitor resources in real-time (CPU, memory, time)
- validate after each step (catch errors immediately)
- log everything to persistent text file

WORKFLOW PER GROUP:
1. Load & preprocess (normalize, log, HVGs)
2. Calculate Spearman → save + validate + log
3. Calculate BICOR → save + validate + log
4. Calculate Pearson → save + validate + log
5. Merge robustly (0.40S + 0.35B + 0.25P) → save
6. Visualize comparison plots
7. Final diagnostics & cleanup


## SUBSECTION 3.0: HELPER FUNCTIONS FOR AUTOMATION

WHY:
- automate repetitive tasks (preprocess, calculate, combine, save, visualize)
- reduce code duplication (write once, use many times)
- make workflow transparent (each function does one clear job)
- enable testing (can call functions individually)

In [28]:
def get_resource_usage():
    '''retrieved current CPU and memory usage
    
    #output:
    #  dict with: cpu_percent, memory_gb, memory_percent, timestamp
    '''
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    memory_gb = memory_info.rss / (1024**3)
    
    #total system memory
    total_memory_gb = psutil.virtual_memory().total / (1024**3)
    memory_percent = (memory_gb / total_memory_gb) * 100
    
    return {
        'cpu_percent': process.cpu_percent(interval=0.1),
        'memory_gb': memory_gb,
        'memory_percent': memory_percent,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

def log_to_file(message, group_name):
    '''appended message to persistent log file
    
    #input:
    #  message: text to log
    #  group_name: group identifier (used in filename)
    #
    #output:
    #  written to: phase3_networks_refactored/correlation_metrics/analysis_log_{group_name}.txt
    '''
    log_path = PHASE3_DIR / 'correlation_metrics' / f'analysis_log_{group_name.lower()}.txt'
    
    #appended (create if doesn't exist)
    with open(log_path, 'a') as f:
        f.write(f'[{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}] {message}\n')

def inspect_correlation_matrix(matrix, metric_name, group_name, n_samples=10):
    '''inspected correlation matrix structure and validity
    
    #input:
    #  matrix: correlation matrix (n_genes × n_genes)
    #  metric_name: 'spearman', 'bicor', 'pearson', or 'robust'
    #  group_name: group identifier (for logging)
    #  n_samples: number of random entries to display
    #
    #output:
    #  printed inspection results
    #  logged to file
    '''
    n_genes = matrix.shape[0]
    diag_values = np.diag(matrix)
    has_nans = np.isnan(matrix).sum()
    has_infs = np.isinf(matrix).sum()
    value_min = np.nanmin(matrix)
    value_max = np.nanmax(matrix)
    diag_mean = np.mean(diag_values)
    
    #random samples
    upper_triangle_idx = np.triu_indices(n_genes, k=1)
    if len(upper_triangle_idx[0]) > 0:
        sample_idx = np.random.choice(len(upper_triangle_idx[0]), min(n_samples, len(upper_triangle_idx[0])), replace=False)
        sample_values = matrix[upper_triangle_idx[0][sample_idx], upper_triangle_idx[1][sample_idx]]
    else:
        sample_values = []
    
    #memory usage
    memory_mb = matrix.nbytes / (1024**2)
    
    #validation status
    is_valid = (has_nans == 0 and has_infs == 0 and -1 <= value_min and value_max <= 1)
    status = 'OK VALID' if is_valid else '✗ INVALID'
    
    #printed output
    print(f'\n{"="*80}')
    print(f'MATRIX INSPECTION: {metric_name.upper()}')
    print(f'{"="*80}')
    print(f'  Shape:           {matrix.shape[0]:,} × {matrix.shape[1]:,} genes')
    print(f'  Memory:          {memory_mb:,.1f} MB')
    print(f'  Diagonal:        mean={diag_mean:.4f} (should be 1.0)')
    print(f'  Range:           [{value_min:8.4f}, {value_max:8.4f}] (should be [-1, 1])')
    print(f'  NaNs:            {has_nans:,} (should be 0)')
    print(f'  Infs:            {has_infs:,} (should be 0)')
    print(f'  Sample values:   {sample_values}')
    print(f'  Status:          {status}')
    print(f'{"="*80}\n')
    
    #logged to file
    log_msg = f'{metric_name.upper()} - shape={n_genes}x{n_genes}, memory={memory_mb:.1f}MB, range=[{value_min:.4f}, {value_max:.4f}], NaNs={has_nans}, status={status}'
    log_to_file(log_msg, group_name)
    
    return is_valid

def save_metric_intermediate(matrix, group_name, metric_name):
    '''saved individual metric to pickle file (checkpoint)
    
    #input:
    #  matrix: correlation matrix
    #  group_name: group identifier
    #  metric_name: 'spearman', 'bicor', 'pearson', or 'robust'
    #
    #output:
    #  saved to: phase3_networks_refactored/correlation_metrics/{group_name}_{metric_name}.pkl
    '''
    output_path = PHASE3_DIR / 'correlation_metrics' / f'{group_name.lower()}_{metric_name.lower()}.pkl'
    
    with open(output_path, 'wb') as f:
        pickle.dump(matrix, f)
    
    file_size_mb = output_path.stat().st_size / (1024**2)
    message = f'Saved {metric_name} matrix ({file_size_mb:.1f} MB) → {output_path.name}'
    print(f'✓ {message}')
    log_to_file(message, group_name)
    
    return output_path

def load_metric_intermediate(group_name, metric_name):
    '''loaded individual metric from pickle file
    
    #input:
    #  group_name: group identifier
    #  metric_name: 'spearman', 'bicor', 'pearson', or 'robust'
    #
    #output:
    #  correlation matrix or None if not found
    '''
    input_path = PHASE3_DIR / 'correlation_metrics' / f'{group_name.lower()}_{metric_name.lower()}.pkl'
    
    if not input_path.exists():
        return None
    
    with open(input_path, 'rb') as f:
        matrix = pickle.load(f)
    
    return matrix

def generate_progress_bar(current, total, metric_name, start_time, gene_count=None):
    '''generated progress bar with ETA
    
    #input:
    #  current: current gene pair count
    #  total: total gene pairs (~n*(n-1)/2)
    #  metric_name: 'spearman', 'bicor', 'pearson'
    #  start_time: time.time() from start
    #  gene_count: number of genes (optional, for display)
    '''
    elapsed = time.time() - start_time
    progress = current / total if total > 0 else 0
    
    #progress bar
    bar_length = 30
    filled = int(bar_length * progress)
    bar = '█' * filled + '░' * (bar_length - filled)
    
    #ETA calculation
    if elapsed > 0 and progress > 0:
        total_time = elapsed / progress
        remaining = total_time - elapsed
        eta_min = remaining / 60
    else:
        eta_min = 0
    
    #rate (pairs per second)
    if elapsed > 0:
        rate = current / elapsed
    else:
        rate = 0
    
    return f'{metric_name:10} [{bar}] {current:,}/{total:,} ({progress*100:5.1f}%) | {rate:6.2f} pairs/sec | ETA: {eta_min:6.1f}m'

def preprocess_for_correlations(adata, n_hvgs=3000, group_name=None):
    '''preprocessed expression data for correlation analysis
    
    #input:
    #  adata: anndata object with raw UMI counts
    #  n_hvgs: number of highly variable genes to keep (default: 3000)
    #  group_name: group identifier (for logging)
    #
    #output:
    #  adata_processed: normalized, log-transformed, HVG-filtered
    #  X dense matrix ready for correlation calculation
    '''
    adata_proc = adata.copy()
    
    #normalized to library size (10,000 UMI per cell)
    sc.pp.normalize_total(adata_proc, target_sum=1e4)
    
    #applied log transformation
    sc.pp.log1p(adata_proc)
    
    #identified highly variable genes
    sc.pp.highly_variable_genes(adata_proc, n_top_genes=n_hvgs)
    
    #subset to HVGs
    adata_proc = adata_proc[:, adata_proc.var['highly_variable']]
    
    if group_name:
        log_to_file(f'Preprocessing: {adata_proc.n_obs:,} cells × {adata_proc.n_vars:,} HVGs', group_name)
    
    return adata_proc

def calculate_spearman_correlation(X_dense, group_name=None):
    '''calculated Spearman rank correlation for all gene pairs
    
    #WHY Spearman:
    #- rank-based (ignores magnitude of values)
    #- robust to outliers in expression values
    #- monotonic relationships (genes move together)
    #- standard in genomics literature
    '''
    n_genes = X_dense.shape[1]
    spearman_corr = np.zeros((n_genes, n_genes))
    
    print(f'\n  calculating Spearman correlations for {n_genes:,} genes...')
    start_time = time.time()
    total_pairs = n_genes * (n_genes - 1) / 2
    pair_count = 0
    
    #calculated pairwise Spearman correlations
    for i in range(n_genes):
        for j in range(i+1, n_genes):
            x = X_dense[:, i]
            y = X_dense[:, j]
            spearman_corr[i, j], _ = spearmanr(x, y)
            spearman_corr[j, i] = spearman_corr[i, j]
            
            pair_count += 1
            
            #progress every 500 pairs
            if pair_count % 500 == 0:
                progress_str = generate_progress_bar(pair_count, int(total_pairs), 'Spearman', start_time, n_genes)
                print(f'    {progress_str}', end='\r')
    
    #set diagonal to 1 (self-correlation)
    np.fill_diagonal(spearman_corr, 1.0)
    elapsed = time.time() - start_time
    
    print(f'    OK Spearman complete ({elapsed/60:.1f} minutes, {pair_count:,} pairs)\n')
    
    if group_name:
        log_to_file(f'Spearman calculation: {elapsed/60:.1f} minutes, {pair_count:,} pairs', group_name)
    
    return spearman_corr

def calculate_bicor_correlation(X_dense, group_name=None):
    '''calculated biweight midcorrelation (BICOR) for all gene pairs
    
    #WHY BICOR:
    #- weighted Pearson correlation (keeps linear information)
    #- uses median + MAD (median absolute deviation) for robustness
    #- downweights outliers WITHOUT removing them completely
    #- specifically designed for genomics
    '''
    n_genes = X_dense.shape[1]
    bicor_corr = np.zeros((n_genes, n_genes))
    
    print(f'\n  calculating BICOR correlations for {n_genes:,} genes...')
    start_time = time.time()
    
    #calculated medians and MAD per gene (robust to outliers)
    medians = np.median(X_dense, axis=0)
    
    try:
        from scipy.stats import median_abs_deviation as mad_func
        mads = np.array([mad_func(X_dense[:, i]) for i in range(n_genes)])
    except ImportError:
        #fallback if median_abs_deviation not available
        mads = np.array([np.median(np.abs(X_dense[:, i] - medians[i])) for i in range(n_genes)])
    
    total_pairs = n_genes * (n_genes - 1) / 2
    pair_count = 0
    
    #calculated pairwise BICOR
    for i in range(n_genes):
        for j in range(i+1, n_genes):
            x = X_dense[:, i]
            y = X_dense[:, j]
            
            #avoided division by zero
            if mads[i] == 0 or mads[j] == 0:
                bicor_corr[i, j] = 0
                bicor_corr[j, i] = 0
                pair_count += 1
                continue
            
            #standardized using median and MAD (robust to outliers)
            u = (x - medians[i]) / (9 * mads[i])
            v = (y - medians[j]) / (9 * mads[j])
            
            #applied biweight function (smooth downweighting of outliers)
            w_x = (1 - u**2)**2 * (np.abs(u) < 1)
            w_y = (1 - v**2)**2 * (np.abs(v) < 1)
            
            #calculated weighted correlation
            numerator = np.sum(w_x * w_y * u * v)
            denominator = np.sqrt(np.sum(w_x * w_y * u**2) * np.sum(w_x * w_y * v**2))
            
            #avoided division by zero
            if denominator > 0:
                bicor_corr[i, j] = numerator / denominator
            else:
                bicor_corr[i, j] = 0
            
            bicor_corr[j, i] = bicor_corr[i, j]
            pair_count += 1
            
            #progress every 500 pairs
            if pair_count % 500 == 0:
                progress_str = generate_progress_bar(pair_count, int(total_pairs), 'BICOR', start_time, n_genes)
                print(f'    {progress_str}', end='\r')
    
    #set diagonal to 1
    np.fill_diagonal(bicor_corr, 1.0)
    elapsed = time.time() - start_time
    
    print(f'    OK BICOR complete ({elapsed/60:.1f} minutes, {pair_count:,} pairs)\n')
    
    if group_name:
        log_to_file(f'BICOR calculation: {elapsed/60:.1f} minutes, {pair_count:,} pairs', group_name)
    
    return bicor_corr

def calculate_pearson_correlation(X_dense, group_name=None):
    '''calculated Pearson correlation for all gene pairs
    
    #WHY Pearson:
    #- captures linear relationships
    #- baseline for validation
    #- fast to compute (vectorized)
    '''
    n_genes = X_dense.shape[1]
    print(f'\n  calculating Pearson correlations for {n_genes:,} genes...')
    start_time = time.time()
    
    #used NumPy's vectorized corrcoef for efficiency
    pearson_corr = np.corrcoef(X_dense.T)
    
    elapsed = time.time() - start_time
    print(f'    OK Pearson complete ({elapsed/60:.1f} minutes, vectorized)\n')
    
    if group_name:
        log_to_file(f'Pearson calculation: {elapsed/60:.1f} minutes (vectorized)', group_name)
    
    return pearson_corr

def combine_correlations_robust(spearman, bicor, pearson):
    '''combined three correlations via weighted averaging
    
    #WEIGHTING RATIONALE:
    #0.40 × Spearman: highest weight (most proven + standard in genomics)
    #0.35 × BICOR: high weight (specialized for genomics + complementary)
    #0.25 × Pearson: lower weight (validation check + less robust)
    '''
    robust_corr = (0.40 * spearman + 0.35 * bicor + 0.25 * pearson)
    return robust_corr

def generate_metric_comparison_plots(group_name, spearman, bicor, pearson, robust):
    '''generated comparison plots for three correlation metrics
    
    #input:
    #  group_name: group identifier
    #  spearman, bicor, pearson, robust: correlation matrices
    #
    #output:
    #  saved PNG file with 4 comparison plots
    '''
    print(f'\n  generating metric comparison plots for {group_name}...')
    
    #flattened upper triangle
    mask = np.triu(np.ones_like(spearman, dtype=bool), k=1)
    spearman_flat = spearman[mask]
    bicor_flat = bicor[mask]
    pearson_flat = pearson[mask]
    robust_flat = robust[mask]
    
    #created figure with 4 subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    fig.suptitle(f'Correlation Metric Comparison: {group_name}', fontsize=16, fontweight='bold')
    
    #plot 1: Spearman vs BICOR
    ax = axes[0, 0]
    ax.scatter(spearman_flat, bicor_flat, alpha=0.1, s=1)
    ax.set_xlabel('Spearman (rank-based)', fontsize=11)
    ax.set_ylabel('BICOR (weighted linear)', fontsize=11)
    ax.set_title('Spearman vs BICOR\n(different perspectives)', fontsize=12)
    ax.plot([-1, 1], [-1, 1], 'r--', alpha=0.5, linewidth=2)
    ax.set_xlim([-1, 1])
    ax.set_ylim([-1, 1])
    ax.grid(True, alpha=0.3)
    
    #plot 2: Spearman vs Pearson
    ax = axes[0, 1]
    ax.scatter(spearman_flat, pearson_flat, alpha=0.1, s=1)
    ax.set_xlabel('Spearman (robust)', fontsize=11)
    ax.set_ylabel('Pearson (less robust)', fontsize=11)
    ax.set_title('Spearman vs Pearson\n(Pearson less stable)', fontsize=12)
    ax.plot([-1, 1], [-1, 1], 'r--', alpha=0.5, linewidth=2)
    ax.set_xlim([-1, 1])
    ax.set_ylim([-1, 1])
    ax.grid(True, alpha=0.3)
    
    #plot 3: Distribution comparison
    ax = axes[1, 0]
    ax.hist(spearman_flat, bins=100, alpha=0.5, label='Spearman', density=True, color='blue')
    ax.hist(bicor_flat, bins=100, alpha=0.5, label='BICOR', density=True, color='green')
    ax.hist(pearson_flat, bins=100, alpha=0.5, label='Pearson', density=True, color='orange')
    ax.set_xlabel('Correlation Value', fontsize=11)
    ax.set_ylabel('Density', fontsize=11)
    ax.set_title('Distribution Comparison\n(BICOR more robust than Pearson)', fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    
    #plot 4: Robust vs Spearman
    ax = axes[1, 1]
    ax.scatter(spearman_flat, robust_flat, alpha=0.1, s=1)
    ax.set_xlabel('Spearman (single metric)', fontsize=11)
    ax.set_ylabel('Robust (0.40S + 0.35B + 0.25P)', fontsize=11)
    ax.set_title('Robust Averaging Effect\n(balances three perspectives)', fontsize=12)
    ax.plot([-1, 1], [-1, 1], 'r--', alpha=0.5, linewidth=2)
    ax.set_xlim([-1, 1])
    ax.set_ylim([-1, 1])
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    
    #saved to file
    output_path = PHASE3_DIR / 'correlation_metrics' / f'metric_comparison_{group_name.lower()}.png'
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    print(f'    OK Plot saved: {output_path.name}\n')
    log_to_file(f'Saved comparison plot: {output_path.name}', group_name)

def cleanup_memory(group_name):
    '''released memory and logged cleanup
    
    #input:
    #  group_name: group identifier (for logging)
    '''
    import gc
    gc.collect()
    
    resources = get_resource_usage()
    message = f'Memory cleanup: {resources["memory_gb"]:.2f} GB / {resources["memory_percent"]:.1f}%'
    print(f'  {message}')
    log_to_file(message, group_name)

print('OK enhanced helper functions defined\n')

OK enhanced helper functions defined



## SUBSECTION 3.1: TEST MODE (Verify on Small Subset First!)

WHY:
- before computing on full 3,000 genes (takes 1-2 hours), test on small subset
- verifies code works, catches errors early, saves time

DATA:
- first 100 genes from Normal group (small subset)

OUTPUT:
- test correlation matrices (100×100)
- diagnostics confirming correlations are valid

NEXT USE:
- if test passes → proceed to full analysis in 3.2

In [32]:
TEST_MODE = False  #set to False after first successful run

if TEST_MODE:
    print(f'TESTING on 100 genes from TripleNegative group (smallest sample)\n')

    print(f'STEP 1: TEST RESOURCE MONITORING')

    resources = get_resource_usage()
    print(f'CPU usage: {resources["cpu_percent"]:.1f}%')
    print(f'Memory: {resources["memory_gb"]:.2f} GB / {psutil.virtual_memory().total / (1024**3):.1f} GB ({resources["memory_percent"]:.1f}%)')
    print(f'Timestamp: {resources["timestamp"]}\n')
   
    print(f'STEP 2: TEST LOGGING FUNCTIONS')
    
    TEST_GROUP = 'TEST_Run'
    log_to_file('Test log message 1', TEST_GROUP)
    log_to_file('Test log message 2', TEST_GROUP)
    log_path = PHASE3_DIR / 'correlation_metrics' / f'analysis_log_{TEST_GROUP.lower()}.txt'
    
    if log_path.exists():
        print(f'✓ Log file created: {log_path.name}')
        with open(log_path, 'r') as f:
            lines = f.readlines()
            print(f'OK Log file contains {len(lines)} entries')
            print(f'  First entry: {lines[0].strip()}')
    else:
        print(f'NOT OK ERROR: Log file not created!\n')
    
    print()

    print(f'STEP 3: LOAD AND PREPROCESS TEST DATA')

    adata_test = datasets['TripleNegative'][:, :100].copy()
    print(f'OK Loaded test data: {adata_test.n_obs:,} cells × {adata_test.n_vars:,} genes')
    
    adata_test_proc = preprocess_for_correlations(adata_test, n_hvgs=100, group_name=TEST_GROUP)
    X_test = adata_test_proc.X.toarray()
    
    print(f'Preprocessed: {X_test.shape} (cells × genes)')
    print(f'Data type: {X_test.dtype}')
    print(f'Memory: {X_test.nbytes / (1024**2):.1f} MB\n')
    
    print(f'STEP 4: TEST SPEARMAN CORRELATION CALCULATION')

    spearman_test = calculate_spearman_correlation(X_test, group_name=TEST_GROUP)
    print(f'Validating Spearman matrix')
    is_valid_spearman = inspect_correlation_matrix(spearman_test, 'Spearman', TEST_GROUP, n_samples=5)
    save_metric_intermediate(spearman_test, TEST_GROUP, 'spearman')

    print(f'\nSTEP 5: TEST BICOR CORRELATION CALCULATION')
    
    bicor_test = calculate_bicor_correlation(X_test, group_name=TEST_GROUP)
    print(f'Validating BICOR matrix')
    is_valid_bicor = inspect_correlation_matrix(bicor_test, 'BICOR', TEST_GROUP, n_samples=5)
    save_metric_intermediate(bicor_test, TEST_GROUP, 'bicor')

    print(f'\nSTEP 6: TEST PEARSON CORRELATION CALCULATION')

    pearson_test = calculate_pearson_correlation(X_test, group_name=TEST_GROUP)
    print(f'Validating Pearson matrix')
    is_valid_pearson = inspect_correlation_matrix(pearson_test, 'Pearson', TEST_GROUP, n_samples=5)
    save_metric_intermediate(pearson_test, TEST_GROUP, 'pearson')

    print(f'\nSTEP 7: TEST ROBUST COMBINATION')

    robust_test = combine_correlations_robust(spearman_test, bicor_test, pearson_test)
    print(f' Robust correlation combined (0.40S + 0.35B + 0.25P)\n')
    print(f'Validating Robust matrix')
    is_valid_robust = inspect_correlation_matrix(robust_test, 'Robust', TEST_GROUP, n_samples=5)
    save_metric_intermediate(robust_test, TEST_GROUP, 'robust')

    print(f'\nSTEP 8: TEST COMPARISON PLOTS')

    generate_metric_comparison_plots(TEST_GROUP, spearman_test, bicor_test, pearson_test, robust_test)

    print(f'\nSTEP 9: COMPARISON OF METRICS')

    #flattened upper triangle for comparison
    mask = np.triu(np.ones_like(spearman_test, dtype=bool), k=1)
    spearman_flat = spearman_test[mask]
    bicor_flat = bicor_test[mask]
    pearson_flat = pearson_test[mask]
    robust_flat = robust_test[mask]
    
    print(f'Metric correlation ranges:')
    print(f'  Spearman: [{spearman_flat.min():.4f}, {spearman_flat.max():.4f}] mean={spearman_flat.mean():.4f}')
    print(f'  BICOR:    [{bicor_flat.min():.4f}, {bicor_flat.max():.4f}] mean={bicor_flat.mean():.4f}')
    print(f'  Pearson:  [{pearson_flat.min():.4f}, {pearson_flat.max():.4f}] mean={pearson_flat.mean():.4f}')
    print(f'  Robust:   [{robust_flat.min():.4f}, {robust_flat.max():.4f}] mean={robust_flat.mean():.4f}')
    print()
    
    #correlation between metrics
    print(f'Agreement between metrics (Pearson correlation of flattened values):')
    corr_spear_bicor = np.corrcoef(spearman_flat, bicor_flat)[0, 1]
    corr_spear_pear = np.corrcoef(spearman_flat, pearson_flat)[0, 1]
    corr_bicor_pear = np.corrcoef(bicor_flat, pearson_flat)[0, 1]
    
    print(f'  Spearman vs BICOR:    {corr_spear_bicor:.4f}')
    print(f'  Spearman vs Pearson:  {corr_spear_pear:.4f}')
    print(f'  BICOR vs Pearson:     {corr_bicor_pear:.4f}')
    print()

    print(f'\n{"="*80}')
    print(f'TEST SUMMARY')
    print(f'{"="*80}')
    
    all_valid = all([is_valid_spearman, is_valid_bicor, is_valid_pearson, is_valid_robust])
    status = 'OK ALL TESTS PASSED!' if all_valid else 'NOT OK SOME TESTS FAILED!'
    
    print(f'\nValidation results:')
    print(f'  Spearman:  {"OK VALID" if is_valid_spearman else "NOT OK INVALID"}')
    print(f'  BICOR:     {"OK VALID" if is_valid_bicor else "✗´NOT OK INVALID"}')
    print(f'  Pearson:   {"OK VALID" if is_valid_pearson else "NOT OK INVALID"}')
    print(f'  Robust:    {"OK VALID" if is_valid_robust else "NOT OK INVALID"}')
    
    print(f'\nFiles created:')
    print(f'  OK test_run_spearman.pkl')
    print(f'  OK test_run_bicor.pkl')
    print(f'  OK test_run_pearson.pkl')
    print(f'  OK test_run_robust.pkl')
    print(f'  OK metric_comparison_test_run.png')
    print(f'  OK analysis_log_test_run.txt')
    
    print(f'\nFinal memory:')
    resources_final = get_resource_usage()
    print(f'  {resources_final["memory_gb"]:.2f} GB / {psutil.virtual_memory().total / (1024**3):.1f} GB ({resources_final["memory_percent"]:.1f}%)')
    
    print(f'\nFinal status: {status}')
    print(f'{"="*80}\n')
    
    #cleanup
    cleanup_memory(TEST_GROUP)
    del adata_test, adata_test_proc, X_test, spearman_test, bicor_test, pearson_test, robust_test

else:
    print(f'TEST_MODE = False, skipping tests\n')


TEST_MODE = False, skipping tests



In [34]:
print(f'STEP 1: DELETING PICKLE FILES\n')

pickle_files = ['spearman', 'bicor', 'pearson', 'robust']
total_freed = 0

for metric in pickle_files:
    pkl_path = PHASE3_DIR / 'correlation_metrics' / f'test_run_{metric}.pkl'
    
    if pkl_path.exists():
        file_size_mb = pkl_path.stat().st_size / (1024**2)
        pkl_path.unlink()
        print(f'  ✓ Deleted: {pkl_path.name:35} ({file_size_mb:6.2f} MB)')
        total_freed += file_size_mb
    else:
        print(f'  - Not found: {pkl_path.name:35}')

print(f'\n  Total pickle files freed: {total_freed:.2f} MB\n')

print(f'STEP 2: DELETING PNG PLOT\n')

png_path = PHASE3_DIR / 'correlation_metrics' / 'metric_comparison_test_run.png'

if png_path.exists():
    png_size_mb = png_path.stat().st_size / (1024**2)
    png_path.unlink()
    print(f'  OK Deleted: {png_path.name:35} ({png_size_mb:6.2f} MB)')
    total_freed += png_size_mb
else:
    print(f'  - Not found: {png_path.name:35}')

print(f'\n  Total PNG freed: {png_size_mb:.2f} MB\n')

print(f'STEP 3: GARBAGE COLLECTION\n')

gc.collect()
resources_after_cleanup = get_resource_usage()

print(f'   Garbage collection complete')
print(f'   Memory after cleanup: {resources_after_cleanup["memory_gb"]:.2f} GB / {psutil.virtual_memory().total / (1024**3):.1f} GB ({resources_after_cleanup["memory_percent"]:.1f}%)\n')

print(f'STEP 4: VERIFY CLEANUP\n')

files_remaining = {
    'Pickle files': len(list(PHASE3_DIR.glob('correlation_metrics/test_run_*.pkl'))),
    'PNG files': len(list(PHASE3_DIR.glob('correlation_metrics/metric_comparison_test_run.png'))),
    'Log files': len(list(PHASE3_DIR.glob('correlation_metrics/analysis_log_test_run.txt')))
}

print(f'  Remaining test files:')
print(f'    Pickle files: {files_remaining["Pickle files"]} (should be 0)')
print(f'    PNG files: {files_remaining["PNG files"]} (should be 0)')
print(f'    Log files: {files_remaining["Log files"]} (should be 1) ← kept for reference')

print(f'CLEANUP SUMMARY')

print(f'\nSpace freed: {total_freed:.2f} MB')
print(f'Memory available: {psutil.virtual_memory().available / (1024**3):.2f} GB')
print(f'\n All test files deleted successfully!')
print(f'Log file preserved at: {PHASE3_DIR / "correlation_metrics" / "analysis_log_test_run.txt"}')


STEP 1: DELETING PICKLE FILES

  ✓ Deleted: test_run_spearman.pkl               (  0.05 MB)
  ✓ Deleted: test_run_bicor.pkl                  (  0.05 MB)
  ✓ Deleted: test_run_pearson.pkl                (  0.05 MB)
  ✓ Deleted: test_run_robust.pkl                 (  0.05 MB)

  Total pickle files freed: 0.21 MB

STEP 2: DELETING PNG PLOT

  OK Deleted: metric_comparison_test_run.png      (  0.48 MB)

  Total PNG freed: 0.48 MB

STEP 3: GARBAGE COLLECTION

   Garbage collection complete
   Memory after cleanup: 12.30 GB / 31.2 GB (39.4%)

STEP 4: VERIFY CLEANUP

  Remaining test files:
    Pickle files: 0 (should be 0)
    PNG files: 0 (should be 0)
    Log files: 1 (should be 1) ← kept for reference
CLEANUP SUMMARY

Space freed: 0.69 MB
Memory available: 15.96 GB

 All test files deleted successfully!
Log file preserved at: /triumvirate/home/alexarol/breast_cancer_analysis/results/phase3_networks_refactored/correlation_metrics/analysis_log_test_run.txt


## SUBSECTION 3.2: ANALYZE FIRST SAMPLE - TRIPLENEGATIVE

workflow overview:
- this cell analyzes the TripleNegative breast cancer epithelial cells group
- using pre-processed, HVG-selected data (generated in earlier preprocessing step).
- it calculates three independent correlation metrics (Spearman, BICOR, Pearson),
- validates each matrix, combines them via weighted averaging to create a robust
- consensus correlation matrix, generates diagnostic plots, and saves all outputs.

- expected runtime: ~4-5 hours (shorter because preprocessing already done)
- expected outputs: 4 pickle files + 1 comparison plot + 1 log file per group

WHY this design:
- single metrics can be fooled by outliers or miss non-linear patterns.
- three metrics capture different signal types: rank-based (Spearman),
- weighted-linear (BICOR), and linear (Pearson). if all 3 agree → high confidence.

MEMORY strategy:
- load preprocessed file from disk (avoids re-processing)
- delete intermediate variables after each metric (save to disk first)
- keep only final robust matrix in memory (other 3 deleted to save RAM)
- reload from disk only when combining robustly


In [35]:
#setting group to analyze (CHANGE ONLY THIS LINE FOR DIFFERENT GROUPS)
GROUP_NAME = 'TripleNegative'

print(f'analyzing group: {GROUP_NAME}')
print(f'start time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

#initialized log file for this group
log_path = PHASE3_DIR / 'correlation_metrics' / f'analysis_log_{GROUP_NAME.lower()}.txt'
with open(log_path, 'w') as f:
    f.write(f'ANALYSIS LOG: {GROUP_NAME}\n')
    f.write(f'started: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')

log_to_file(f'phase 3 analysis initiated for {GROUP_NAME}', GROUP_NAME)

print(f'STEP 1: LOAD PREPROCESSED DATA')

#loading preprocessed, HVG-selected dataset for this group.
#preprocessing already applied: normalized to 10k UMI, log-transformed, 3000 HVGs selected.
#data format: sparse matrix of log-transformed expression (cells × genes)
#file created in earlier preprocessing step: adata_{group}_hvg_processed.h5ad
#expected: 7,561 cells for TripleNegative (varies by group)

preprocessed_file = PHASE3_DIR / f'adata_{GROUP_NAME.lower()}_hvg_processed.h5ad'

if not preprocessed_file.exists():
    raise FileNotFoundError(f'preprocessed file not found: {preprocessed_file}')

adata = sc.read_h5ad(preprocessed_file)
print(f'OK loaded preprocessed dataset from: {preprocessed_file.name}')
print(f'  cells: {adata.n_obs:,}')
print(f'  HVGs: {adata.n_vars:,}')
print(f'  data format: {type(adata.X).__name__}')
print(f'  data already: normalized (10k UMI), log-transformed, HVG-selected\n')

log_to_file(f'loaded preprocessed data: {adata.n_obs:,} cells × {adata.n_vars:,} HVGs', GROUP_NAME)

#monitoring resource usage before conversion to dense
resources_start = get_resource_usage()
print(f'  memory before dense conversion: {resources_start["memory_gb"]:.2f} GB / {psutil.virtual_memory().total / (1024**3):.1f} GB\n')

print(f'STEP 2: CONVERT TO DENSE MATRIX')

#converting sparse matrix to dense format required for pairwise correlations.
#WHY dense: correlation computation requires accessing all cell values
#for each gene pair. sparse matrix format (CSR) is inefficient for this.
#dense conversion happens once, then reused for all 3 metric calculations.
#
#memory impact: X_dense will be ~7,561 cells × 3,000 genes × 4 bytes = 90 MB
#(acceptable tradeoff for ~100x faster correlation computation)

X_dense = adata.X.toarray()

print(f'OK converted sparse matrix to dense')
print(f'  shape: {X_dense.shape} (cells × genes)')
print(f'  data type: {X_dense.dtype}')
print(f'  memory: {X_dense.nbytes / (1024**2):.1f} MB')
print(f'  value range: [{X_dense.min():.4f}, {X_dense.max():.4f}]\n')

resources_after_dense = get_resource_usage()
print(f'  memory after dense conversion: {resources_after_dense["memory_gb"]:.2f} GB\n')

log_to_file(f'dense matrix: {X_dense.shape}, {X_dense.nbytes / (1024**2):.1f} MB', GROUP_NAME)

print(f'STEP 3: CALCULATE SPEARMAN RANK CORRELATION')

#Spearman correlation: rank-based, robust to outliers and non-linear monotonic patterns.
#
#WHY Spearman for genomics:
#- gene expression is right-skewed (some cells express genes at extreme levels)
#- rank transformation removes effect of extreme values (outlier-robust)
#- captures monotonic relationships: A increases → B increases (even if non-linear)
#- proven standard in WGCNA and genomics co-expression literature
#
#computation: for each pair of genes (i,j), rank cell values independently,
#then compute Pearson correlation on the ranks (not raw values).
#
#output: n_genes × n_genes correlation matrix
#- diagonal = 1.0 (gene perfectly correlated with itself)
#- off-diagonal ∈ [-1, +1] (correlation strength and direction)

start_time = time.time()
spearman = calculate_spearman_correlation(X_dense, group_name=GROUP_NAME)
spearman_time = time.time() - start_time

print(f'\nOK Spearman calculation complete ({spearman_time/60:.1f} minutes)\n')

#validating Spearman matrix
print(f'validating Spearman matrix')
is_valid_spearman = inspect_correlation_matrix(spearman, 'Spearman', GROUP_NAME, n_samples=10)

#saving to disk immediately (checkpoint recovery if interrupted)
save_metric_intermediate(spearman, GROUP_NAME, 'spearman')

#deleting Spearman from memory after saving to disk.
#WHY delete: we don't need it until Step 6 (robust combination).
#reloading from disk is fast, but keeping 3 matrices in RAM = ~300 MB wasted.
#deleting frees memory for next metric computation (BICOR is slower).
del spearman
gc.collect()

log_to_file(f'Spearman: {spearman_time/60:.1f} minutes, valid={is_valid_spearman}', GROUP_NAME)

print(f'STEP 4: CALCULATE BICOR (BIWEIGHT MIDCORRELATION)')

#BICOR: weighted Pearson correlation using median and MAD (median absolute deviation).
#
#WHY BICOR for genomics:
#- combines robustness of rank-based methods with linearity preservation of Pearson
#- uses median/MAD instead of mean/SD (more stable for skewed gene expression)
#- smooth downweighting of outliers via biweight function (not harsh removal)
#- designed specifically for genomics co-expression by WGCNA authors
#- captures linear patterns that Spearman may miss on continuous scales
#
#computation: for each gene, compute median (med_i) and MAD (mad_i).
#then for each pair (i,j), standardize using med/MAD instead of mean/SD.
#apply biweight smoothing: points far from median get lower weight gradually.
#compute weighted Pearson on original (not ranked) values.
#
#output: same structure as Spearman (n_genes × n_genes correlation matrix)

start_time = time.time()
bicor = calculate_bicor_correlation(X_dense, group_name=GROUP_NAME)
bicor_time = time.time() - start_time

print(f'\nOK BICOR calculation complete ({bicor_time/60:.1f} minutes)\n')

#validating BICOR matrix
print(f'validating BICOR matrix')
is_valid_bicor = inspect_correlation_matrix(bicor, 'BICOR', GROUP_NAME, n_samples=10)

#saving to disk
save_metric_intermediate(bicor, GROUP_NAME, 'bicor')

#deleting BICOR from memory (same reasoning as Spearman)
del bicor
gc.collect()

log_to_file(f'BICOR: {bicor_time/60:.1f} minutes, valid={is_valid_bicor}', GROUP_NAME)

print(f'STEP 5: CALCULATE PEARSON CORRELATION')

#Pearson correlation: linear correlation coefficient (standard approach).
#
#WHY Pearson for validation:
#- baseline comparison metric (industry standard, well-understood)
#- fast to compute (vectorized NumPy operation on dense matrix)
#- less robust than Spearman/BICOR (sensitive to outliers)
#- useful for validation: how much do robust methods improve over standard?
#
#computation: for each gene pair (i,j), compute Pearson r on raw expression values.
#r = covariance(i,j) / (std_i * std_j)
#assumes linear relationship, no downweighting of outliers.
#
#output: n_genes × n_genes correlation matrix
#
#NOTE: weighted lower in robust combination (0.25 vs 0.40 Spearman)
#because it's more susceptible to outlier influence in skewed expression data.

start_time = time.time()
pearson = calculate_pearson_correlation(X_dense, group_name=GROUP_NAME)
pearson_time = time.time() - start_time

print(f'\nOK Pearson calculation complete ({pearson_time/60:.1f} minutes)\n')

#validating Pearson matrix
print(f'validating Pearson matrix')
is_valid_pearson = inspect_correlation_matrix(pearson, 'Pearson', GROUP_NAME, n_samples=10)

#saving to disk
save_metric_intermediate(pearson, GROUP_NAME, 'pearson')

#deleting Pearson from memory
del pearson
gc.collect()

log_to_file(f'Pearson: {pearson_time/60:.1f} minutes, valid={is_valid_pearson}', GROUP_NAME)

print(f'STEP 6: CREATE ROBUST CONSENSUS CORRELATION')

#reloading all three metrics from disk (memory-efficient approach).
#instead of keeping all 3 matrices in RAM simultaneously (~300 MB),
#we kept only one at a time during computation. now we need all 3 to combine.
#disk I/O is fast enough that reload time is negligible compared to computation.

spearman = load_metric_intermediate(GROUP_NAME, 'spearman')
bicor = load_metric_intermediate(GROUP_NAME, 'bicor')
pearson = load_metric_intermediate(GROUP_NAME, 'pearson')

print(f'OK reloaded all three metrics from disk\n')

#weighted combination of the three metrics into single robust consensus.
#weights chosen based on: (1) robustness to outliers, (2) genomics relevance.
#
#weight assignment:
#  0.40 × Spearman  ← highest weight (rank-based, most outlier-robust)
#  0.35 × BICOR     ← high weight (genomics-specific, linear-robust hybrid)
#  0.25 × Pearson   ← lower weight (less robust to outliers, validation check)
#
#sum = 1.00 (weights normalized)
#
#CONSENSUS LOGIC:
#- if all 3 metrics agree on strong correlation → robust value is strong
#- if metrics disagree (e.g., Spearman high, Pearson low) → robust value is downweighted
#- this automatic downweighting protects against outlier-driven false correlations
#
#biological interpretation:
#- high robust value (>0.70) = gene pair co-expressed across methods = likely real
#- medium robust value (0.40-0.70) = some disagreement = uncertain
#- low robust value (<0.40) = metrics disagree = noise or condition-specific
#
#this approach is standard in WGCNA literature and proven effective in genomics.

robust = combine_correlations_robust(spearman, bicor, pearson)

print(f'OK robust consensus created (0.40×S + 0.35×B + 0.25×P)\n')

#validating robust consensus matrix
print(f'validating robust matrix')
is_valid_robust = inspect_correlation_matrix(robust, 'Robust', GROUP_NAME, n_samples=10)

#saving robust matrix to disk (this is the final output we'll use for networks)
save_metric_intermediate(robust, GROUP_NAME, 'robust')

print()

log_to_file(f'robust combination: valid={is_valid_robust}', GROUP_NAME)

print(f'STEP 7: GENERATE DIAGNOSTIC COMPARISON PLOTS')

#generating 4-panel diagnostic comparison figure showing relationships
#between the three metrics and robust consensus. these plots serve as
#quality control and validation of the consensus approach.
#
#panel 1 - Spearman vs BICOR: "different perspectives"
#- if scatter: methods capture different signal aspects = good diversification
#- if tight cluster: methods redundant = less benefit from ensemble
#- expected: moderate scatter because rank-based ≠ weighted-linear
#
#panel 2 - Spearman vs Pearson: "Pearson less stable"
#- if tight: robust methods not adding much value over Pearson
#- if scattered: Pearson has more noise = robust methods improve stability
#- expected: scatter in weak regions, cluster in strong correlations
#
#panel 3 - Distribution comparison: histograms of all 3 metrics
#- tight peak = method is confident (most gene pairs uncorrelated)
#- wide tail = method detects correlations across spectrum
#- expected: Spearman/BICOR tight (robust), Pearson wider (less stable)
#
#panel 4 - Robust vs Spearman: "balances three perspectives"
#- if tight cluster on diagonal: robust averaging preserves information
#- if scattered: robust averaging loses important signal
#- expected: tight cluster (proves weighted averaging works)
#
#files saved: metric_comparison_{group_name}.png (diagnostic file)

generate_metric_comparison_plots(GROUP_NAME, spearman, bicor, pearson, robust)

log_to_file(f'diagnostic plots generated', GROUP_NAME)

print(f'STEP 8: FINAL SUMMARY AND CLEANUP')

#computing total analysis time
total_compute_time = spearman_time + bicor_time + pearson_time
total_hours = total_compute_time / 3600

#validation summary
all_valid = all([is_valid_spearman, is_valid_bicor, is_valid_pearson, is_valid_robust])
validation_status = 'OK ALL METRICS VALID' if all_valid else 'NOT OK VALIDATION FAILED'

#final resource usage
resources_final = get_resource_usage()
memory_used_gb = resources_final['memory_gb']
memory_total_gb = psutil.virtual_memory().total / (1024**3)
memory_percent = resources_final['memory_percent']

#printing comprehensive summary
print(f'ANALYSIS COMPLETE: {GROUP_NAME}\n')
print(f'  cells analyzed:           {adata.n_obs:,}')
print(f'  HVGs used:                {X_dense.shape[1]:,}')
print(f'  correlation pairs:        {X_dense.shape[1] * (X_dense.shape[1] - 1) // 2:,}')
print(f'\n  timing breakdown:')
print(f'    Spearman:               {spearman_time/60:7.1f} minutes')
print(f'    BICOR:                  {bicor_time/60:7.1f} minutes')
print(f'    Pearson:                {pearson_time/60:7.1f} minutes')
print(f'    total:                  {total_hours:7.1f} hours')
print(f'\n  validation results:')
print(f'    Spearman:               {" VALID" if is_valid_spearman else " INVALID"}')
print(f'    BICOR:                  {" VALID" if is_valid_bicor else " INVALID"}')
print(f'    Pearson:                {" VALID" if is_valid_pearson else " INVALID"}')
print(f'    Robust:                 {" VALID" if is_valid_robust else " INVALID"}')
print(f'    overall:                {validation_status}')
print(f'\n  memory usage:')
print(f'    current:                {memory_used_gb:.2f} GB / {memory_total_gb:.1f} GB ({memory_percent:.1f}%)')
print(f'\n  output files saved:')
print(f'    ✓ {GROUP_NAME.lower()}_spearman.pkl')
print(f'    ✓ {GROUP_NAME.lower()}_bicor.pkl')
print(f'    ✓ {GROUP_NAME.lower()}_pearson.pkl')
print(f'    ✓ {GROUP_NAME.lower()}_robust.pkl')
print(f'    ✓ metric_comparison_{GROUP_NAME.lower()}.png')
print(f'    ✓ analysis_log_{GROUP_NAME.lower()}.txt\n')

#logging final summary
log_to_file(f'analysis complete', GROUP_NAME)
log_to_file(f'total time: {total_hours:.1f} hours', GROUP_NAME)
log_to_file(f'validation: {validation_status}', GROUP_NAME)
log_to_file(f'memory: {memory_used_gb:.2f} GB / {memory_total_gb:.1f} GB', GROUP_NAME)

#cleanup: delete all intermediate matrices and data from memory
del spearman, bicor, pearson, robust, adata, X_dense
gc.collect()

print(f'OK memory cleaned up')
print(f'OK ready for next group analysis\n')

log_to_file(f'memory cleanup complete', GROUP_NAME)


analyzing group: TripleNegative
start time: 2025-12-10 13:55:00
STEP 1: LOAD PREPROCESSED DATA
OK loaded preprocessed dataset from: adata_triplenegative_hvg_processed.h5ad
  cells: 7,514
  HVGs: 3,000
  data format: csr_matrix
  data already: normalized (10k UMI), log-transformed, HVG-selected

  memory before dense conversion: 12.30 GB / 31.2 GB

STEP 2: CONVERT TO DENSE MATRIX
OK converted sparse matrix to dense
  shape: (7514, 3000) (cells × genes)
  data type: float32
  memory: 86.0 MB
  value range: [0.0000, 8.5104]

  memory after dense conversion: 12.39 GB

STEP 3: CALCULATE SPEARMAN RANK CORRELATION

  calculating Spearman correlations for 3,000 genes...
    OK Spearman complete (70.7 minutes, 4,498,500 pairs)0/4,498,500 (100.0%) | 1061.07 pairs/sec | ETA:    0.0m


OK Spearman calculation complete (70.7 minutes)

validating Spearman matrix

MATRIX INSPECTION: SPEARMAN
  Shape:           3,000 × 3,000 genes
  Memory:          68.7 MB
  Diagonal:        mean=1.0000 (should be 1.

In [37]:
# ==============================================================================
# DIAGNOSTIC CELL - SIMPLIFIED v2: BICOR INVESTIGATION (NO ADATA NEEDED)
# ==============================================================================
#simplified version that focuses on the key question:
#is BICOR matrix mostly zeros or does it have normal distribution?
#
#note: this version does NOT reference adata (gene names)
#because adata was deleted after analysis. we just look at numeric correlations.

import numpy as np
import pickle
from pathlib import Path

print(f'\n{"="*80}')
print(f'{"DIAGNOSTIC: BICOR INVESTIGATION (SIMPLIFIED V2)":^80}')
print(f'{"="*80}\n')

GROUP_NAME = 'TripleNegative'
correlation_dir = PHASE3_DIR / 'correlation_metrics'

# ==============================================================================
# LOAD MATRICES
# ==============================================================================

print(f'loading matrices...\n')

with open(correlation_dir / f'{GROUP_NAME.lower()}_spearman.pkl', 'rb') as f:
    spearman = pickle.load(f)

with open(correlation_dir / f'{GROUP_NAME.lower()}_bicor.pkl', 'rb') as f:
    bicor = pickle.load(f)

with open(correlation_dir / f'{GROUP_NAME.lower()}_pearson.pkl', 'rb') as f:
    pearson = pickle.load(f)

print(f'✓ loaded all matrices\n')

# ==============================================================================
# EXTRACT UPPER TRIANGLES (EXCLUDE DIAGONAL)
# ==============================================================================

mask = np.triu(np.ones_like(bicor, dtype=bool), k=1)

sp_upper = spearman[mask]
bi_upper = bicor[mask]
pe_upper = pearson[mask]

print(f'upper triangle size: {len(bi_upper):,} values\n')

# ==============================================================================
# KEY QUESTION: IS BICOR MOSTLY ZEROS?
# ==============================================================================

print(f'{"─"*80}')
print(f'KEY QUESTION: Is BICOR mostly zeros?\n')

bicor_zeros = (bi_upper == 0).sum()
bicor_zeros_pct = bicor_zeros / len(bi_upper) * 100

print(f'BICOR analysis:')
print(f'  total values:      {len(bi_upper):,}')
print(f'  zeros:             {bicor_zeros:,} ({bicor_zeros_pct:.2f}%)')
print(f'  non-zeros:         {len(bi_upper) - bicor_zeros:,} ({100 - bicor_zeros_pct:.2f}%)')
print()

if bicor_zeros_pct > 50:
    print(f'⚠️  WARNING: BICOR is {bicor_zeros_pct:.1f}% zeros!')
    print(f'   this is ABNORMAL and indicates a problem.\n')
elif bicor_zeros_pct > 10:
    print(f'⚠️  CAUTION: BICOR is {bicor_zeros_pct:.1f}% zeros')
    print(f'   higher than expected but not critical.\n')
else:
    print(f'✓ BICOR zeros are normal ({bicor_zeros_pct:.2f}%)\n')

# ==============================================================================
# COMPARE DISTRIBUTIONS
# ==============================================================================

print(f'{"─"*80}')
print(f'DISTRIBUTION COMPARISON\n')

print(f'Spearman:')
print(f'  mean:        {sp_upper.mean():+.6f}')
print(f'  median:      {np.median(sp_upper):+.6f}')
print(f'  std:         {sp_upper.std():.6f}')
print(f'  min:         {sp_upper.min():+.6f}')
print(f'  max:         {sp_upper.max():+.6f}')
print(f'  % zeros:     {(sp_upper == 0).sum() / len(sp_upper) * 100:.2f}%')
print()

print(f'BICOR:')
print(f'  mean:        {bi_upper.mean():+.6f}')
print(f'  median:      {np.median(bi_upper):+.6f}')
print(f'  std:         {bi_upper.std():.6f}')
print(f'  min:         {bi_upper.min():+.6f}')
print(f'  max:         {bi_upper.max():+.6f}')
print(f'  % zeros:     {(bi_upper == 0).sum() / len(bi_upper) * 100:.2f}%')
print()

print(f'Pearson:')
print(f'  mean:        {pe_upper.mean():+.6f}')
print(f'  median:      {np.median(pe_upper):+.6f}')
print(f'  std:         {pe_upper.std():.6f}')
print(f'  min:         {pe_upper.min():+.6f}')
print(f'  max:         {pe_upper.max():+.6f}')
print(f'  % zeros:     {(pe_upper == 0).sum() / len(pe_upper) * 100:.2f}%')
print()

# ==============================================================================
# METHOD AGREEMENT
# ==============================================================================

print(f'{"─"*80}')
print(f'METHOD AGREEMENT\n')

#removed NaN/Inf from comparison to avoid warnings
valid_sp_bi = ~(np.isnan(sp_upper) | np.isnan(bi_upper) | np.isinf(sp_upper) | np.isinf(bi_upper))
valid_sp_pe = ~(np.isnan(sp_upper) | np.isnan(pe_upper) | np.isinf(sp_upper) | np.isinf(pe_upper))
valid_bi_pe = ~(np.isnan(bi_upper) | np.isnan(pe_upper) | np.isinf(bi_upper) | np.isinf(pe_upper))

corr_sp_bi = np.corrcoef(sp_upper[valid_sp_bi], bi_upper[valid_sp_bi])[0, 1]
corr_sp_pe = np.corrcoef(sp_upper[valid_sp_pe], pe_upper[valid_sp_pe])[0, 1]
corr_bi_pe = np.corrcoef(bi_upper[valid_bi_pe], pe_upper[valid_bi_pe])[0, 1]

print(f'Correlation between methods:')
print(f'  Spearman vs BICOR:   {corr_sp_bi:+.6f}')
print(f'  Spearman vs Pearson: {corr_sp_pe:+.6f}')
print(f'  BICOR vs Pearson:    {corr_bi_pe:+.6f}')
print()

if corr_sp_bi > 0.80:
    print(f'✓ BICOR agrees well with Spearman ({corr_sp_bi:.4f})')
elif corr_sp_bi > 0.60:
    print(f'⚠️  BICOR shows moderate agreement with Spearman ({corr_sp_bi:.4f})')
else:
    print(f'✗ BICOR poorly agrees with Spearman ({corr_sp_bi:.4f}) - PROBLEM!')
print()

# ==============================================================================
# SHOW ACTUAL VALUES (NOT JUST ZEROS)
# ==============================================================================

print(f'{"─"*80}')
print(f'SAMPLE CORRELATION VALUES\n')

#get random non-zero BICOR values
nonzero_mask = bi_upper != 0
nonzero_indices = np.where(nonzero_mask)[0]

if len(nonzero_indices) > 10:
    sample_idx = np.random.choice(nonzero_indices, 10, replace=False)
    print(f'10 random non-zero BICOR values:')
    for rank, idx in enumerate(sample_idx, 1):
        print(f'  {rank:2d}. BICOR={bi_upper[idx]:+.6f}, Spearman={sp_upper[idx]:+.6f}, Pearson={pe_upper[idx]:+.6f}')
    print()
else:
    if len(nonzero_indices) == 0:
        print(f'✗ ERROR: BICOR matrix is completely zeros!')
        print(f'  no non-zero values found')
    else:
        print(f'⚠️  only {len(nonzero_indices)} non-zero BICOR values exist')
        print(f'showing all non-zero values:')
        for rank, idx in enumerate(nonzero_indices[:10], 1):
            print(f'  {rank:2d}. BICOR={bi_upper[idx]:+.6f}, Spearman={sp_upper[idx]:+.6f}, Pearson={pe_upper[idx]:+.6f}')
    print()

# ==============================================================================
# FIND HIGH-CORRELATION PAIRS
# ==============================================================================

print(f'{"─"*80}')
print(f'TOP 5 HIGHEST CORRELATIONS BY METHOD\n')

#top by Spearman
top5_sp_idx = np.argsort(sp_upper)[-5:][::-1]
print(f'Top 5 by Spearman:')
for rank, idx in enumerate(top5_sp_idx, 1):
    print(f'  {rank}. Spearman={sp_upper[idx]:+.6f}, BICOR={bi_upper[idx]:+.6f}, Pearson={pe_upper[idx]:+.6f}')
print()

#top by BICOR
top5_bi_idx = np.argsort(bi_upper)[-5:][::-1]
print(f'Top 5 by BICOR:')
for rank, idx in enumerate(top5_bi_idx, 1):
    print(f'  {rank}. BICOR={bi_upper[idx]:+.6f}, Spearman={sp_upper[idx]:+.6f}, Pearson={pe_upper[idx]:+.6f}')
print()

#top by Pearson
top5_pe_idx = np.argsort(pe_upper)[-5:][::-1]
print(f'Top 5 by Pearson:')
for rank, idx in enumerate(top5_pe_idx, 1):
    print(f'  {rank}. Pearson={pe_upper[idx]:+.6f}, Spearman={sp_upper[idx]:+.6f}, BICOR={bi_upper[idx]:+.6f}')
print()

# ==============================================================================
# FINAL DIAGNOSIS
# ==============================================================================

print(f'{"─"*80}')
print(f'FINAL DIAGNOSIS\n')

if bicor_zeros_pct > 50:
    print(f'✗ BICOR MATRIX IS BROKEN')
    print(f'  {bicor_zeros_pct:.1f}% values are zero (should be <1%)')
    print(f'  BICOR calculation has a bug that needs fixing')
    print(f'  ACTION: Fix calculate_bicor_correlation() and re-run TripleNegative')
elif corr_sp_bi < 0.70:
    print(f'⚠️  BICOR RESULTS ARE SUSPICIOUS')
    print(f'  correlation with Spearman is {corr_sp_bi:.4f} (should be >0.85)')
    print(f'  methods disagree significantly')
    print(f'  ACTION: Investigate BICOR function implementation')
elif len(nonzero_indices) == 0:
    print(f'✗ BICOR MATRIX IS COMPLETELY ZERO')
    print(f'  no non-zero correlations found')
    print(f'  ACTION: This is a critical bug, fix and re-run')
else:
    print(f'✓ BICOR MATRIX APPEARS VALID')
    print(f'  zeros: {bicor_zeros_pct:.2f}% (normal)')
    print(f'  agreement with Spearman: {corr_sp_bi:.4f} (good)')
    print(f'  ACTION: Proceed with remaining 5 groups')

print(f'\n{"="*80}\n')

# ==============================================================================
# END OF DIAGNOSTIC
# ==============================================================================


                DIAGNOSTIC: BICOR INVESTIGATION (SIMPLIFIED V2)                 

loading matrices...

✓ loaded all matrices

upper triangle size: 4,498,500 values

────────────────────────────────────────────────────────────────────────────────
KEY QUESTION: Is BICOR mostly zeros?

BICOR analysis:
  total values:      4,498,500
  zeros:             4,494,035 (99.90%)
  non-zeros:         4,465 (0.10%)

   this is ABNORMAL and indicates a problem.

────────────────────────────────────────────────────────────────────────────────
DISTRIBUTION COMPARISON

Spearman:
  mean:        +0.020614
  median:      +0.009852
  std:         0.042631
  min:         -0.494386
  max:         +0.837591
  % zeros:     0.00%

BICOR:
  mean:        +0.000039
  median:      +0.000000
  std:         0.004782
  min:         -0.372104
  max:         +0.714210
  % zeros:     99.90%

Pearson:
  mean:        +0.003713
  median:      -0.001561
  std:         0.031226
  min:         -0.575845
  max:         +0.9857

## SECTION 3.2: ROBUST CORRELATION CALCULATION

- PURPOSE: calculated robust correlations (Pearson, Spearman, Kendall) for all
- 6 patient groups, with filtering, progress monitoring, and validation

- INPUT: processed_datasets (dict with 6 AnnData objects)
- OUTPUT: robust_correlations (dict with correlation matrices per group)

why several correlations?

If we used only Spearman (as earlier)):

    Gene A vs Gene B might show correlation = 0.7 (strong)

BUT what if:
  - One outlier cell inflates the correlation?
  - The relationship isn't truly monotonic?
  - Random noise coincidentally aligns?
Single metric = single point of failure! 

| Metric   | Detects                            | Robust To                      | Example                       |
| -------- | ---------------------------------- | ------------------------------ | ----------------------------- |
| Spearman | Monotonic rank relationships       | Outliers (somewhat)            | Most genes rise/fall together |
| Kendall  | Pairwise concordance (ranks agree) | Extreme outliers (very robust) | Genes move in same direction  |
| Pearson  | Linear relationships               | Nothing (can be fooled)        | Perfect linear relationship   |


based on these articeles i decided to change my approach as the following code shows: 

    https://doi.org/10.1038/s41598-021-01840-z

    https://doi.org/10.3390/genes10120962

Why averaging them? (from the article, but why i think it is right?)

Robust Correlation = (0.40 × Spearman + 0.35 × Kendall + 0.25 × Pearson)

This means:
- If all 3 agree → strong correlation
- If they disagree → weaker average
- Reduces noise from any single metric
- More reproducible across datasets
- Better for identifying true co-expression

Why these exact values?

0.40 × Spearman    ← HIGHEST (most established in genomics)
   → Most used in scRNA-seq literature
   → Industry standard for co-expression
   → Rank-based (handles sparse data well)

0.35 × Kendall     ← HIGH (very robust)
   → More robust to outliers than Spearman
   → Measure of concordance (pairs moving together)
   → Slightly slower to compute but trustworthy

0.25 × Pearson     ← LOWER (for validation only)
   → Catches linear relationships
   → Can be fooled by outliers
   → But good as sanity check
   → If Pearson agrees with others → strong signal


- Kendall takes too much time and effort, i will try using BIOCOR

why not the same values?

Equal weighting = treating all metrics as equally trustworthy
Weighted = reflecting our knowledge:
   - Spearman most reliable for scRNA-seq
   - Kendall very robust
   - Pearson useful but less robust



In [None]:
#DIAGNOSTIC: Check correlation matrices
print(f'{"CORRELATION MATRIX DIAGNOSTICS":^80}\n')

for group_name, corr_dict in robust_correlations.items():
    spearman = corr_dict['spearman']
    kendall = corr_dict['kendall']
    pearson = corr_dict['pearson']
    
    print(f'{group_name}:')
    print(f'  Spearman shape: {spearman.shape} (should be 3000×3000)')
    print(f'  Spearman diagonal: {np.diag(spearman)[:5]} (should be all 1.0)')
    print(f'  Spearman range: [{spearman.min():.4f}, {spearman.max():.4f}] (should be -1 to 1)')
    print(f'  Spearman NaNs: {np.isnan(spearman).sum()} (should be 0)')
    print(f'  Kendall range: [{kendall.min():.4f}, {kendall.max():.4f}]')
    print(f'  Pearson range: [{pearson.min():.4f}, {pearson.max():.4f}]')
    print()

print(f'OK All correlation matrices look good!\n')


## SUBSECTION 3.3: Metric Comparison Visualization

In [None]:
print(f'generating metric comparison visualizations', end=' ', flush=True)

#selected first group for visualization
group_name = list(robust_correlations.keys())[0]
corrs = robust_correlations[group_name]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

#spearman vs kendall
ax = axes[0, 0]
ax.scatter(corrs['spearman'].flatten(), corrs['kendall'].flatten(), alpha=0.1, s=1)
ax.set_xlabel('Spearman')
ax.set_ylabel('Kendall')
ax.set_title(f'{group_name}: Spearman vs Kendall')

#spearman vs pearson
ax = axes[0, 1]
ax.scatter(corrs['spearman'].flatten(), corrs['pearson'].flatten(), alpha=0.1, s=1)
ax.set_xlabel('Spearman')
ax.set_ylabel('Pearson')
ax.set_title(f'{group_name}: Spearman vs Pearson')

#distribution comparison
ax = axes[1, 0]
ax.hist(corrs['spearman'].flatten(), bins=100, alpha=0.5, label='Spearman')
ax.hist(corrs['kendall'].flatten(), bins=100, alpha=0.5, label='Kendall')
ax.hist(corrs['pearson'].flatten(), bins=100, alpha=0.5, label='Pearson')
ax.set_xlabel('Correlation Value')
ax.set_ylabel('Frequency')
ax.set_title(f'{group_name}: Correlation Distributions')
ax.legend()

#robust vs spearman
ax = axes[1, 1]
ax.scatter(corrs['spearman'].flatten(), corrs['robust'].flatten(), alpha=0.1, s=1)
ax.set_xlabel('Spearman')
ax.set_ylabel('Robust (Weighted)')
ax.set_title(f'{group_name}: Robust vs Spearman')

plt.tight_layout()
out_file = PHASE3_DIR / 'correlation_metrics' / 'metric_comparison.png'
plt.savefig(out_file, dpi=300, bbox_inches='tight')
plt.close()

print(f'OK')

print(f'\nOK Section 3 complete\n')

In [None]:
print(f'CHECKPOINT: Phase 3 Progress')

print(f' SECTION 1: Setup & Data Integration - COMPLETE')
print(f'   - 6 datasets loaded and preprocessed')
print(f'   - Log normalization + HVG filtering applied\n')

print(f' SECTION 2: Exploratory Analysis - COMPLETE')
print(f'   - Sample quality assessment')
print(f'   - Gene detection & zero inflation')
print(f'   - PCA & dimensionality reduction')
print(f'   - Expression distribution')
print(f'   - HVG overlap analysis\n')

print(f' SECTION 3: Robust Correlations - COMPLETE')
print(f'   - Spearman, Kendall, Pearson calculated')
print(f'   - Weighted robust averaging (40/35/25)')
print(f'   - Metric comparison visualizations\n')

print(f' NEXT: Soft power detection (SECTION 4)')
print(f' THEN: Network construction (SECTION 5)')
print(f' THEN: Gene enrichment (SECTION 6)')
print(f' THEN: Expression directionality (SECTION 7)')
print(f' THEN: Hub analysis (SECTION 8)')
print(f' THEN: Network comparison (SECTION 9)')
print(f' THEN: Publication figures (SECTION 10)')
print(f' THEN: Therapeutic targets (SECTION 11)\n')