# Environment Set Up

Installs Python 3.10 alongside the default Colab runtime, clones the repository and installs the pinned dependencies. Installs mutually compatible versions of JAX (CUDA), NumPyro and the NVIDIA CUDA wheels.
Creates a `py310cuda` launcher that runs Python 3.10 with the library path pointing at those CUDA libraries.

## Directory
Clones the GitHub repository and mounts Google Drive to store results.

In [None]:
import os, subprocess, sys, stat, textwrap, pathlib, shutil, json

REPO = "/content/sgfa_qmap-pd"

In [16]:
# Start from a fresh checkout: discard any earlier clone under /content,
# clone the repository again, then make the checkout the working directory.
workspace = "/content"
os.chdir(workspace)
subprocess.run(["rm", "-rf", "sgfa_qmap-pd"])
clone_cmd = ["git", "clone", "https://github.com/meeramads/sgfa_qmap-pd.git"]
subprocess.check_call(clone_cmd)
os.chdir(REPO)

In [None]:
from google.colab import drive
from pathlib import Path
drive.mount('/content/drive')

In [None]:
# Folder on Google Drive that holds all persisted results.
drive_path = "/content/drive/MyDrive/sgfa_results"

if not os.path.exists(drive_path):
    # Nothing there yet: create the folder.
    print("Creating new 'sgfa_results' directory")
    os.makedirs(drive_path)
elif os.path.isfile(drive_path):
    # A plain file is occupying the name: move it aside, then create the folder.
    print("Found a FILE named 'sgfa_results' - renaming it to 'sgfa_results_backup'")
    shutil.move(drive_path, "/content/drive/MyDrive/sgfa_results_backup")
    os.makedirs(drive_path)
elif os.path.isdir(drive_path):
    # Folder already exists: report what is in it and keep it as-is.
    print("Found existing DIRECTORY 'sgfa_results' - checking contents...")
    existing_items = list(os.listdir(drive_path))
    if not existing_items:
        print("   Directory is empty - ready to use")
    else:
        preview = existing_items[:5]
        ellipsis = '...' if len(existing_items) > 5 else ''
        print(f"   Contains {len(existing_items)} items: {preview}{ellipsis}")
        print("   Will preserve existing results and add new ones")

In [None]:
# Location of the results symlink, relative to the current working directory
# (the repository checkout once the clone cell has run).
results_link = "../results"

# Clear whatever currently occupies `results_link` so a fresh symlink can be
# created afterwards.
#
# islink() is tested on its own, *before* any exists() check: exists() follows
# symlinks and returns False for a dangling link (target missing, e.g. Drive
# not mounted), which would leave the stale link in place and make the later
# os.symlink() call fail with FileExistsError.
if os.path.islink(results_link):
    print("Removing existing symlink")
    os.unlink(results_link)
elif os.path.isdir(results_link):
    print("Found existing results directory - backing up first")
    if os.listdir(results_link):
        backup_base = "/content/drive/MyDrive/sgfa_results_local_backup"
        # Pick an unused backup name: if the destination already exists,
        # shutil.move would nest the directory inside it (or raise when the
        # nested name is taken too) instead of creating a clean backup.
        backup_path = backup_base
        suffix = 1
        while os.path.exists(backup_path):
            backup_path = f"{backup_base}_{suffix}"
            suffix += 1
        shutil.move(results_link, backup_path)
        print(f"   Moved to {backup_path}")
    else:
        # Empty directory: nothing worth keeping.
        shutil.rmtree(results_link)

In [None]:
# Create `results_link` as a symlink whose target is the Drive folder `drive_path`,
# so anything written through the link is stored on Google Drive.
# os.symlink raises FileExistsError if `results_link` is still occupied.
print("Creating symlink to Google Drive...")
os.symlink(drive_path, results_link)

In [None]:
print("Setup complete! Testing...")
test_dir = f"{results_link}/test_directory"
os.makedirs(test_dir, exist_ok=True)

In [22]:
with open(f"{test_dir}/test_file.txt", "w") as f:
    f.write("This file should persist across disconnections!")

In [None]:
if os.path.exists(f"{drive_path}/test_directory/test_file.txt"):
    print("SUCCESS: Files will now persist across disconnections!")
    print(f"Results location: {drive_path}")
    print(f"Symlink: {results_link} -> {drive_path}")
else:
    print("ERROR: Symlink setup failed")

In [None]:
shutil.rmtree(test_dir)
print("Cleaned up test files")

## Python and CUDA environment

Must be connected to a GPU runtime. Sets up the Python 3.10 environment together with the JAX and CUDA libraries so that they work with the GPU. **Has to be run before any experiments.**

In [None]:
# Install Python 3.10 side-by-side
subprocess.check_call(["wget","-q","https://github.com/korakot/kora/releases/download/v0.10/py310.sh"])
subprocess.check_call(["bash","./py310.sh","-b","-f","-p","/usr/local"])
subprocess.check_call(["python3.10","-V"])

In [None]:
# Install dependencies
subprocess.check_call(["python3.10","-m","pip","install","-U","pip"])
subprocess.check_call(["python3.10","-m","pip","install","-r","requirements.txt"])

In [None]:
# Set up JAX (CUDA) + NumPyro to ensure compatibility
# Step 1: remove any jax/jaxlib already present in the Python 3.10 environment.
# The return code is not checked here, so this step never aborts the cell.
subprocess.run(["python3.10","-m","pip","uninstall","-y","jax","jaxlib"])
# Step 2: install the pinned CUDA 12 build of JAX (0.4.20), resolving wheels via
# the JAX CUDA releases index passed with -f.
subprocess.check_call([
    "python3.10","-m","pip","install","-U",
    "jax[cuda12_pip]==0.4.20","-f","https://storage.googleapis.com/jax-releases/jax_cuda_releases.html"
])
# Step 3: install the pinned NumPyro release (0.13.2).
subprocess.check_call(["python3.10","-m","pip","install","numpyro==0.13.2"])

In [None]:
# NVIDIA CUDA libraries into the *py310* site-packages
subprocess.check_call(["python3.10","-m","pip","install","-q",
    "nvidia-cudnn-cu12>=8.9,<9",
    "nvidia-cublas-cu12>=12.2",
    "nvidia-cuda-runtime-cu12>=12.2",
    "nvidia-cusolver-cu12>=11.4",
    "nvidia-cusparse-cu12>=12.1",
    "nvidia-cufft-cu12>=11.0",
    "nvidia-cuda-cupti-cu12>=12.2",
    "nvidia-nvjitlink-cu12>=12.2",
    "nvidia-nccl-cu12>=2.18",
])

In [29]:
# Build LD_LIBRARY_PATH for those wheels and write a launcher
#
# The launcher (/usr/local/bin/py310cuda) is a small bash script that exports the
# library path and XLA/JAX settings and then exec's python3.10 with the caller's
# arguments.

# Ask the Python 3.10 interpreter where its site-packages directory is.
py310_site = subprocess.check_output(
    ["python3.10", "-c", "import site; print(site.getsitepackages()[0])"],
    text=True,
).strip()
subdirs = ["cudnn/lib", "cublas/lib", "cufft/lib", "cusolver/lib", "cusparse/lib",
           "cuda_runtime/lib", "cuda_cupti/lib", "nvjitlink/lib", "nccl/lib"]
nvidia_root = os.path.join(py310_site, "nvidia")
lib_paths = [os.path.join(nvidia_root, d) for d in subdirs]
# Keep only the directories that actually exist.
lib_paths = [p for p in lib_paths if os.path.isdir(p)]
if not lib_paths:
    # Previously this case passed silently and produced a launcher with an
    # empty library path.
    print(f"WARNING: no NVIDIA CUDA library directories found under {nvidia_root}; "
          "run the NVIDIA wheel install cell first")
LD = ":".join(lib_paths)

wrapper = "/usr/local/bin/py310cuda"
launcher_lines = ["#!/bin/bash"]
if LD:
    # ${VAR:+:$VAR} appends the inherited value only when it is set and
    # non-empty. This avoids a leading/trailing ':' in LD_LIBRARY_PATH, which the
    # dynamic loader interprets as "current working directory".
    launcher_lines.append(
        f'export LD_LIBRARY_PATH="{LD}${{LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}}"'
    )
launcher_lines += [
    "export XLA_PYTHON_CLIENT_PREALLOCATE=false",
    "export XLA_PYTHON_CLIENT_MEM_FRACTION=0.70",
    "export JAX_PLATFORM_NAME=gpu",
    'exec python3.10 "$@"',
]
pathlib.Path(wrapper).write_text("\n".join(launcher_lines) + "\n")
# Add the owner-execute bit so the launcher can be invoked by name.
os.chmod(wrapper, os.stat(wrapper).st_mode | stat.S_IEXEC)

In [None]:
# Verify JAX sees the GPU
subprocess.check_call([
    "py310cuda","-c",
    "import jax; print('backend:', jax.lib.xla_bridge.get_backend().platform, '| devices:', jax.devices())"
])

In [None]:
# Double check
!py310cuda -c "import jax; print('backend:', jax.lib.xla_bridge.get_backend().platform, '| devices:', jax.devices())"

---

# Training the model

Call `!py310cuda run_analysis.py` with the flag `--device gpu`.

Run `!py310cuda run_analysis.py --help || py310cuda run_analysis.py -h` for detailed information on the other available flags.

In [None]:
!git pull

In [None]:
# Delete any `results` entry in the current working directory (return code not checked).
# NOTE(review): this targets `results`, whereas the setup cells create the Drive symlink
# at `../results` - confirm which of the two is meant to be cleared here.
subprocess.run(["rm","-rf","results"])

In [None]:
!ls

In [None]:
# Lightweight smoke test for synthetic data
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 3 \
  --num-samples 100 \
  --num-warmup 50 \
  --num-chains 1 \
  --num-runs 1 \
  --percW 33 \
  --seed 42 \
  --device gpu

In [None]:
# Preprocessing smoke test: short qMAP-PD run (K=3, 100 samples, 50 warmup, 1 chain,
# 1 run) with preprocessing enabled and variance-based selection of 100 features.
# NOTE(review): this cell spells options with underscores (--enable_preprocessing,
# --feature_selection, --n_top_features) while the "Advanced Analysis" cells use hyphens
# (--enable-preprocessing, --feature-selection, --n-top-features). If run_analysis.py
# uses argparse these are different option strings - confirm which spelling it accepts.
!py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 3 \
      --num-samples 100 \
      --num-warmup 50 \
      --num-chains 1 \
      --num-runs 1 \
      --enable_preprocessing \
      --feature_selection variance \
      --n_top_features 100 \
      --device gpu \
      --seed 42

In [None]:
# Cross-validation smoke test: short synthetic run (K=3, 50 samples, 25 warmup, 1 chain)
# with the CV-only switch and 3 folds.
# NOTE(review): uses --cv_only / --cv_folds (underscores); the "Cross-validation
# demonstration" cell uses --run-cv / --cv-folds (hyphens). Confirm against
# run_analysis.py which spelling is valid.
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 3 \
  --num-samples 50 \
  --num-warmup 25 \
  --num-chains 1 \
  --cv_only \
  --cv_folds 3 \
  --device gpu \
  --seed 42

In [None]:
# Lightweight smoke test for qMAP-PD data
!py310cuda run_analysis.py \
  --dataset qmap_pd \
  --K 3 \
  --num-samples 100 \
  --num-warmup 50 \
  --num-chains 1 \
  --num-runs 1 \
  --device gpu

In [None]:
# Factor recovery demonstration
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 5 \
  --num-samples 1000 \
  --num-warmup 500 \
  --num-chains 2 \
  --num-runs 3 \
  --percW 33 \
  --device gpu \
  --seed 42

In [None]:
# Sparsity level comparison
#
# Runs the synthetic analysis once per sparsity level (passed as --percW) and
# collects a one-line Success/Failed summary per level.
sparsity_levels = [25, 50, 75]
results_summary = []

for percW in sparsity_levels:
    print(f"\n Running with {percW}% sparsity...")

    # Output is captured (not streamed), so nothing is shown while the run is in progress.
    result = subprocess.run([
        "py310cuda", "run_analysis.py",
        "--dataset", "synthetic",
        "--K", "5",
        "--num-samples", "800",
        "--num-warmup", "400",
        "--num-chains", "2",
        "--num-runs", "2",
        "--percW", str(percW),
        "--device", "gpu",
        "--seed", "42"
    ], capture_output=True, text=True)

    if result.returncode == 0:
        print(f"{percW}% sparsity completed successfully")
        results_summary.append(f"{percW}% sparsity: Success")
    else:
        print(f"{percW}% sparsity failed (exit code {result.returncode})")
        # The captured output would otherwise be thrown away; show its tail so
        # the failure can be diagnosed.
        captured = result.stderr or result.stdout or ""
        for line in captured.strip().splitlines()[-15:]:
            print(f"   | {line}")
        results_summary.append(f"{percW}% sparsity: Failed")

print("\n Sparsity Comparison Summary:")
# Separate loop name so the last CompletedProcess in `result` is not overwritten.
for summary_line in results_summary:
    print(f"   {summary_line}")
print(" Check ../results/synthetic/ folders for comparison plots")


In [None]:
# Basic qMAP-PD analysis
try:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 15 \
      --num-samples 2000 \
      --num-warmup 1000 \
      --num-chains 3 \
      --num-runs 3 \
      --percW 33 \
      --device gpu \
      --seed 42

    print("\n Basic qMAP-PD analysis completed!")
    print("Check ../results/qmap_pd/ for neuroimaging factor analysis results")

except Exception as e:
    print(f"\n qMAP-PD analysis failed: {e}")
    print("This might be due to missing qMAP-PD dataset")

In [None]:
# Cross-validation demonstration
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 5 \
  --num-samples 800 \
  --num-warmup 400 \
  --num-chains 2 \
  --run-cv \
  --cv-folds 5 \
  --device gpu \
  --seed 42

## Advanced Analysis

These experiments are computationally intensive and combine several advanced machine learning methods.

In [None]:
# qMAP-PD analysis with advanced preprocessing
proceed = input("This is a computationally intensive analysis. Proceed? (y/N): ")
if proceed.lower() in ['y', 'yes']:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 20 \
      --num-samples 3000 \
      --num-warmup 1500 \
      --num-chains 4 \
      --num-runs 5 \
      --percW 33 \
      --enable-preprocessing \
      --feature-selection statistical \
      --n-top-features 500 \
      --imputation-strategy knn \
      --optimize-preprocessing \
      --cross-validate-sources \
      --device gpu \
      --seed 42

    print("\n Advanced preprocessing analysis completed!")
    print(" Results include preprocessing optimization reports")
else:
    print("Skipping intensive preprocessing analysis.")

In [None]:
# Nested cross-validation
proceed = input("This is the most computationally intensive analysis. Proceed? (y/N): ")
if proceed.lower() in ['y', 'yes']:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 20 \
      --num-samples 2000 \
      --num-warmup 1000 \
      --num-chains 3 \
      --nested-cv \
      --cv-folds 5 \
      --enable-preprocessing \
      --feature-selection combined \
      --optimize-preprocessing \
      --create-factor-maps \
      --create-comprehensive-viz \
      --device gpu \
      --seed 42

    print("\n Nested cross-validation analysis completed!")
    print(" Complete validation results with hyperparameter optimization")
    print(" Factor-to-brain mapping results generated")
else:
    print("Skipping nested cross-validation analysis.")

In [None]:
# Full comprehensive pipeline
proceed = input("This is the complete analysis pipeline. Proceed? (y/N): ")
if proceed.lower() in ['y', 'yes']:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 25 \
      --num-samples 5000 \
      --num-warmup 2500 \
      --num-chains 4 \
      --num-runs 5 \
      --percW 33 \
      --enable-preprocessing \
      --feature-selection combined \
      --n-top-features 1000 \
      --imputation-strategy iterative \
      --optimize-preprocessing \
      --cross-validate-sources \
      --run-cv \
      --cv-folds 7 \
      --create-factor-maps \
      --create-comprehensive-viz \
      --device gpu \
      --seed 42

    print("\n Comprehensive analysis completed!")
    print(" Results generated")
else:
    print("Skipping comprehensive analysis.")

# Optimizing the Sparse GFA Pipeline 
### Training on qMAP-PD data

Modular format to optimize different pipeline elements using several GPUs in serial or in parallel.

In [None]:
# =============================================================================
# MODULE 0: QUICK VALIDATION (Run this first anywhere)
# =============================================================================

def run_quick_validation():
    print("MODULE 0: QUICK VALIDATION")
    print("-" * 40)
    
    print("Synthetic validation test...")
    !py310cuda run_analysis.py \
        --dataset synthetic \
        --K 3 \
        --num-samples 200 \
        --num-warmup 100 \
        --num-chains 1 \
        --num-runs 1 \
        --device gpu \
        --seed 42
    
    print("qMAP-PD basic test...")
    !py310cuda run_analysis.py \
        --dataset qmap_pd \
        --K 5 \
        --num-samples 300 \
        --num-warmup 150 \
        --num-chains 1 \
        --num-runs 1 \
        --device gpu \
        --seed 42
    
    print("Validation complete - pipeline is working!")

# Uncomment to run:
# run_quick_validation()

In [None]:
# =============================================================================
# MODULE 1: PREPROCESSING OPTIMIZATION
# Moderate computational load
# =============================================================================

def run_preprocessing_optimization():
    """Run three preprocessing sweeps on qMAP-PD and write ``module1_results.json``.

    The sweeps vary, in turn, the imputation strategy, the feature selection
    method and the number of top features. Every run is launched with an
    IPython ``!`` shell command (so this only works inside IPython/Colab);
    ``$name`` in a command is replaced by IPython with the value of the Python
    variable ``name``.

    The exit status of the runs is not inspected: the JSON summary written at
    the end is a fixed checklist and is saved with ``"completed": True``
    whether or not the individual runs succeeded.

    NOTE(review): the commands here spell options with underscores
    (``--enable_preprocessing``, ``--imputation_strategy``, ...) while the
    "Advanced Analysis" cells use hyphens (``--enable-preprocessing``, ...).
    Confirm against run_analysis.py which spelling is accepted.
    """
    print("MODULE 1: PREPROCESSING OPTIMIZATION")
    print("-" * 40)
    
    # Test 1: Imputation strategies
    imputation_strategies = ['median', 'knn', 'iterative']
    
    print("Testing imputation strategies...")
    for strategy in imputation_strategies:
        print(f"   Testing {strategy} imputation...")
        
        # Only --imputation_strategy changes between iterations.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K 10 \
            --num-samples 800 \
            --num-warmup 400 \
            --num-chains 1 \
            --num-runs 1 \
            --enable_preprocessing \
            --imputation_strategy $strategy \
            --feature_selection variance \
            --n_top_features 300 \
            --device gpu \
            --seed 42
    
    # Test 2: Feature selection methods
    feature_methods = ['variance', 'statistical', 'combined']
    
    print("\n Testing feature selection...")
    for method in feature_methods:
        print(f"   Testing {method} selection...")
        
        # Imputation is held at knn; only --feature_selection changes.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K 10 \
            --num-samples 800 \
            --num-warmup 400 \
            --num-chains 1 \
            --num-runs 1 \
            --enable_preprocessing \
            --imputation_strategy knn \
            --feature_selection $method \
            --n_top_features 300 \
            --device gpu \
            --seed 42
    
    # Test 3: Feature counts
    feature_counts = [200, 400, 600]
    
    print("\n Testing feature counts...")
    for n_features in feature_counts:
        print(f"   Testing {n_features} features...")
        
        # knn imputation + combined selection are held fixed; only --n_top_features changes.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K 10 \
            --num-samples 800 \
            --num-warmup 400 \
            --num-chains 1 \
            --num-runs 1 \
            --enable_preprocessing \
            --imputation_strategy knn \
            --feature_selection combined \
            --n_top_features $n_features \
            --device gpu \
            --seed 42
    
    print("\n MODULE 1 COMPLETE")
    print("Next: Compare results and choose best preprocessing")
    
    # Save recommendations file
    # (static content: a checklist of follow-up steps, not measured results)
    recommendations = {
        "module": "preprocessing_optimization",
        "completed": True,
        "next_steps": [
            "Compare CV scores across imputation strategies",
            "Check factor interpretability with different feature selection",
            "Choose optimal feature count based on performance",
            "Update BEST_* parameters in Module 2"
        ],
        "results_location": "../results/qmap_pd/",
        "key_metrics": ["mean CV R²", "factor loading interpretability", "convergence quality"]
    }
    
    # Written to the current working directory.
    with open("module1_results.json", "w") as f:
        json.dump(recommendations, f, indent=2)
    
    print("Results summary saved to: module1_results.json")

# Uncomment to run:
# run_preprocessing_optimization()

In [None]:
# =============================================================================
# MODULE 2: HYPERPARAMETER OPTIMIZATION
# Relatively intensive
# =============================================================================

def run_hyperparameter_optimization():
    """Run three hyperparameter sweeps on qMAP-PD and write ``module2_results.json``.

    The sweeps vary, in turn, the number of factors (``--K``), the sparsity
    level (``--percW``) and the MCMC configuration (chains x samples). The
    preprocessing settings are taken from the ``BEST_*`` constants defined at
    the top of the function, which must be edited by hand.

    Every run is launched with an IPython ``!`` shell command (IPython/Colab
    only). ``$name`` is replaced by the value of the Python variable ``name``
    and ``{expr}`` by the value of the Python expression ``expr``. Exit
    statuses are not inspected; the JSON summary is written with
    ``"completed": True`` regardless of how the runs ended.

    NOTE(review): option names use underscores here but hyphens in the
    "Advanced Analysis" cells - confirm the accepted spelling in run_analysis.py.
    """
    print("MODULE 2: HYPERPARAMETER OPTIMIZATION")
    print("-" * 40)
    
    # Use best preprocessing from Module 1 (update these!)
    BEST_IMPUTATION = "knn"              # Update from Module 1
    BEST_FEATURE_SELECTION = "combined"  # Update from Module 1
    BEST_N_FEATURES = 400                # Update from Module 1
    
    # Test 1: Number of factors
    K_values = [10, 15, 20, 25]
    
    print("Testing number of factors...")
    for K in K_values:
        print(f"   Testing K={K}...")
        
        # No --percW is passed in this sweep.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K $K \
            --num-samples 1500 \
            --num-warmup 750 \
            --num-chains 2 \
            --num-runs 2 \
            --enable_preprocessing \
            --imputation_strategy $BEST_IMPUTATION \
            --feature_selection $BEST_FEATURE_SELECTION \
            --n_top_features $BEST_N_FEATURES \
            --device gpu \
            --seed 42
    
    # Test 2: Sparsity levels
    sparsity_levels = [25, 33, 50]
    
    print("\n Testing sparsity levels...")
    for percW in sparsity_levels:
        print(f"   Testing {percW}% sparsity...")
        
        # NOTE(review): K is fixed at 18 here, which is not one of the K_values
        # tested above - confirm this is intentional.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K 18 \
            --num-samples 1500 \
            --num-warmup 750 \
            --num-chains 2 \
            --num-runs 2 \
            --percW $percW \
            --enable_preprocessing \
            --imputation_strategy $BEST_IMPUTATION \
            --feature_selection $BEST_FEATURE_SELECTION \
            --n_top_features $BEST_N_FEATURES \
            --device gpu \
            --seed 42
    
    # Test 3: MCMC configurations
    mcmc_configs = [
        {"chains": 2, "samples": 2000},
        {"chains": 3, "samples": 1500}, 
        {"chains": 4, "samples": 1200}
    ]
    
    print("\n Testing MCMC configurations...")
    for config in mcmc_configs:
        chains = config["chains"]
        samples = config["samples"]
        print(f"   Testing {chains} chains × {samples} samples...")
        
        # Warmup is set to half the number of samples via {samples//2}.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K 15 \
            --num-samples $samples \
            --num-warmup {samples//2} \
            --num-chains $chains \
            --num-runs 1 \
            --percW 33 \
            --enable_preprocessing \
            --imputation_strategy $BEST_IMPUTATION \
            --feature_selection $BEST_FEATURE_SELECTION \
            --n_top_features $BEST_N_FEATURES \
            --device gpu \
            --seed 42
    
    print("\n MODULE 2 COMPLETE")
    
    # Save recommendations
    # (records the BEST_* inputs used plus a static follow-up checklist)
    recommendations = {
        "module": "hyperparameter_optimization", 
        "completed": True,
        "best_from_module1": {
            "imputation": BEST_IMPUTATION,
            "feature_selection": BEST_FEATURE_SELECTION,
            "n_features": BEST_N_FEATURES
        },
        "next_steps": [
            "Compare log-likelihood across K values",
            "Check factor stability across sparsity levels", 
            "Choose MCMC config with best convergence",
            "Update BEST_* parameters in Module 3"
        ]
    }
    
    # Written to the current working directory.
    with open("module2_results.json", "w") as f:
        json.dump(recommendations, f, indent=2)

# Uncomment to run:
# run_hyperparameter_optimization()

In [None]:
# =============================================================================
# MODULE 3: CROSS-VALIDATION
# =============================================================================

def run_cv_optimization():
    """Run the cross-validation sweeps on qMAP-PD and write ``optimal_config.json``.

    Sweeps the number of CV folds and the CV strategy, then optionally (after
    an interactive ``input()`` confirmation) runs nested CV. Settings come
    from the ``BEST_*`` constants at the top of the function, which must be
    edited by hand.

    Every run is launched with an IPython ``!`` shell command (IPython/Colab
    only); ``$name`` is replaced by the value of the Python variable ``name``.
    Exit statuses are not inspected. The file written at the end contains the
    ``BEST_*`` values plus hard-coded ``cv_folds``/``cv_type`` entries; it is
    not derived from the results of the runs.

    NOTE(review): option names use underscores here but hyphens in the
    "Advanced Analysis" cells - confirm the accepted spelling in run_analysis.py.
    """
    print("MODULE 3: CROSS-VALIDATION OPTIMIZATION")
    print("-" * 40)
    
    # Use best params from Modules 1&2 (update these!)
    BEST_IMPUTATION = "knn"        # From Module 1
    BEST_FEATURE_SELECTION = "combined"  # From Module 1
    BEST_N_FEATURES = 400          # From Module 1
    BEST_K = 15                    # From Module 2
    BEST_SPARSITY = 33             # From Module 2
    # BEST_CHAINS / BEST_SAMPLES are only stored in the output JSON below; the
    # runs in this function use fixed --num-chains / --num-samples values.
    BEST_CHAINS = 3                # From Module 2
    BEST_SAMPLES = 1500            # From Module 2
    
    # Test 1: CV fold numbers
    cv_folds = [3, 5, 7]
    
    print("Testing CV fold numbers...")
    for folds in cv_folds:
        print(f"   Testing {folds}-fold CV...")
        
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K $BEST_K \
            --num-samples 1000 \
            --num-warmup 500 \
            --num-chains 2 \
            --cv_only \
            --cv_folds $folds \
            --enable_preprocessing \
            --imputation_strategy $BEST_IMPUTATION \
            --feature_selection $BEST_FEATURE_SELECTION \
            --n_top_features $BEST_N_FEATURES \
            --percW $BEST_SPARSITY \
            --device gpu \
            --seed 42
    
    # Test 2: CV strategies
    cv_types = ['standard', 'stratified']
    
    print("\n Testing CV strategies...")
    for cv_type in cv_types:
        print(f"   Testing {cv_type} CV...")
        
        # Fold count is held at 5; only --cv_type changes.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K $BEST_K \
            --num-samples 1000 \
            --num-warmup 500 \
            --num-chains 2 \
            --cv_only \
            --cv_folds 5 \
            --cv_type $cv_type \
            --enable_preprocessing \
            --imputation_strategy $BEST_IMPUTATION \
            --feature_selection $BEST_FEATURE_SELECTION \
            --n_top_features $BEST_N_FEATURES \
            --percW $BEST_SPARSITY \
            --device gpu \
            --seed 42
    
    # Test 3: Nested CV (if you have time)
    print("\n Testing nested CV (optional - skip if time limited)...")
    proceed = input("Run nested CV? Adds computational demand (y/N): ")
    if proceed.lower() in ['y', 'yes']:
        # NOTE(review): unlike the two sweeps above, this command does not pass
        # --percW $BEST_SPARSITY - confirm whether that omission is intended.
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K $BEST_K \
            --num-samples 1200 \
            --num-warmup 600 \
            --num-chains 2 \
            --nested_cv \
            --cv_folds 4 \
            --enable_preprocessing \
            --imputation_strategy $BEST_IMPUTATION \
            --feature_selection $BEST_FEATURE_SELECTION \
            --n_top_features $BEST_N_FEATURES \
            --device gpu \
            --seed 42
    
    print("\n MODULE 3 COMPLETE")
    
    # Save final recommendations
    # (the "optimal_config" mapping is what Modules 4 and 5 read back)
    final_config = {
        "module": "cv_optimization",
        "completed": True,
        "optimal_config": {
            "imputation": BEST_IMPUTATION,
            "feature_selection": BEST_FEATURE_SELECTION, 
            "n_features": BEST_N_FEATURES,
            "K": BEST_K,
            "sparsity": BEST_SPARSITY,
            "chains": BEST_CHAINS,
            "samples": BEST_SAMPLES,
            "cv_folds": 5,  # Update based on results
            "cv_type": "standard"  # Update based on results
        },
        "ready_for": "Module 4 - Final optimized analysis"
    }
    
    # Written to the current working directory.
    with open("optimal_config.json", "w") as f:
        json.dump(final_config, f, indent=2)

# Uncomment to run:
# run_cv_optimization()

In [None]:
# =============================================================================
# MODULE 4: FINAL OPTIMIZED ANALYSIS
# Most intensive module
# =============================================================================

def run_final_analysis():
    print("MODULE 4: FINAL OPTIMIZED ANALYSIS")
    print("-" * 40)
    
    # Load optimal config (update these from previous modules!)
    try:
        with open("optimal_config.json", "r") as f:
            config = json.load(f)["optimal_config"]
    except:
        print("No optimal_config.json found - using defaults")
        print("Run Modules 1-3 first for best results")
        config = {
            "imputation": "knn",
            "feature_selection": "combined",
            "n_features": 400,
            "K": 15,
            "sparsity": 33,
            "chains": 3,
            "samples": 2000,
            "cv_folds": 5,
            "cv_type": "standard"
        }
    
    print(f" Using configuration: {config}")
    
    print("\n Running final optimized analysis...")
    !py310cuda run_analysis.py \
        --dataset qmap_pd \
        --K {config["K"]} \
        --num-samples {config["samples"]} \
        --num-warmup {config["samples"]//2} \
        --num-chains {config["chains"]} \
        --num-runs 3 \
        --percW {config["sparsity"]} \
        --enable_preprocessing \
        --imputation_strategy {config["imputation"]} \
        --feature_selection {config["feature_selection"]} \
        --n_top_features {config["n_features"]} \
        --run_cv \
        --cv_folds {config["cv_folds"]} \
        --cv_type {config["cv_type"]} \
        --create_factor_maps \
        --create_comprehensive_viz \
        --device gpu \
        --seed 42
    
    print("\n MODULE 4 COMPLETE - Final analysis done!")

# Uncomment to run:
# run_final_analysis()

In [None]:
# =============================================================================
# MODULE 5: BRAIN MAPPING & INTERPRETABILITY
# =============================================================================

def run_brain_mapping():
    print(" MODULE 5: BRAIN MAPPING & INTERPRETABILITY")
    print("-" * 40)
    
    # Load optimal config
    try:
        with open("optimal_config.json", "r") as f:
            config = json.load(f)["optimal_config"]
    except:
        config = {
            "imputation": "knn",
            "feature_selection": "combined", 
            "n_features": 400,
            "K": 15,
            "sparsity": 33
        }
    
    print("Generating detailed brain factor maps...")
    !py310cuda run_analysis.py \
        --dataset qmap_pd \
        --roi_views \
        --K {config["K"]} \
        --num-samples 1500 \
        --num-warmup 750 \
        --num-chains 2 \
        --num-runs 2 \
        --percW {config["sparsity"]} \
        --enable_preprocessing \
        --imputation_strategy {config["imputation"]} \
        --feature_selection {config["feature_selection"]} \
        --n_top_features {config["n_features"]} \
        --create_factor_maps \
        --device gpu \
        --seed 42
    
    print("\n Testing interpretability across factor counts...")
    for K in [10, 15, 20]:
        print(f"   Analyzing {K} factors...")
        !py310cuda run_analysis.py \
            --dataset qmap_pd \
            --K $K \
            --num-samples 1000 \
            --num-warmup 500 \
            --num-chains 2 \
            --num-runs 1 \
            --enable_preprocessing \
            --imputation_strategy {config["imputation"]} \
            --feature_selection {config["feature_selection"]} \
            --n_top_features {config["n_features"]} \
            --device gpu \
            --seed 42
    
    print("\n MODULE 5 COMPLETE - Brain mapping done!")

# Uncomment to run:
# run_brain_mapping()

In [None]:
# =============================================================================
# STANDALONE MODULES FOR SPECIFIC TESTS
# =============================================================================

def run_roi_comparison():
    """Run the same qMAP-PD analysis twice: without and with the ``--roi_views`` flag.

    The shared command-line arguments are kept in ``base_params`` and joined
    with spaces; IPython substitutes the ``{...}`` expression into the ``!``
    shell command (IPython/Colab only). Exit statuses are not inspected.
    """
    print("ROI STRATEGY COMPARISON")
    print("-" * 40)
    
    # Arguments common to both runs.
    base_params = [
        "--dataset", "qmap_pd",
        "--K", "15",
        "--num-samples", "1200",
        "--num-warmup", "600", 
        "--num-chains", "2",
        "--num-runs", "1",
        "--enable_preprocessing",
        "--device", "gpu",
        "--seed", "42"
    ]
    
    # Run 1: base arguments only (reported as "concatenated ROIs").
    print("Testing concatenated ROIs...")
    !py310cuda run_analysis.py {" ".join(base_params)}
    
    # Run 2: same arguments plus --roi_views (reported as "separate ROIs").
    print("Testing separate ROIs...")
    !py310cuda run_analysis.py --roi_views {" ".join(base_params)}
    
    print("ROI comparison complete")

def run_quick_synthetic_demo():
    """Run one synthetic-data analysis (K=5, 800 samples, 400 warmup, 2 chains, 1 run).

    Launched with an IPython ``!`` shell command, so it only works inside
    IPython/Colab. The exit status is not inspected; the closing message is
    printed regardless of the outcome.
    """
    print("QUICK SYNTHETIC DEMO")
    print("-" * 40)
    
    !py310cuda run_analysis.py \
        --dataset synthetic \
        --K 5 \
        --num-samples 800 \
        --num-warmup 400 \
        --num-chains 2 \
        --num-runs 1 \
        --percW 33 \
        --device gpu \
        --seed 42
    
    print("Synthetic demo complete")

In [None]:
# =============================================================================
# USAGE INSTRUCTIONS
# =============================================================================

print("""
MODULAR OPTIMIZATION USAGE:

COLAB USAGE:
   1. run_quick_validation()
   2. run_preprocessing_optimization()
   3. run_cv_optimization()
   5. run_brain_mapping()

UCL GPU USAGE:
   2. run_hyperparameter_optimization() # 30 mins - intensive
   4. run_final_analysis()              # 30 mins - most intensive

STANDALONE MODULES:
   • run_roi_comparison()      # 15 mins
   • run_quick_synthetic_demo() # 5 mins

WORKFLOW:
   1. Start with validation anywhere
   2. Run preprocessing on Colab
   3. Run hyperparameters on UCL GPUs
   4. Run CV anywhere  
   5. Run final analysis on UCL GPUs
   6. Run brain mapping anywhere

SHARING RESULTS:
   • Each module saves config to JSON files
   • Copy optimal_config.json between platforms
   • Results persist in ../results/ folders
""")

In [None]:
# =============================================================================
# PLATFORM-SPECIFIC RUNNERS
# =============================================================================

def colab_session():
    """Run the Colab-side modules in order: validation, preprocessing sweep, CV sweep."""
    print("COLAB SESSION", "=" * 40, sep="\n")
    run_quick_validation()
    run_preprocessing_optimization()
    run_cv_optimization()
    print("\n Colab session complete!", " Download optimal_config.json for SSH GPUs", sep="\n")

def ssh_session():
    """Run the heavy modules: hyperparameter sweep followed by the final analysis.

    When ``optimal_config.json`` is absent from the current working directory,
    the preprocessing sweep is run first.

    NOTE(review): in this notebook ``optimal_config.json`` is written by
    ``run_cv_optimization()``, not by ``run_preprocessing_optimization()``, so
    on the "no config" path the final analysis still falls back to its
    built-in defaults - confirm this is intended.
    """
    print("HEAVY-DUTY GPU SESSION", "=" * 40, sep="\n")

    # Check if we have optimal config from Colab
    config_missing = not os.path.exists("optimal_config.json")
    if config_missing:
        print("No optimal_config.json - running quick optimization first")
        run_preprocessing_optimization()
    else:
        print("Found optimal_config.json from previous session")

    run_hyperparameter_optimization()
    run_final_analysis()
    print("\n SSH session complete!")

# Uncomment for platform-specific sessions:
# colab_session()      # For Colab
# ssh_session()        # For UCL GPUs
