# Environment Set Up

Installs Python 3.10 alongside the default Colab interpreter, clones the repository and installs its pinned dependencies. Installs compatible versions of JAX (CUDA build) and NumPyro, together with the NVIDIA CUDA wheels they need.
Creates a `py310cuda` launcher pinned to the correct libraries.

## Directory
Clones the GitHub repository and mounts Google Drive to store results.

In [15]:
import os, subprocess, sys, stat, textwrap, pathlib, shutil

# Absolute path of the repository checkout inside the Colab VM.
REPO = "/content/sgfa_qmap-pd"

In [16]:
# Clone a fresh copy of the repository.
# Step out to the parent directory first so the working directory stays valid
# while any previous checkout is deleted.
os.chdir(os.path.dirname(REPO))
# Remove a stale checkout in-process instead of shelling out to `rm -rf`;
# ignore_errors keeps the "nothing to delete" case silent.
shutil.rmtree(REPO, ignore_errors=True)
# Clone into REPO explicitly so the target and the chdir below cannot drift apart.
subprocess.check_call(
    ["git", "clone", "https://github.com/meeramads/sgfa_qmap-pd.git", REPO]
)
os.chdir(REPO)

In [None]:
from google.colab import drive
from pathlib import Path
# Mount Google Drive under /content/drive; the results folder used below lives there.
drive.mount('/content/drive')

In [None]:
# Make sure a Drive folder exists to receive the results.
drive_path = "/content/drive/MyDrive/sgfa_results"

if not os.path.exists(drive_path):
    print("Creating new 'sgfa_results' directory")
    os.makedirs(drive_path)
elif os.path.isfile(drive_path):
    # A plain file is occupying the name: move it aside, then create the folder.
    print("Found a FILE named 'sgfa_results' - renaming it to 'sgfa_results_backup'")
    shutil.move(drive_path, "/content/drive/MyDrive/sgfa_results_backup")
    os.makedirs(drive_path)
elif os.path.isdir(drive_path):
    print("Found existing DIRECTORY 'sgfa_results' - checking contents...")
    entries = os.listdir(drive_path)
    if not entries:
        print("   Directory is empty - ready to use")
    else:
        # Show at most five entries, with a marker when more exist.
        preview = entries[:5]
        more_marker = "..." if len(entries) > 5 else ""
        print(f"   Contains {len(entries)} items: {preview}{more_marker}")
        print("   Will preserve existing results and add new ones")

In [None]:
# Clear whatever currently occupies the results link name.
results_link = "../results"

# Test islink() on its own rather than behind os.path.exists(): exists()
# follows symlinks and returns False for a dangling link (for example when
# Drive is not mounted), which would leave the stale link in place and make
# the later os.symlink() call fail because the name is already taken.
if os.path.islink(results_link):
    print("Removing existing symlink")
    os.unlink(results_link)
elif os.path.isdir(results_link):
    print("Found existing results directory - backing up first")
    if os.listdir(results_link):
        # Non-empty: keep the data by moving it onto Drive.
        backup_path = "/content/drive/MyDrive/sgfa_results_local_backup"
        shutil.move(results_link, backup_path)
        print(f"   Moved to {backup_path}")
    else:
        # Empty: nothing worth keeping, just remove it.
        shutil.rmtree(results_link)

In [None]:
print("Creating symlink to Google Drive...")
# os.symlink(target, link_name): results_link becomes a link pointing at drive_path.
os.symlink(drive_path, results_link)

In [None]:
print("Setup complete! Testing...")
# Create a scratch directory through the symlink path.
test_dir = f"{results_link}/test_directory"
os.makedirs(test_dir, exist_ok=True)

In [22]:
# Write a marker file through the symlink; the following cell looks for it
# under the Drive path.
marker = pathlib.Path(test_dir) / "test_file.txt"
with marker.open("w") as handle:
    handle.write("This file should persist across disconnections!")

In [None]:
# The marker was written via the symlink; it must be visible under the Drive path.
marker_on_drive = os.path.join(drive_path, "test_directory", "test_file.txt")
if not os.path.exists(marker_on_drive):
    print("ERROR: Symlink setup failed")
else:
    print("SUCCESS: Files will now persist across disconnections!")
    print(f"Results location: {drive_path}")
    print(f"Symlink: {results_link} -> {drive_path}")

In [None]:
# Remove the scratch directory (and the marker file inside it).
shutil.rmtree(test_dir)
print("Cleaned up test files")

## Python and CUDA environment

Must be connected to a GPU runtime. Sets up the Python 3.10 environment, JAX and CUDA libraries to coordinate with GPU usage. **Has to be run before any experiments.**

In [None]:
# Install Python 3.10 side-by-side
# -O pins the output name: without it wget saves a repeat download as
# py310.sh.1 and the installer below would keep running the first (possibly
# partial) copy whenever this cell is re-run.
subprocess.check_call([
    "wget", "-q", "-O", "py310.sh",
    "https://github.com/korakot/kora/releases/download/v0.10/py310.sh",
])
# Run the downloaded installer with /usr/local as its prefix.
subprocess.check_call(["bash","./py310.sh","-b","-f","-p","/usr/local"])
# Fail fast if the python3.10 executable is not on PATH.
subprocess.check_call(["python3.10","-V"])

In [None]:
# Install dependencies
# Always go through `python3.10 -m pip` so packages land in the 3.10
# interpreter, not the notebook kernel's.
pip_install = ["python3.10", "-m", "pip", "install"]
subprocess.check_call([*pip_install, "-U", "pip"])
subprocess.check_call([*pip_install, "-r", "requirements.txt"])

In [None]:
# Set up JAX (CUDA) + NumPyro to ensure compatibility
py310_pip = ["python3.10", "-m", "pip"]
jax_wheel_index = "https://storage.googleapis.com/jax-releases/jax_cuda_releases.html"

# Best-effort removal of any existing jax/jaxlib; the return code is not checked.
subprocess.run([*py310_pip, "uninstall", "-y", "jax", "jaxlib"])
# Install the pinned CUDA 12 build of JAX from the JAX wheel index.
subprocess.check_call(
    [*py310_pip, "install", "-U", "jax[cuda12_pip]==0.4.20", "-f", jax_wheel_index]
)
subprocess.check_call([*py310_pip, "install", "numpyro==0.13.2"])

In [None]:
# NVIDIA CUDA libraries into the *py310* site-packages
# Requirement specifiers are passed as separate argv entries (no shell), so
# the `>`/`<` characters need no quoting.
cuda_wheel_specs = [
    "nvidia-cudnn-cu12>=8.9,<9",
    "nvidia-cublas-cu12>=12.2",
    "nvidia-cuda-runtime-cu12>=12.2",
    "nvidia-cusolver-cu12>=11.4",
    "nvidia-cusparse-cu12>=12.1",
    "nvidia-cufft-cu12>=11.0",
    "nvidia-cuda-cupti-cu12>=12.2",
    "nvidia-nvjitlink-cu12>=12.2",
    "nvidia-nccl-cu12>=2.18",
]
subprocess.check_call(["python3.10", "-m", "pip", "install", "-q", *cuda_wheel_specs])

In [29]:
# Build LD_LIBRARY_PATH for those wheels and write a launcher
# Ask the python3.10 interpreter (not the notebook kernel) for its site-packages.
py310_site = subprocess.check_output(
    ["python3.10","-c","import site; print(site.getsitepackages()[0])"],
    text=True
).strip()
subdirs = ["cudnn/lib","cublas/lib","cufft/lib","cusolver/lib","cusparse/lib",
           "cuda_runtime/lib","cuda_cupti/lib","nvjitlink/lib","nccl/lib"]
lib_paths = [os.path.join(py310_site,"nvidia",d) for d in subdirs]
# Keep only the library directories that actually exist.
lib_paths = [p for p in lib_paths if os.path.isdir(p)]
LD = ":".join(lib_paths)

if LD:
    # ${VAR:+:$VAR} appends the inherited value only when it is non-empty, so
    # the result never contains an empty entry (the dynamic loader treats an
    # empty LD_LIBRARY_PATH entry as the current working directory).
    ld_export = f'export LD_LIBRARY_PATH="{LD}${{LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}}"'
else:
    # Nothing to prepend: leave LD_LIBRARY_PATH untouched and make the gap visible.
    print(f"WARNING: no NVIDIA library directories found under {py310_site}/nvidia; "
          "the launcher will not add any CUDA library paths")
    ld_export = "# no NVIDIA wheel library directories were found when this launcher was generated"

wrapper = "/usr/local/bin/py310cuda"
wrapper_lines = [
    "#!/bin/bash",
    ld_export,
    "export XLA_PYTHON_CLIENT_PREALLOCATE=false",
    "export XLA_PYTHON_CLIENT_MEM_FRACTION=0.70",
    "export JAX_PLATFORM_NAME=gpu",
    'exec python3.10 "$@"',
]
pathlib.Path(wrapper).write_text("\n".join(wrapper_lines) + "\n")
# Add the owner-execute bit so the launcher can be invoked by name.
os.chmod(wrapper, os.stat(wrapper).st_mode | stat.S_IEXEC)

In [None]:
# Verify JAX sees the GPU
# jax.default_backend() is the public accessor for the platform name; it
# replaces the internal jax.lib.xla_bridge.get_backend().platform lookup.
subprocess.check_call([
    "py310cuda","-c",
    "import jax; print('backend:', jax.default_backend(), '| devices:', jax.devices())"
])

In [None]:
# Double check
!py310cuda -c "import jax; print('backend:', jax.lib.xla_bridge.get_backend().platform, '| devices:', jax.devices())"

---

# Training the model

Call `!py310cuda run_analysis.py` with the flag `--device gpu`.

Run ```!py310cuda run_analysis.py --help || py310cuda run_analysis.py -h``` for detailed information on other available flags.

In [None]:
# Bring the checkout in the current working directory up to date.
!git pull

In [None]:
# Remove any `results` entry in the current working directory (return code not checked).
# NOTE(review): the Drive-backed link created during setup is "../results", not
# "./results" — confirm this is the intended target.
subprocess.run(["rm","-rf","results"])

In [None]:
# List the current working directory.
!ls

In [None]:
# Lightweight smoke test for synthetic data: small settings (100 samples,
# 50 warmup, 1 chain, 1 run) on the GPU with a fixed seed.
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 3 \
  --num-samples 100 \
  --num-warmup 50 \
  --num-chains 1 \
  --num-runs 1 \
  --percW 33 \
  --seed 42 \
  --device gpu

In [None]:
# Preprocessing smoke test: small qmap_pd run with preprocessing switched on.
# NOTE(review): this cell spells the preprocessing flags with underscores
# (--enable_preprocessing, --feature_selection, --n_top_features) while the
# "Advanced Analysis" cells use hyphens (--enable-preprocessing, ...).
# Confirm which spelling run_analysis.py accepts.
!py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 3 \
      --num-samples 100 \
      --num-warmup 50 \
      --num-chains 1 \
      --num-runs 1 \
      --enable_preprocessing \
      --feature_selection variance \
      --n_top_features 100 \
      --device gpu \
      --seed 42

In [None]:
# Cross-validation smoke test: small synthetic run with 3 folds.
# NOTE(review): this cell uses --cv_only / --cv_folds (underscores) while the
# "Cross-validation demonstration" cell uses --run-cv / --cv-folds (hyphens).
# Confirm which flags run_analysis.py accepts.
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 3 \
  --num-samples 50 \
  --num-warmup 25 \
  --num-chains 1 \
  --cv_only \
  --cv_folds 3 \
  --device gpu \
  --seed 42

In [None]:
# Delete one specific output folder under ../results/qmap_pd.
# NOTE(review): ../results is presumably the symlink to Google Drive created
# during setup, in which case this deletes the folder from Drive — confirm.
!rm -rf ../results/qmap_pd/sparseGFA_K3_1chs_pW33_s100_reghsZ/

In [None]:
# Lightweight smoke test for qMAP-PD data: same small settings as the
# synthetic smoke test, without --percW or --seed.
!py310cuda run_analysis.py \
  --dataset qmap_pd \
  --K 3 \
  --num-samples 100 \
  --num-warmup 50 \
  --num-chains 1 \
  --num-runs 1 \
  --device gpu

In [None]:
# Factor recovery demonstration: longer synthetic run (1000 samples,
# 500 warmup, 2 chains, 3 runs) with a fixed seed.
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 5 \
  --num-samples 1000 \
  --num-warmup 500 \
  --num-chains 2 \
  --num-runs 3 \
  --percW 33 \
  --device gpu \
  --seed 42

In [None]:
# Sparsity level comparison: run the synthetic analysis once per --percW value
# and report which runs succeeded.
sparsity_levels = [25, 50, 75]
results_summary = []

for percW in sparsity_levels:
    print(f"\n Running with {percW}% sparsity...")

    # Output is captured to keep the cell short; on failure the tail of
    # stderr is replayed below so the cause is not silently discarded.
    result = subprocess.run([
        "py310cuda", "run_analysis.py",
        "--dataset", "synthetic",
        "--K", "5",
        "--num-samples", "800",
        "--num-warmup", "400",
        "--num-chains", "2",
        "--num-runs", "2",
        "--percW", str(percW),
        "--device", "gpu",
        "--seed", "42"
    ], capture_output=True, text=True)

    if result.returncode == 0:
        print(f"{percW}% sparsity completed successfully")
        results_summary.append(f"{percW}% sparsity: Success")
    else:
        print(f"{percW}% sparsity failed (exit code {result.returncode})")
        stderr_tail = result.stderr.strip().splitlines()[-20:]
        if stderr_tail:
            print("   Last lines of stderr:")
            for err_line in stderr_tail:
                print(f"   {err_line}")
        results_summary.append(f"{percW}% sparsity: Failed")

print("\n Sparsity Comparison Summary:")
# A separate loop name avoids overwriting the last CompletedProcess in `result`.
for summary_line in results_summary:
    print(f"   {summary_line}")
print(" Check ../results/synthetic/ folders for comparison plots")


In [None]:
# Basic qMAP-PD analysis
try:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 15 \
      --num-samples 2000 \
      --num-warmup 1000 \
      --num-chains 3 \
      --num-runs 3 \
      --percW 33 \
      --device gpu \
      --seed 42

    print("\n Basic qMAP-PD analysis completed!")
    print("Check ../results/qmap_pd/ for neuroimaging factor analysis results")

except Exception as e:
    print(f"\n qMAP-PD analysis failed: {e}")
    print("This might be due to missing qMAP-PD dataset")

In [None]:
# Cross-validation demonstration: synthetic run with 5 folds.
# NOTE(review): this cell uses --run-cv / --cv-folds (hyphens) while the
# "Cross-validation smoke test" cell uses --cv_only / --cv_folds (underscores).
# Confirm which flags run_analysis.py accepts.
!py310cuda run_analysis.py \
  --dataset synthetic \
  --K 5 \
  --num-samples 800 \
  --num-warmup 400 \
  --num-chains 2 \
  --run-cv \
  --cv-folds 5 \
  --device gpu \
  --seed 42

## Advanced Analysis

These experiments are computationally intensive and combine multiple advanced machine learning methods.

In [None]:
# qMAP-PD analysis with advanced preprocessing
proceed = input("This is a computationally intensive analysis. Proceed? (y/N): ")
if proceed.lower() in ['y', 'yes']:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 20 \
      --num-samples 3000 \
      --num-warmup 1500 \
      --num-chains 4 \
      --num-runs 5 \
      --percW 33 \
      --enable-preprocessing \
      --feature-selection statistical \
      --n-top-features 500 \
      --imputation-strategy knn \
      --optimize-preprocessing \
      --cross-validate-sources \
      --device gpu \
      --seed 42

    print("\n Advanced preprocessing analysis completed!")
    print(" Results include preprocessing optimization reports")
else:
    print("Skipping intensive preprocessing analysis.")

In [None]:
# Nested cross-validation
proceed = input("This is the most computationally intensive analysis. Proceed? (y/N): ")
if proceed.lower() in ['y', 'yes']:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 20 \
      --num-samples 2000 \
      --num-warmup 1000 \
      --num-chains 3 \
      --nested-cv \
      --cv-folds 5 \
      --enable-preprocessing \
      --feature-selection combined \
      --optimize-preprocessing \
      --create-factor-maps \
      --create-comprehensive-viz \
      --device gpu \
      --seed 42

    print("\n Nested cross-validation analysis completed!")
    print(" Complete validation results with hyperparameter optimization")
    print(" Factor-to-brain mapping results generated")
else:
    print("Skipping nested cross-validation analysis.")

In [None]:
# Full comprehensive pipeline
proceed = input("This is the complete analysis pipeline. Proceed? (y/N): ")
if proceed.lower() in ['y', 'yes']:
    !py310cuda run_analysis.py \
      --dataset qmap_pd \
      --K 25 \
      --num-samples 5000 \
      --num-warmup 2500 \
      --num-chains 4 \
      --num-runs 5 \
      --percW 33 \
      --enable-preprocessing \
      --feature-selection combined \
      --n-top-features 1000 \
      --imputation-strategy iterative \
      --optimize-preprocessing \
      --cross-validate-sources \
      --run-cv \
      --cv-folds 7 \
      --create-factor-maps \
      --create-comprehensive-viz \
      --device gpu \
      --seed 42

    print("\n Comprehensive analysis completed!")
    print(" Results generated")
else:
    print("Skipping comprehensive analysis.")