# Run LSTM aging model on Google Colab (GPU)

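This notebook runs the **Ageing in the Lung** LSTM model on Google Colab using a free GPU runtime. It can also be run locally; in that case the Colab-only steps (clone, upload, download) are skipped.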

**Important:** Before running anything:
1. **Runtime → Change runtime type → Hardware accelerator → GPU** (e.g. T4) → Save.  
   If you see "Device: cpu" or "CUDA available: False", the runtime is not set to GPU — change it and re-run from the top.
2. Run cells in order.
3. Upload `discovery_combined.h5ad` with the upload cell in section 2. Use the file picker: large files can arrive corrupted or incomplete when they are dragged and dropped.

## 1. Clone repo and install dependencies

In [2]:
# Work out whether this kernel is a Colab runtime and where the repo root is.
# Defines two notebook-wide names used by the later cells:
#   IN_COLAB  - True when google.colab can be imported
#   REPO_ROOT - checkout location on Colab, otherwise derived from the cwd
import os

try:
    import google.colab
except ImportError:
    # Local run: no Colab package available.
    IN_COLAB = False
    cwd = os.getcwd()
    if os.path.basename(cwd) == "notebooks":
        # Started from the notebooks/ folder -> the repo root is its parent.
        REPO_ROOT = os.path.dirname(cwd)
    else:
        REPO_ROOT = cwd
else:
    IN_COLAB = True
    REPO_ROOT = "/content/msl_aging_pipeline"

print("Colab:", IN_COLAB, "| REPO_ROOT:", REPO_ROOT)

Colab: False | REPO_ROOT: /home/melhajjar/ipython_notebooks/manuscript1_ageing_inthelung/msl_aging_pipeline


In [1]:
if IN_COLAB:
    !git clone https://github.com/mikalelh/manuscript1-Lung-Ageing-ML-Pipeline.git {REPO_ROOT} 2>/dev/null || (cd {REPO_ROOT} && git pull)
%cd {REPO_ROOT}

/bin/bash: line 0: cd: /content/msl_aging_pipeline: No such file or directory
[Errno 2] No such file or directory: '/content/msl_aging_pipeline'
/home/melhajjar/ipython_notebooks/manuscript1_ageing_inthelung/msl_aging_pipeline/notebooks


In [None]:
!pip3 install -q scanpy anndata pandas scikit-learn tqdm matplotlib
import torch
cuda_ok = torch.cuda.is_available()
print('CUDA available:', cuda_ok)
if cuda_ok:
    print('Device:', torch.cuda.get_device_name(0))
else:
    print('WARNING: No GPU detected. For faster training: Runtime → Change runtime type → Hardware accelerator → GPU → Save, then re-run from the top.')

## 2. Add data: upload or Google Drive

In [None]:
# Option A: Upload discovery_combined.h5ad (Colab only; run this cell, then use the file picker)
# The chosen file is stored as data/discovery_combined.h5ad under REPO_ROOT,
# whatever its original name was. Requires IN_COLAB and REPO_ROOT.
import os
import shutil

data_dir = os.path.join(REPO_ROOT, "data")
os.makedirs(data_dir, exist_ok=True)
if IN_COLAB:
    from google.colab import files

    # Iterating the result yields the uploaded file names; the move below
    # relies on each file being present under that name in the working dir.
    uploaded = files.upload()
    names = list(uploaded)
    if not names:
        # Upload was cancelled or empty: do not claim that anything was saved.
        print("No file uploaded. Re-run this cell and choose discovery_combined.h5ad in the file picker.")
    else:
        # Only one file can become the dataset. Moving every upload onto the
        # same target would let the last one silently overwrite the others,
        # so pick a single file, preferring one with an .h5ad extension.
        chosen = next((n for n in names if n.lower().endswith(".h5ad")), names[0])
        shutil.move(chosen, os.path.join(data_dir, "discovery_combined.h5ad"))
        print("Saved to data/discovery_combined.h5ad")
        extras = [n for n in names if n != chosen]
        if extras:
            print("Ignored additional uploads (left in the working directory):", extras)
else:
    print("Local run: put discovery_combined.h5ad in", data_dir)

In [None]:
# Make sure discovery_combined.h5ad ends up in data/ (so the script finds it).
# A copy found in the repo root is copied into data/, overwriting any file
# already there; afterwards the cell reports whether data/ holds the file.
import os
import shutil

h5ad_name = "discovery_combined.h5ad"
data_dir = os.path.join(REPO_ROOT, "data")
os.makedirs(data_dir, exist_ok=True)

in_root = os.path.join(REPO_ROOT, h5ad_name)
target = os.path.join(data_dir, h5ad_name)

if os.path.isfile(in_root):
    # copy2 keeps file metadata (timestamps) along with the contents.
    shutil.copy2(in_root, target)
    print("Copied discovery_combined.h5ad from repo root → data/")

if not os.path.isfile(target):
    print("Put discovery_combined.h5ad in the repo root (or in data/), then re-run this cell.")
else:
    print("Ready: data/discovery_combined.h5ad")

In [None]:
# Option B: Use a file already on Google Drive (uncomment and set path)
# Colab only (uses google.colab.drive). Replace 'path/to' in the source path
# with the file's real location on your Drive. The destination is the data/
# folder of the Colab checkout (/content/msl_aging_pipeline), which must
# already exist - the data cells above create it.
# from google.colab import drive
# drive.mount('/content/drive')
# import shutil
# shutil.copy('/content/drive/MyDrive/path/to/discovery_combined.h5ad', '/content/msl_aging_pipeline/data/discovery_combined.h5ad')
# print('Copied from Drive')

In [None]:
# Validate discovery h5ad before running LSTM (avoids "file signature not found" later)
# Checks that data/discovery_combined.h5ad exists under REPO_ROOT, prints its
# size, and tries to read it with scanpy. Known HDF5 "cannot open" errors are
# turned into a re-upload hint; any other OSError is re-raised unchanged.
import os

data_dir = os.path.join(REPO_ROOT, "data")
path = os.path.join(data_dir, "discovery_combined.h5ad")
if not os.path.isfile(path):
    print("ERROR: File not found:", path)
    print("Upload discovery_combined.h5ad (Option A cell) or copy from repo root/Drive, then re-run the data cells.")
else:
    size_mb = os.path.getsize(path) / (1024 * 1024)
    print(f"File size: {size_mb:.1f} MB")
    try:
        import scanpy as sc

        adata = sc.read_h5ad(path)
        print("OK: Valid h5ad. Cells:", adata.n_obs, "| Genes:", adata.n_vars)
        # The object was only needed for this check. Training runs in a
        # separate process (!python ...), so drop the reference instead of
        # keeping the whole dataset in the notebook kernel's memory.
        del adata
    except OSError as e:
        # Messages raised when the file is not a readable HDF5 container,
        # e.g. after a truncated or interrupted upload.
        if "file signature not found" in str(e) or "Unable to synchronously open" in str(e):
            print("ERROR: File is corrupted or incomplete (not a valid HDF5/h5ad).")
            print("Re-upload the full discovery_combined.h5ad using the upload cell (file picker), then re-run this cell.")
        else:
            raise

## 3. Run LSTM training

In [None]:
# Switch to the repo root, then launch the training script as a subprocess.
# NOTE(review): the script presumably reads data/discovery_combined.h5ad and
# writes results/ and figures/ relative to this directory - confirm in
# models/lstm_aging_model.py.
%cd {REPO_ROOT}
!python models/lstm_aging_model.py

## 4. Download results (optional)

In [None]:
import os
!cd {REPO_ROOT} && zip -r /tmp/lstm_results.zip results/ figures/ 2>/dev/null
if IN_COLAB:
    from google.colab import files
    files.download("/tmp/lstm_results.zip")
else:
    print("Results in", REPO_ROOT, "- results/ and figures/")