# Paper Reproduction Launcher (LED only)

This Colab-friendly notebook drives the existing CLI scripts in `summarization/` to run very small LED and Llama jobs. It uses the `train_last_100.json` / `valid_last_100.json` splits so we can exercise the full code paths quickly before launching the long runs described in the paper.

In [1]:
# @title Sync the local repository from Google Drive (no git clone)
from pathlib import Path
import os
import importlib.util

COLAB = importlib.util.find_spec("google.colab") is not None
if COLAB:
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive', force_remount=True)
    REPO_IN_DRIVE = Path('/content/drive/Othercomputers/My Mac/patient_summaries_with_llms/')  # @param {type:"string"}
    if not REPO_IN_DRIVE.exists():
        raise FileNotFoundError(f"Upload/sync the local repo to {REPO_IN_DRIVE} first.")
    TARGET_DIR = Path('/content/patient_summaries_with_llms')
    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    os.system(f"rsync --progress -a --delete '{REPO_IN_DRIVE}/summarization' '{TARGET_DIR}/'")
    os.system(f"rsync --progress -a --delete '{REPO_IN_DRIVE}/data' '{TARGET_DIR}/'")
    os.system(f"rsync --progress -a --delete '{REPO_IN_DRIVE}/requirements.txt' '{TARGET_DIR}/'")
    os.system(f"rsync --progress -a --delete '{REPO_IN_DRIVE}/requirements-llama.txt' '{TARGET_DIR}/'")
    %cd /content/patient_summaries_with_llms
else:
    print("Running outside Colab; using the current working directory.")
    TARGET_DIR = Path.cwd()

Mounted at /content/drive
/content/patient_summaries_with_llms


In [2]:
# @title Install base dependencies shared by LED + evaluation
# Installs the packages listed in requirements.txt (path is relative to the
# current working directory) via the %pip line magic; -q reduces pip's output.
%pip install -q -r requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.6/766.6 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m133.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

## LED smoke run (`summarization/run_summarization.py`)

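Smoke-tests the LED pipeline to ensure that tokenization, dataloading, and Trainer hooks work. As currently configured, the cell below runs prediction only (`--do_predict`): it loads the checkpoint stored in `data/led_4000_600_chars/` and generates summaries for `test_4000_600_chars_last_100.json`. The combined train/eval/predict mode (`--do_train --do_eval --do_predict`), which would train and evaluate `allenai/led-base-16384` on the last 100 training examples for one epoch, is commented out in the cell.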

In [3]:
# # @title LED-base on train_last_100 / valid_last_100
import os
from pathlib import Path

os.environ["WANDB_MODE"] = "offline"
DATA_DIR = Path("data/ann-pt-summ/1.0.1/mimic-iv-note-ext-di-bhc/dataset")

assert DATA_DIR.exists(), f"Missing dataset folder: {DATA_DIR}"

    # --do_train --do_eval --do_predict \
!python summarization/run_summarization.py \
    --model_name_or_path data/led_4000_600_chars/ \
    --test_file {DATA_DIR / 'test_4000_600_chars_last_100.json'} \
    --text_column text \
    --summary_column summary \
    --output_dir results/led_full_run_predict \
    --do_predict \
    --predict_with_generate \
    --per_device_eval_batch_size 1 \
    --max_source_length 4096 \
    --max_target_length 350 \
    --report_to none

2025-12-06 10:32:44.776377: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765017164.799561    4201 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765017164.806465    4201 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765017164.824469    4201 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765017164.824513    4201 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765017164.824516    4201 computation_placer.cc:177] computation placer alr