diff --git a/.dockerignore b/.dockerignore
index a17ab263..fc992f14 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -70,3 +70,5 @@ exp/
 templates/
 scripts/
 README.md
+.claude/
+.pr-body.md
diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
new file mode 100644
index 00000000..94e6fc86
--- /dev/null
+++ b/.github/runners/Dockerfile
@@ -0,0 +1,64 @@
+# syntax=docker/dockerfile:1
+#
+# FlowMesh self-hosted GitHub Actions runner image.
+#
+# Based on myoung34/github-runner with the uv wheel cache pre-populated
+# for the project's --all-extras dep set. CI workflows that run
+# ``uv sync --all-extras --frozen`` warm-hit this cache instead of
+# re-downloading every wheel on each run.
+#
+# Build context MUST be the repository root (FlowMesh is a private repo;
+# we read the source from the build context rather than git-cloning at
+# build time to avoid auth at build time). BuildKit is required because
+# the sync step uses ``RUN --mount``:
+#
+#   cd /path/to/flowmesh-checkout
+#   git checkout main && git pull
+#   docker build \
+#     --build-arg UV_VERSION=0.11.8 \
+#     -t flowmesh-oss-ci-runner:0.11.8-$(date +%Y%m%d) \
+#     -t flowmesh-oss-ci-runner:latest \
+#     -f .github/runners/Dockerfile \
+#     .
+#
+# Refresh cadence:
+#   - uv.lock changed on main → rebuild to refresh the cache.
+#   - UV_VERSION bumped → rebuild AND update setup-uv's ``version:``
+#     input in the workflows to match (see .github/runners/README.md).
+#
+# A stale cache is slower, not unsafe — uv falls through to network
+# for any wheel not already cached in the image.
+
+# Upstream runner image tag. Defaults to the moving ``latest`` tag;
+# pass ``--build-arg RUNNER_TAG=<tag>`` to pin a specific upstream
+# release for a reproducible build.
+ARG RUNNER_TAG=latest
+FROM myoung34/github-runner:${RUNNER_TAG}
+
+# Pinned uv version. Workflows MUST pin setup-uv@v7's ``version:``
+# input to the same value or the cache layout written here may not
+# match what CI reads back.
+ARG UV_VERSION=0.11.8
+ENV UV_CACHE_DIR=/root/.cache/uv
+
+# Install pinned uv into a shared location. The installer is downloaded
+# to a file first instead of being piped into sh: without pipefail a
+# failed download in ``curl | sh`` would not fail the step on its own.
+RUN curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" \
+        -o /tmp/uv-install.sh \
+    && UV_INSTALL_DIR=/usr/local/bin sh /tmp/uv-install.sh \
+    && rm -f /tmp/uv-install.sh \
+    && uv --version
+
+# Pre-populate the uv cache by sync'ing the project's full --all-extras
+# dep set against the FlowMesh source in the build context. The source
+# is bind-mounted for this single step rather than COPY'd: a COPY layer
+# would keep the private source tree in the image's layers even if a
+# later RUN deleted it. ``rw`` lets uv create its temporary .venv inside
+# the mount; writes to a bind mount are discarded when the step ends, so
+# only $UV_CACHE_DIR survives in the final image. UV_LINK_MODE=copy
+# avoids uv's hardlink-fallback warning in case the cache and the
+# mounted project are on different filesystems.
+RUN --mount=type=bind,source=.,target=/tmp/flowmesh,rw \
+    cd /tmp/flowmesh \
+    && UV_LINK_MODE=copy uv sync --all-extras --frozen
diff --git a/.github/runners/README.md b/.github/runners/README.md
new file mode 100644
index 00000000..e2612e6c
--- /dev/null
+++ b/.github/runners/README.md
@@ -0,0 +1,65 @@
+# FlowMesh self-hosted runner image
+
+`Dockerfile` here builds a custom GitHub Actions runner image based on `myoung34/github-runner` with the uv wheel cache pre-populated for the project's `--all-extras` install. CI workflows that run `uv sync --all-extras --frozen` warm-hit this cache instead of re-downloading every wheel.
+
+## Build
+
+Build context is the repo root, so run from the top of your FlowMesh checkout:
+
+```bash
+git checkout main && git pull
+docker build \
+  --build-arg UV_VERSION=0.11.8 \
+  -t flowmesh-oss-ci-runner:0.11.8-$(date +%Y%m%d) \
+  -t flowmesh-oss-ci-runner:latest \
+  -f .github/runners/Dockerfile \
+  .
+```
+
+The two tags give a date-stamped reference for rollback plus a moving `:latest` for the runner systemd units. Build it on a `linux/amd64` host so the cached wheels are platform-correct for the runners (which are also `linux/amd64`). The Dockerfile uses `RUN --mount`, so build with BuildKit; pass `--build-arg RUNNER_TAG=<tag>` to pin the upstream `myoung34/github-runner` release instead of the default `latest`.
+
+## Deploy to a runner host
+
+Update the runner systemd unit's `docker run` command to use the new image. Where you currently have:
+
+```
+ExecStart=/usr/bin/docker run --rm \
+    ... \
+    myoung34/github-runner:latest
+```
+
+change the image reference:
+
+```
+ExecStart=/usr/bin/docker run --rm \
+    ... \
+    flowmesh-oss-ci-runner:latest
+```
+
+Reload + restart the units:
+
+```bash
+sudo systemctl daemon-reload
+sudo systemctl restart 'flowmesh-oss-ci-cuda-runner@*.service'
+sudo systemctl restart 'flowmesh-oss-ci-gpu-runner@*.service'
+```
+
+## Refresh cadence
+
+Rebuild the image when:
+
+- **`uv.lock` changed on `main`.** New / updated wheels won't be in the image; CI will fall through to network for those. Not unsafe, just slower until the rebuild.
+- **`UV_VERSION` bumped.** uv's cache layout changes across minor versions; a mismatch between the image-time uv and CI-time uv (set by `setup-uv@v7`'s `version:` input) can leave the cache unused. After bumping, update both `--build-arg UV_VERSION=...` here AND every `setup-uv@v7` invocation in `.github/workflows/*.yml` to match.
+
+A stale cache is never unsafe — uv falls through to network for any wheel not in the image. The cost of staleness is install time, not correctness.
+
+## Workflow side
+
+Every `setup-uv@v7` invocation in CI must pin its `version:` to the same `UV_VERSION` the image was built with, and should keep `enable-cache: false` (the runner image already supplies the cache; GHA cache would override it):
+
+```yaml
+- uses: astral-sh/setup-uv@94527f2e458b27549849d47d273a16bec83a01e9 # v7
+  with:
+    version: "0.11.8"
+    enable-cache: false
+```