Skip to content

Commit

Permalink
Merge pull request #841 from nextstrain/separate-indexer
Browse files Browse the repository at this point in the history
Separate indexer
  • Loading branch information
joverlee521 committed May 16, 2024
2 parents d532048 + 24cd641 commit 359f0db
Show file tree
Hide file tree
Showing 11 changed files with 240 additions and 15 deletions.
20 changes: 18 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,32 @@ jobs:
name: test-logs
path: test/server.log

deploy:
index-resources:
if: |2
github.repository == 'nextstrain/nextstrain.org'
&& github.event_name == 'push'
&& github.ref == 'refs/heads/master'
needs: test
permissions:
id-token: write # needed to interact with GitHub's OIDC Token endpoint
contents: read
uses: ./.github/workflows/index-resources.yml

deploy:
if: |2
!cancelled()
&& needs.build.result == 'success'
&& needs.test.result == 'success'
&& contains(fromJSON('["success", "skipped"]'), needs.index-resources.result)
&& github.repository == 'nextstrain/nextstrain.org'
&& ( (github.event_name == 'push' && github.ref == 'refs/heads/master')
|| (github.event_name == 'workflow_dispatch' && inputs.heroku-app) )
# Wait for "build" and "test" jobs above to pass.
# Wait for "build", "test", and "index-resources" jobs above to pass.
needs:
- build
- test
- index-resources

# Only one "deploy" job per Heroku app at a time.
concurrency: deploy:${{ inputs.heroku-app || 'nextstrain-canary' }}
Expand Down
42 changes: 41 additions & 1 deletion .github/workflows/index-resources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,45 @@ on:
# Manually triggered using GitHub's UI
workflow_dispatch:

workflow_call:


defaults:
run:
# This is the same as GitHub Action's `bash` keyword as of 20 June 2023:
# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell
#
# Completely spelling it out here so that GitHub can't change it out from under us
# and we don't have to refer to the docs to know the expected behavior.
shell: bash --noprofile --norc -eo pipefail {0}

jobs:
build-ref-matrix:
runs-on: ubuntu-latest
outputs:
ref-matrix: ${{ steps.ref-matrix.outputs.ref-matrix }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version-file: 'package.json'
- id: ref-matrix
name: Create ref matrix
env:
HEROKU_TOKEN: ${{ secrets.HEROKU_TOKEN_READ_PROTECTED }}
run: |
echo "ref-matrix=$(./scripts/get-resource-index-ref-matrix)" | tee -a "$GITHUB_OUTPUT"
rebuild-index:
needs: [build-ref-matrix]
strategy:
matrix:
include: ${{ fromJson(needs.build-ref-matrix.outputs.ref-matrix) }}
# Only allow one run of the job per resource index
concurrency:
group: ${{ github.workflow }}-${{ matrix.resource_index }}
env:
RESOURCE_INDEX: ${{ matrix.resource_index }}
runs-on: ubuntu-latest
permissions:
id-token: write # needed to interact with GitHub's OIDC Token endpoint
Expand All @@ -21,6 +58,8 @@ jobs:
shell: bash
steps:
- uses: actions/checkout@v4
with:
ref: ${{ matrix.ref }}
- uses: actions/setup-node@v4
with:
node-version-file: 'package.json'
Expand All @@ -35,5 +74,6 @@ jobs:
--gzip --output resources.json.gz \
--resourceTypes dataset --collections core staging
- name: Upload the new index, overwriting the existing index
if: ${{ startsWith(env.RESOURCE_INDEX, 's3://') }}
run: |
aws s3 cp resources.json.gz s3://nextstrain-inventories/resources.json.gz
aws s3 cp resources.json.gz "$RESOURCE_INDEX"
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@
"s3:GetObject"
],
"Resource": [
"arn:aws:s3:::nextstrain-inventories/resources.json.gz"
"arn:aws:s3:::nextstrain-inventories/resources.json.gz",
"arn:aws:s3:::nextstrain-inventories/resources/*.json.gz"
]
}
]
Expand Down
3 changes: 2 additions & 1 deletion aws/iam/policy/NextstrainDotOrgServerInstance.tftpl.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
"s3:GetObject"
],
"Resource": [
"arn:aws:s3:::nextstrain-inventories/resources.json.gz"
"arn:aws:s3:::nextstrain-inventories/resources.json.gz",
"arn:aws:s3:::nextstrain-inventories/resources/*.json.gz"
]
}
]
Expand Down
47 changes: 39 additions & 8 deletions docs/resource-collection.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,42 @@ resources (and their versions) to the user.
The index location is set by the env/config variable ``RESOURCE_INDEX``. The
``RESOURCE_INDEX`` must be either a "s3://" address or a local file path. If the
file is located on S3 it must be gzipped. The server loads this index at start
time and refreshes it hourly. The nextstrain.org testing & production configs
currently set this to ``s3://nextstrain-inventories/resources.json.gz``.
time and refreshes it hourly. Resource collections can be ignored by the server
by setting the env variable ``RESOURCE_INDEX="false"`` or by omitting it from
your configuration JSON.

Resource collections can be ignored by the server by setting the env variable
``RESOURCE_INDEX="false"`` or by omitting it from your configuration JSON.

Resource index revisions
========================

The nextstrain.org testing & production configs currently set this to
``s3://nextstrain-inventories/resources/v<revision_number>.json.gz``.

If you make any update that changes the structure or the contents of the resource
index JSON, then bump the ``<revision_number>`` within the configs (``env/production/config.json``)
so that any uploads to S3 do not disrupt the production server.

These updates include changes to:

* any scripts within ``resourceIndexer/*``
* ``data/manifest_core.json``
* ``convertManifestJsonToAvailableDatasetList`` in ``src/endpoints/charon/parseManifest.js``
* ``datasetRedirectPatterns`` in ``src/redirects.js``

If you are ever unsure, it's better to just bump the revision number!

Testing new revisions
---------------------

New revision numbers in Heroku review apps currently still have to be handled
manually.

Once you've pushed up the changes to the revision number in the config, run
the `index-resources.yml workflow <https://github.com/nextstrain/nextstrain.org/actions/workflows/index-resources.yml>`__
using your branch.

After the new resource index JSON has been uploaded to S3, open the PR for
your branch and the Heroku review app should use the new revision.


Local index generation
Expand All @@ -36,7 +67,7 @@ This will create ``./devData/core.manifest.json`` and
``./devData/core.inventory.csv.gz`` for the core (nextstrain-data) source and
``./devData/staging.manifest.json`` and ``./devData/staging.inventory.csv.gz``
for the staging source. Alternately you can manually obtain a suitable manifest
and inventory from ``s3://nextstrain-inventories``
and inventory from ``s3://nextstrain-inventories``

To generate the index using these local files run the indexer with the ``--local`` flag.

Expand Down Expand Up @@ -115,15 +146,15 @@ certain prefixes in that bucket - for instance ``nextstrain-data/config-v1`` and
respectively.

To upload the index you will need write access for
s3://nextstrain-inventories/resources.json.gz. Note that if your aims are
``s3://nextstrain-inventories/resources/*.json.gz``. Note that if your aims are
limited to local development purposes this is not necessary (see `Local development`_).


Index backups
-------------

The ``nextstrain-inventories`` bucket is version enabled so past versions of
``s3://nextstrain-inventories/resources.json.gz`` are available.
``s3://nextstrain-inventories/resources/*.json.gz`` are available.

A lifecycle rule on the s3://nextstrain-inventories bucket (`console link
<https://s3.console.aws.amazon.com/s3/management/nextstrain-inventories/lifecycle/view?region=us-east-1&id=delete+old+versions+of+the+index>`__)
Expand All @@ -136,4 +167,4 @@ Index access by the server

IAM users ``nextstrain.org`` and ``nextstrain.org-testing``, which are under
terraform control, have read access to
s3://nextstrain-inventories/resources.json.gz via their associated policies.
``s3://nextstrain-inventories/resources/*.json.gz`` via their associated policies.
2 changes: 2 additions & 0 deletions env/production/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion env/production/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,5 +110,5 @@
"OIDC_GROUPS_CLAIM": "cognito:groups",
"SESSION_COOKIE_DOMAIN": "nextstrain.org",
"GROUPS_DATA_FILE": "groups.json",
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz"
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources/v1.json.gz"
}
2 changes: 2 additions & 0 deletions env/testing/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion env/testing/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,5 @@
"OIDC_USERNAME_CLAIM": "cognito:username",
"OIDC_GROUPS_CLAIM": "cognito:groups",
"GROUPS_DATA_FILE": "groups.json",
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz"
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources/v1.json.gz"
}
112 changes: 112 additions & 0 deletions scripts/get-resource-index-ref-matrix
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/bin/bash
# Builds a ref matrix of commit hashes and resource index URLs for the
# `index-resources.yml` GH Action workflow
#
# Uses the GITHUB_SHA if the workflow was called as a reusable workflow or
# if the workflow was manually run on a branch that is not the default branch.
# Otherwise, checks the Nextstrain production and canary Heroku apps to determine
# if they are using the same `config.RESOURCE_INDEX` and returns a JSON array
# of the commit hashes for the different RESOURCE_INDEX.
set -euo pipefail

: "${HEROKU_TOKEN:?The HEROKU_TOKEN environment variable is required.}"
: "${GITHUB_EVENT_NAME:?The GITHUB_EVENT_NAME environment variable is required.}"
: "${GITHUB_REF:?The GITHUB_REF environment variable is required.}"
: "${GITHUB_SHA:?The GITHUB_SHA environment variable is required.}"

: "${PROD_APP_NAME:=nextstrain-server}"
: "${CANARY_APP_NAME:=nextstrain-canary}"

# Entry point: prints the ref matrix on stdout as a compact JSON array of
# {"ref": <commit>, "resource_index": <index location>} objects.
main () {
    local ref_matrix resource_index

    # NOTE(review): GitHub's docs state that inside a reusable workflow the
    # `github` context is that of the *caller* workflow, so GITHUB_EVENT_NAME
    # may never actually be 'workflow_call' here — confirm that this branch is
    # reached for workflow_call runs.
    if [[ "$GITHUB_EVENT_NAME" == 'workflow_call' || \
          ("$GITHUB_EVENT_NAME" == 'workflow_dispatch' && "$GITHUB_REF" != 'refs/heads/master') ]]; then
        # This is the commit SHA that triggered the workflow.
        # For the workflow_call, this is in the context of the calling workflow.
        # <https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows>
        #
        # The lookup gets its own assignment (rather than being nested,
        # unquoted, in the jq argument list) so that a failure aborts the
        # script via `set -e` and the value is never word-split.
        resource_index=$(get_resource_index_at_commit "$GITHUB_SHA")

        # Pass the SHA with --arg so this works even if GITHUB_SHA is set but
        # not exported to child processes.
        ref_matrix=$(jq -c --null-input \
            --arg REF "$GITHUB_SHA" \
            --arg RESOURCE_INDEX "$resource_index" \
            '[ {"ref": $REF, "resource_index": $RESOURCE_INDEX} ]')
    else
        ref_matrix=$(build_prod_and_canary_ref_matrix)
    fi

    echo "$ref_matrix"
}

# Prints a compact JSON array with one {"ref", "resource_index"} entry per
# distinct resource index used by the production and canary Heroku apps.
#
# Returns non-zero if any of the lookups fail.
build_prod_and_canary_ref_matrix() {
    local prod_commit prod_resource_index
    local canary_commit canary_resource_index

    # This function is invoked via command substitution, and Bash clears
    # errexit in command-substitution subshells, so `set -e` at the top of the
    # script does not protect these steps. Propagate failures explicitly.
    prod_commit=$(get_heroku_slug_commit "$PROD_APP_NAME") || return 1
    prod_resource_index=$(get_resource_index_at_commit "$prod_commit") || return 1

    canary_commit=$(get_heroku_slug_commit "$CANARY_APP_NAME") || return 1
    canary_resource_index=$(get_resource_index_at_commit "$canary_commit") || return 1

    # unique_by collapses the two entries into one when both apps use the
    # same resource index, so that index is only rebuilt once.
    jq -c --null-input \
        --arg PROD_COMMIT "$prod_commit" \
        --arg PROD_RESOURCE_INDEX "$prod_resource_index" \
        --arg CANARY_COMMIT "$canary_commit" \
        --arg CANARY_RESOURCE_INDEX "$canary_resource_index" \
        '[
            {"ref": $PROD_COMMIT, "resource_index": $PROD_RESOURCE_INDEX},
            {"ref": $CANARY_COMMIT, "resource_index": $CANARY_RESOURCE_INDEX}
        ]
        | unique_by(.resource_index)'
}

# Prints the commit hash recorded on the slug of the most recent release of a
# Heroku app.
#
# Arguments:
#   $1 - Heroku app name
# Environment:
#   HEROKU_TOKEN - bearer token used for the Heroku Platform API requests
# Returns non-zero (with a message on stderr) if either API request fails or
# the expected field is missing from the response.
get_heroku_slug_commit() {
    local app_name="$1"
    local slug_id commit_hash

    # Callers run this function inside a command substitution, where Bash
    # clears errexit, so each step must report its own failure. `jq -e` makes
    # a missing/null field an error instead of the literal string "null".
    #
    # The Range header asks for the single latest release (highest version).
    slug_id=$(curl https://api.heroku.com/apps/"$app_name"/releases \
            --fail --silent --show-error \
            -H "Accept: application/vnd.heroku+json; version=3" \
            -H "Authorization: Bearer $HEROKU_TOKEN" \
            -H "Range: version ..; order=desc,max=1" \
        | jq -er '.[0].slug.id') || {
        echo "Unable to determine the latest release slug for Heroku app '$app_name'." >&2
        return 1
    }

    commit_hash=$(curl https://api.heroku.com/apps/"$app_name"/slugs/"$slug_id" \
            --fail --silent --show-error \
            -H "Accept: application/vnd.heroku+json; version=3" \
            -H "Authorization: Bearer $HEROKU_TOKEN" \
        | jq -er '.commit') || {
        echo "Unable to determine the commit for slug '$slug_id' of Heroku app '$app_name'." >&2
        return 1
    }

    echo "$commit_hash"
}


# Prints the RESOURCE_INDEX configured in the nextstrain.org repo at a given
# commit.
#
# Downloads the repo tarball for the commit from the GitHub API into a
# temporary directory (removed by an EXIT trap) and reads the value from it.
#
# Arguments:
#   $1 - commit hash to inspect
# Returns non-zero if the commit hash is unusable or any step fails.
get_resource_index_at_commit() {
    local commit_hash="$1"
    local repo repo_archive resource_index

    # Refuse an empty or "null" ref up front, before it is used to build the
    # tarball URL, so a failed upstream lookup cannot be mistaken for a commit.
    if [[ -z "$commit_hash" || "$commit_hash" == 'null' ]]; then
        echo "A commit hash is required to look up the resource index." >&2
        return 1
    fi

    # Callers run this function inside a command substitution, where Bash
    # clears errexit, so failures are propagated explicitly below.
    repo="$(mktemp -dt "nextstrain-dot-org-repo-$commit_hash-XXXXXX")" || return 1
    repo_archive="$repo/nextstrain.org.tar.gz"

    # $repo is expanded now, while the trap is being defined, so the cleanup
    # does not depend on the local variable still being in scope at exit.
    trap "rm -rf '$repo'" EXIT

    curl -fsSL \
        -H "Accept: application/vnd.github+json" \
        -H "X-GitHub-Api-Version: 2022-11-28" \
        https://api.github.com/repos/nextstrain/nextstrain.org/tarball/"$commit_hash" \
        > "$repo_archive" || return 1

    # --strip-components=1 drops the archive's single top-level directory so
    # the repo contents land directly in $repo.
    tar xz --file="$repo_archive" \
        --strip-components=1 \
        -C "$repo" || return 1

    # This conditional is to continue to support servers that are still
    # using versions of this code that do not include the get-resource-index
    # script. We can remove it once all the servers have been updated.
    #   -Jover, 01 May 2024
    if [[ -f "$repo/scripts/get-resource-index.js" ]]; then
        npm ci --silent --prefix "$repo" || return 1
        resource_index=$(node "$repo"/scripts/get-resource-index.js) || return 1
    else
        resource_index=$(jq -r '.RESOURCE_INDEX' "$repo"/env/production/config.json) || return 1
    fi

    echo "$resource_index"
}


main "$@"
20 changes: 20 additions & 0 deletions scripts/get-resource-index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* Bare bones script to get the `RESOURCE_INDEX` from the config module
*
* This does not address the edge case of the config.RESOURCE_INDEX in the live
* server being different than the config.RESOURCE_INDEX in GitHub Actions, which can
* happen if we set the `RESOURCE_INDEX` envvar in Heroku.
*
* We can improve on this in the future by surfacing the deployment metadata
* from our servers via our own API endpoint as suggested by @tsibley in
* <https://github.com/nextstrain/nextstrain.org/pull/841#discussion_r1588227761>
*/

import { RESOURCE_INDEX } from '../src/config.js';


/**
 * Print the `RESOURCE_INDEX` value exported by the config module to stdout,
 * so that shell callers can capture it.
 */
const main = () => {
  console.log(RESOURCE_INDEX);
};

main();

0 comments on commit 359f0db

Please sign in to comment.