From 4c044eaec4b42ab8a21a6cb8d68f8382a5af4dad Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 13:38:34 -0700 Subject: [PATCH 1/9] ingest: Centralize rules for merging segment metadata All three data sources go through the same process of merging metadata per segment into a single metadata TSV. Deduplicate the rules and make sure we are merging the data the same way so that the final output metadata has the same format across all data sources. I suspect the process for merging the metadata will grow over time as we add QC checks, so I've pulled this out into a completely separate rules file. --- ingest/Snakefile | 6 ++++- ingest/build-configs/ncbi/rules/curate.smk | 19 ---------------- ingest/rules/ingest_andersen_lab.smk | 20 +---------------- ingest/rules/merge_segment_metadata.smk | 26 ++++++++++++++++++++++ ingest/rules/upload_from_fauna.smk | 23 +------------------ 5 files changed, 33 insertions(+), 61 deletions(-) create mode 100644 ingest/rules/merge_segment_metadata.smk diff --git a/ingest/Snakefile b/ingest/Snakefile index 6630803..35c3fea 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -3,8 +3,11 @@ path_to_fauna = '../fauna' # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "defaults/config.yaml" +SUPPORTED_DATA_SOURCES = ["fauna", "ncbi", "andersen-lab"] + wildcard_constraints: - segment = "|".join(config["segments"]) + segment = "|".join(config["segments"]), + data_source = "|".join(SUPPORTED_DATA_SOURCES) rule all: # As of 2024-05-16 the default ingest only ingests data from fauna @@ -20,6 +23,7 @@ rule upload_all: include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" +include: "rules/merge_segment_metadata.smk" # Allow users to import custom rules provided via the config. if "custom_rules" in config: diff --git a/ingest/build-configs/ncbi/rules/curate.smk b/ingest/build-configs/ncbi/rules/curate.smk index cafd961..0e025ce 100644 --- a/ingest/build-configs/ncbi/rules/curate.smk +++ b/ingest/build-configs/ncbi/rules/curate.smk @@ -148,22 +148,3 @@ rule subset_metadata: tsv-select -H -f {params.metadata_fields} \ {input.metadata} > {output.subset_metadata} """ - - -rule merge_ncbi_segment_metadata: - """ - Add a column "n_segments" which reports how many segments - have sequence data (no QC performed). 
- """ - input: - segments = expand("ncbi/data/metadata_{segment}.tsv", segment=config["ncbi_segments"]), - metadata = "ncbi/data/metadata_ha.tsv", - output: - metadata = "ncbi/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/rules/ingest_andersen_lab.smk index 023d547..51bbacd 100644 --- a/ingest/rules/ingest_andersen_lab.smk +++ b/ingest/rules/ingest_andersen_lab.smk @@ -99,7 +99,7 @@ rule match_metadata_and_segment_fasta: metadata = "andersen-lab/data/metadata.tsv", fasta = "andersen-lab/data/{segment}.fasta" output: - metadata = "andersen-lab/results/metadata_{segment}.tsv", + metadata = "andersen-lab/data/metadata_{segment}.tsv", fasta = "andersen-lab/results/sequences_{segment}.fasta" log: "andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt", @@ -118,21 +118,3 @@ rule match_metadata_and_segment_fasta: --output-seq-field sequence \ 2> {log} """ - -rule merge_andersen_segment_metadata: - """ - Add a column "n_segments" which reports how many segments - have sequence data (no QC performed). - """ - input: - segments = expand("andersen-lab/results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "andersen-lab/results/metadata_ha.tsv", - output: - metadata = "andersen-lab/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ diff --git a/ingest/rules/merge_segment_metadata.smk b/ingest/rules/merge_segment_metadata.smk new file mode 100644 index 0000000..d421167 --- /dev/null +++ b/ingest/rules/merge_segment_metadata.smk @@ -0,0 +1,26 @@ +""" +This part of the workflow handles how we merge the metadata for each segment +into a central metadata file. +""" + + +rule merge_segment_metadata: + """ + For each subtype's HA metadata file add a column "n_segments" which reports + how many segments have sequence data (no QC performed). This will force the + download & parsing of all segments for a given subtype. Note that this does + not currently consider the prescribed min lengths (see min_length function) + for each segment, but that would be a nice improvement. 
+ """ + input: + segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "{data_source}/data/metadata_ha.tsv", + output: + metadata = "{data_source}/results/metadata.tsv", + shell: + """ + python scripts/add_segment_counts.py \ + --segments {input.segments} \ + --metadata {input.metadata} \ + --output {output.metadata} + """ diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index d072738..c4dce80 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -26,7 +26,7 @@ rule parse_segment: sequences = "fauna/data/{segment}.fasta", output: sequences = "fauna/results/sequences_{segment}.fasta", - metadata = "fauna/results/metadata_{segment}.tsv", + metadata = "fauna/data/metadata_{segment}.tsv", params: fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" @@ -40,27 +40,6 @@ rule parse_segment: --prettify-fields {params.prettify_fields} """ -rule merge_segment_metadata: - """ - For each subtype's HA metadata file add a column "n_segments" which reports - how many segments have sequence data (no QC performed). This will force the - download & parsing of all segments for a given subtype. Note that this does - not currently consider the prescribed min lengths (see min_length function) - for each segment, but that would be a nice improvement. - """ - input: - segments = expand("fauna/results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "fauna/results/metadata_ha.tsv", - output: - metadata = "fauna/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ - rule upload_sequences: input: sequences="fauna/results/sequences_{segment}.fasta", From da5d9e05c7657133d7c577413946a8c53026f6e9 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 14:27:46 -0700 Subject: [PATCH 2/9] ingest: Separate upload rules from fauna ingest Use wildcards to make the upload rules data source agnostic. Nest each data source's wildcard name under the `s3_dst` config param so that it's easier to upload to different S3 URLs. This will allow us to run NCBI and Andersen lab ingests in parallel when we eventually want to join their data. --- ingest/Snakefile | 1 + .../build-configs/ncbi/defaults/config.yaml | 3 ++ ingest/defaults/config.yaml | 4 ++- ingest/rules/upload_from_fauna.smk | 30 ---------------- ingest/rules/upload_to_s3.smk | 35 +++++++++++++++++++ 5 files changed, 42 insertions(+), 31 deletions(-) create mode 100644 ingest/rules/upload_to_s3.smk diff --git a/ingest/Snakefile b/ingest/Snakefile index 35c3fea..3d04b62 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -24,6 +24,7 @@ rule upload_all: include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" include: "rules/merge_segment_metadata.smk" +include: "rules/upload_to_s3.smk" # Allow users to import custom rules provided via the config. 
if "custom_rules" in config: diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml index 299ecae..1049618 100644 --- a/ingest/build-configs/ncbi/defaults/config.yaml +++ b/ingest/build-configs/ncbi/defaults/config.yaml @@ -126,3 +126,6 @@ curate: - gisaid_clade - h5_clade - genbank_accession + +s3_dst: + ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index c46739f..1d2ebf5 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -8,4 +8,6 @@ segments: - mp - ns -s3_dst: "s3://nextstrain-data-private/files/workflows/avian-flu" +s3_dst: + fauna: s3://nextstrain-data-private/files/workflows/avian-flu + andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index c4dce80..e74ead4 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -39,33 +39,3 @@ rule parse_segment: --fields {params.fasta_fields} \ --prettify-fields {params.prettify_fields} """ - -rule upload_sequences: - input: - sequences="fauna/results/sequences_{segment}.fasta", - output: - flag=touch("fauna/s3/sequences_{segment}.done"), - params: - s3_dst=config["s3_dst"], - shell: - """ - zstd -c {input.sequences:q} \ - | aws s3 cp \ - - \ - {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst - """ - -rule upload_metadata: - input: - metadata="fauna/results/metadata.tsv", - output: - flag=touch("fauna/s3/metadata.done"), - params: - s3_dst=config["s3_dst"], - shell: - """ - zstd -c {input.metadata:q} \ - | aws s3 cp \ - - \ - {params.s3_dst:q}/metadata.tsv.zst - """ diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk new file mode 100644 index 0000000..67dceb7 --- /dev/null +++ b/ingest/rules/upload_to_s3.smk @@ -0,0 +1,35 @@ +""" +This part of the workflow handles uploading files to AWS S3. 
+""" + + +rule upload_sequences: + input: + sequences="{data_source}/results/sequences_{segment}.fasta", + output: + flag=touch("{data_source}/s3/sequences_{segment}.done"), + params: + s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source], + shell: + """ + zstd -c {input.sequences:q} \ + | aws s3 cp \ + - \ + {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst + """ + + +rule upload_metadata: + input: + metadata="{data_source}/results/metadata.tsv", + output: + flag=touch("{data_source}/s3/metadata.done"), + params: + s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source], + shell: + """ + zstd -c {input.metadata:q} \ + | aws s3 cp \ + - \ + {params.s3_dst:q}/metadata.tsv.zst + """ From f5b610c3f5f181ca1d0a1dd1d11a30c824be013d Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 15:17:55 -0700 Subject: [PATCH 3/9] ingest: Rename fauna rule file to reflect reality --- ingest/Snakefile | 2 +- ingest/rules/{upload_from_fauna.smk => ingest_fauna.smk} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename ingest/rules/{upload_from_fauna.smk => ingest_fauna.smk} (100%) diff --git a/ingest/Snakefile b/ingest/Snakefile index 3d04b62..1e3cc0d 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -21,7 +21,7 @@ rule upload_all: sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]), metadata="fauna/s3/metadata.done", -include: "rules/upload_from_fauna.smk" +include: "rules/ingest_fauna.smk" include: "rules/ingest_andersen_lab.smk" include: "rules/merge_segment_metadata.smk" include: "rules/upload_to_s3.smk" diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/ingest_fauna.smk similarity index 100% rename from ingest/rules/upload_from_fauna.smk rename to ingest/rules/ingest_fauna.smk From 04b6a95eb34cc3f7c11ae4beb0dc2eeac8043c6f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 16:08:50 -0700 Subject: [PATCH 4/9] ingest: Move Andersen Lab under build-configs/ncbi/ We are planning to run this in parallel with the NCBI ingest and eventually merge their data. They will also share more config params with each other than with the default fauna ingest. Andersen Lab is using NCBI SRA data, so I think it makes sense to be under the NCBI umbrella. --- ingest/README.md | 2 +- ingest/Snakefile | 1 - ingest/build-configs/ncbi/Snakefile | 10 ++++++++++ .../ncbi/bin/curate_andersen_lab_data} | 1 + ingest/build-configs/ncbi/defaults/config.yaml | 1 + .../ncbi}/rules/ingest_andersen_lab.smk | 2 +- ingest/defaults/config.yaml | 1 - 7 files changed, 14 insertions(+), 4 deletions(-) rename ingest/{scripts/curate_andersen_lab_data.py => build-configs/ncbi/bin/curate_andersen_lab_data} (99%) mode change 100644 => 100755 rename ingest/{ => build-configs/ncbi}/rules/ingest_andersen_lab.smk (98%) diff --git a/ingest/README.md b/ingest/README.md index a69d222..e85812b 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -60,7 +60,7 @@ Only run this workflow as needed to see the latest available data in the repo. It does not merge or deduplicate the data with the fauna data used in the default ingest workflow. ```sh -nextstrain build . merge_andersen_segment_metadata +nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml ``` The results will be available in `andersen-lab/results/`. 
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 1e3cc0d..eb3839c 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -22,7 +22,6 @@ rule upload_all:
         metadata="fauna/s3/metadata.done",
 
 include: "rules/ingest_fauna.smk"
-include: "rules/ingest_andersen_lab.smk"
 include: "rules/merge_segment_metadata.smk"
 include: "rules/upload_to_s3.smk"
 
diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index e3255d2..881a2e9 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -12,6 +12,16 @@ rule ingest_ncbi:
         ], segment=config["ncbi_segments"].keys()),
         "ncbi/results/metadata.tsv",
 
+
+rule ingest_andersen_lab:
+    input:
+        expand([
+            "andersen-lab/results/sequences_{segment}.fasta",
+        ], segment=config["segments"]),
+        "andersen-lab/results/metadata.tsv",
+
+
+# Include file paths are relative to this Snakefile
+include: "rules/ingest_andersen_lab.smk"
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
diff --git a/ingest/scripts/curate_andersen_lab_data.py b/ingest/build-configs/ncbi/bin/curate_andersen_lab_data
old mode 100644
new mode 100755
similarity index 99%
rename from ingest/scripts/curate_andersen_lab_data.py
rename to ingest/build-configs/ncbi/bin/curate_andersen_lab_data
index a7b62d6..5cc636a
--- a/ingest/scripts/curate_andersen_lab_data.py
+++ b/ingest/build-configs/ncbi/bin/curate_andersen_lab_data
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 Curate the metadata that originated from Andersen Lab's avian-influenza repo
 <https://github.com/andersen-lab/avian-influenza>.
diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml
index 1049618..ad7ee87 100644
--- a/ingest/build-configs/ncbi/defaults/config.yaml
+++ b/ingest/build-configs/ncbi/defaults/config.yaml
@@ -129,3 +129,4 @@ curate:
 
 s3_dst:
   ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
+  andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab
diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
similarity index 98%
rename from ingest/rules/ingest_andersen_lab.smk
rename to ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
index 51bbacd..25dfb51 100644
--- a/ingest/rules/ingest_andersen_lab.smk
+++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
@@ -83,7 +83,7 @@ rule curate_metadata:
         """
         augur curate normalize-strings \
             --metadata {input.metadata} \
-            | python3 ./scripts/curate_andersen_lab_data.py \
+            | ./build-configs/ncbi/bin/curate_andersen_lab_data \
             | ./vendored/apply-geolocation-rules \
                 --geolocation-rules {input.geolocation_rules} \
             | augur curate passthru \
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 1d2ebf5..cdbcfef 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -10,4 +10,3 @@ segments:
 
 s3_dst:
   fauna: s3://nextstrain-data-private/files/workflows/avian-flu
-  andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab

From d1ebb406810840c8925df0256a0bd33f956432d4 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 16:41:46 -0700
Subject: [PATCH 5/9] ingest/build-configs/ncbi: Use `segments` config param
 for default rule

Use the `segments` param to determine the default outputs for
`ingest_ncbi` to match the `merge_segment_metadata` rule.

Adds a sanity check that the requested segments are represented in the
`ncbi_segments` map.
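As an aside, a slightly friendlier variant of this sanity check could report exactly which requested segments are missing from the map — a hypothetical alternative, not what this patch implements:

```python
# Hypothetical variant of the patch's sanity check, run in the same Snakemake
# config context: report which requested segments are missing from the
# ncbi_segments map instead of failing with a bare AssertionError.
missing = [segment for segment in config["segments"] if segment not in config["ncbi_segments"]]
assert not missing, f"Segments missing from ncbi_segments: {missing}"
```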
---
 ingest/build-configs/ncbi/Snakefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index 881a2e9..14b209f 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -5,11 +5,15 @@ workflow and defines its default outputs.
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "build-configs/ncbi/defaults/config.yaml"
 
+# Sanity check that the requested segments match our ncbi_segments map
+assert all(segment in config["ncbi_segments"].keys() for segment in config["segments"])
+
+
 rule ingest_ncbi:
     input:
         expand([
             "ncbi/results/sequences_{segment}.fasta",
-        ], segment=config["ncbi_segments"].keys()),
+        ], segment=config["segments"]),
         "ncbi/results/metadata.tsv",
 

From 2c0cbcbb08c714f154dbd8af87ae5e53110aa1b4 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 16:54:01 -0700
Subject: [PATCH 6/9] ingest/build-configs/ncbi: Add target `upload_all_ncbi`

The new target `upload_all_ncbi` will run and upload all files for the
NCBI and Andersen lab ingests.

I didn't see a need for individual rules to upload each data source
(i.e. upload_ncbi and upload_andersen_lab) but they can be added in the
future as needed.
---
 ingest/build-configs/ncbi/Snakefile | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index 14b209f..1b2c62b 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -8,6 +8,7 @@ configfile: "build-configs/ncbi/defaults/config.yaml"
 # Sanity check that the requested segments match our ncbi_segments map
 assert all(segment in config["ncbi_segments"].keys() for segment in config["segments"])
 
+NCBI_DATA_SOURCES = ["ncbi", "andersen-lab"]
 
 rule ingest_ncbi:
     input:
@@ -25,6 +26,15 @@ rule ingest_andersen_lab:
         "andersen-lab/results/metadata.tsv",
 
 
+# Uploads all results for NCBI and Andersen Lab ingests
+rule upload_all_ncbi:
+    input:
+        expand([
+            "{data_source}/s3/sequences_{segment}.done",
+            "{data_source}/s3/metadata.done",
+        ], data_source=NCBI_DATA_SOURCES, segment=config["segments"]),
+
+
 # Include file paths are relative to this Snakefile
 include: "rules/ingest_andersen_lab.smk"
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"

From 1bfeb779bceb3305f4b94f7441f7562e3563c2d9 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 17:09:04 -0700
Subject: [PATCH 7/9] ingest/README: Update with instructions for public
 sources
---
 ingest/README.md | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/ingest/README.md b/ingest/README.md
index e85812b..5881825 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -12,14 +12,46 @@ This workflow requires the Nextstrain CLI's Docker runtime which includes [fauna
 > NOTE: All command examples assume you are within the `ingest` directory.
 > If running commands from the outer `avian-flu` directory, replace the `.` with `ingest`.
 
-### Ingest data from NCBI GenBank
+### Ingest and upload data from public sources to S3
+
+#### Ingest NCBI GenBank
 
 To download, parse and curate data from NCBI GenBank run the following command.
 
 ```sh
 nextstrain build . ingest_ncbi --configfile build-configs/ncbi/defaults/config.yaml
 ```
 
+This results in the files `metadata.tsv`, `sequences_ha.fasta`, etc. under `ncbi/results/`.
+
+#### Ingest from Andersen lab's avian-influenza repo
+
+Ingest publicly available consensus sequences and metadata from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
+Only run this workflow as needed to see the latest available data in the repo.
+It does not merge or deduplicate the data with the NCBI GenBank workflow.
+
+```sh
+nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml
+```
+
+The results will be available in `andersen-lab/results/`.
+
+#### Upload to S3
+
+To run both the NCBI GenBank and Andersen Lab ingests _and_ upload results to S3,
+run the following command:
+
+```sh
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    . \
+    upload_all_ncbi \
+    --configfile build-configs/ncbi/defaults/config.yaml
+```
+
+The workflow compresses the local files and uploads them to S3 at the corresponding paths
+under `s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi` and
+`s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab`.
 
 ### Ingest and upload data from fauna to S3
 
@@ -53,17 +85,6 @@ nextstrain build \
 . upload_all
 ```
 
-### Ingest from Andersen lab's avian-influenza repo
-
-Ingest publicly available consensus sequences and metadata from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
-Only run this workflow as needed to see the latest available data in the repo.
-It does not merge or deduplicate the data with the fauna data used in the default ingest workflow.
-
-```sh
-nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml
-```
-
-The results will be available in `andersen-lab/results/`.
 
 ## Configuration

From 600da7725c1660c2f204c40a82261d2253a10dce Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 15:35:59 -0700
Subject: [PATCH 8/9] ingest: Switch to vendored/upload-to-s3

Replace the `aws s3 cp` commands with the vendored/upload-to-s3 script
to use a couple of its built-in features:
- CloudFront invalidation
- add Metadata.sha256sum to be able to track file changes
---
 .../build-configs/ncbi/defaults/config.yaml |  2 ++
 ingest/rules/upload_to_s3.smk               | 24 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml
index ad7ee87..af71ec9 100644
--- a/ingest/build-configs/ncbi/defaults/config.yaml
+++ b/ingest/build-configs/ncbi/defaults/config.yaml
@@ -130,3 +130,5 @@ curate:
 s3_dst:
   ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
   andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab
+
+cloudfront_domain: data.nextstrain.org
diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk
index 67dceb7..5fb123c 100644
--- a/ingest/rules/upload_to_s3.smk
+++ b/ingest/rules/upload_to_s3.smk
@@ -7,15 +7,17 @@ rule upload_sequences:
     input:
         sequences="{data_source}/results/sequences_{segment}.fasta",
     output:
-        flag=touch("{data_source}/s3/sequences_{segment}.done"),
+        flag="{data_source}/s3/sequences_{segment}.done",
     params:
         s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
    shell:
         """
-        zstd -c {input.sequences:q} \
-            | aws s3 cp \
-                - \
-                {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.sequences:q} \
+            {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
         """
@@ -23,13 +25,15 @@ rule upload_metadata:
     input:
         metadata="{data_source}/results/metadata.tsv",
     output:
-        flag=touch("{data_source}/s3/metadata.done"),
+        flag="{data_source}/s3/metadata.done",
     params:
         s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
     shell:
         """
-        zstd -c {input.metadata:q} \
-            | aws s3 cp \
-                - \
-                {params.s3_dst:q}/metadata.tsv.zst
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.metadata:q} \
+            {params.s3_dst:q}/metadata.tsv.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
         """
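A side note on the sha256sum metadata mentioned above: once a file has been uploaded this way, the checksum can be read back from S3 to detect changes. A minimal sketch using boto3, assuming `vendored/upload-to-s3` stores the digest under a `sha256sum` metadata key as the commit message describes, and that the object below has already been uploaded:

```python
import boto3

# Read back the sha256sum metadata attached by vendored/upload-to-s3.
# Bucket and key follow the s3_dst values configured in this patch series.
s3 = boto3.client("s3")
head = s3.head_object(
    Bucket="nextstrain-data",
    Key="files/workflows/avian-flu/h5n1/ncbi/metadata.tsv.zst",
)
print(head["Metadata"].get("sha256sum"))
```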
From e86aa9731dd707a34c8581c446579c8623ce75ce Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 17:40:15 -0700
Subject: [PATCH 9/9] Update Snakefile.genome build to accept S3_SRC for
 public data
---
 README.md        | 9 +++++++--
 Snakefile.genome | 8 ++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ad756e4..63f0ebf 100755
--- a/README.md
+++ b/README.md
@@ -69,10 +69,15 @@ Specifically, the files needed are `ingest/results/metadata.tsv` and `ingest/res
 Run full genome builds with the following command.
 
 ``` bash
-nextstrain build . --snakefile Snakefile.genome --config local_ingest=True ingest_source=ncbi
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    . \
+    --snakefile Snakefile.genome \
+    --config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
 ```
 
-Currently this is only set up for the "h5n1-cattle-outbreak" build using locally ingested NCBI data,
+Currently this is only set up for the "h5n1-cattle-outbreak" build using NCBI data,
 and the build is restricted to a set of strains where we think there's no reassortment,
 with outgroups excluded in `config/dropped_strains_h5n1-cattle-outbreak.txt`.
 Output files will be placed in `results/h5n1-cattle-outbreak/genome`.
diff --git a/Snakefile.genome b/Snakefile.genome
index 97de351..bcd02d9 100644
--- a/Snakefile.genome
+++ b/Snakefile.genome
@@ -1,7 +1,11 @@
 include: "rules/common.smk"
 
-assert LOCAL_INGEST == True and INGEST_SOURCE == "ncbi", \
-    "Full genome build is only set up for local ingest from 'ncbi'."
+if LOCAL_INGEST:
+    assert INGEST_SOURCE == "ncbi", \
+        "Full genome build is only set up for local ingest from 'ncbi'."
+else:
+    assert S3_SRC.startswith("s3://nextstrain-data/"), \
+        "Full genome build is only set up for data from the public S3 bucket"
 
 import json
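For context on how `Snakefile.genome` can reference `LOCAL_INGEST`, `INGEST_SOURCE`, and `S3_SRC`: those constants come from `rules/common.smk`, which is not shown in this series. A rough, hypothetical sketch of how that file might map the `--config` options used in the README commands onto these constants (the config key names come from the commands above; the defaults are guesses):

```python
# Hypothetical sketch of rules/common.smk (not part of this patch series).
# Maps the --config options used in the README onto the constants that
# Snakefile.genome asserts against.
LOCAL_INGEST = bool(config.get("local_ingest", False))
INGEST_SOURCE = config.get("ingest_source", "fauna")
S3_SRC = config.get("s3_src", "")
```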