diff --git a/README.md b/README.md
index ad756e4..63f0ebf 100755
--- a/README.md
+++ b/README.md
@@ -69,10 +69,15 @@ Specifically, the files needed are `ingest/results/metadata.tsv` and `ingest/res
 Run full genome builds with the following command.

 ``` bash
-nextstrain build . --snakefile Snakefile.genome --config local_ingest=True ingest_source=ncbi
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    . \
+    --snakefile Snakefile.genome \
+    --config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
 ```

-Currently this is only set up for the "h5n1-cattle-outbreak" build using locally ingested NCBI data,
+Currently this is only set up for the "h5n1-cattle-outbreak" build using NCBI data,
 and the build is restricted to a set of strains where we think there's no reassortment,
 with outgroups excluded in (`config/dropped_strains_h5n1-cattle-outbreak.txt`).
 Output files will be placed in `results/h5n1-cattle-outbreak/genome`.
diff --git a/Snakefile.genome b/Snakefile.genome
index 97de351..bcd02d9 100644
--- a/Snakefile.genome
+++ b/Snakefile.genome
@@ -1,7 +1,11 @@
 include: "rules/common.smk"

-assert LOCAL_INGEST == True and INGEST_SOURCE == "ncbi", \
-    "Full genome build is only set up for local ingest from 'ncbi'."
+if LOCAL_INGEST:
+    assert INGEST_SOURCE == "ncbi", \
+        "Full genome build is only set up for local ingest from 'ncbi'."
+else:
+    assert S3_SRC.startswith("s3://nextstrain-data/"), \
+        "Full genome build is only set up for data from the public S3 bucket"


 import json
diff --git a/ingest/README.md b/ingest/README.md
index a69d222..5881825 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -12,14 +12,46 @@ This workflow requires the Nextstrain CLI's Docker runtime which includes [fauna
 > NOTE: All command examples assume you are within the `ingest` directory.
 > If running commands from the outer `avian-flu` directory, replace the `.` with `ingest`.

-### Ingest data from NCBI GenBank
+### Ingest and upload data from public sources to S3
+
+#### Ingest NCBI GenBank

 To download, parse and curate data from NCBI GenBank run the following command.

 ```sh
 nextstrain build . ingest_ncbi --configfile build-configs/ncbi/defaults/config.yaml
 ```

-This results in the files `metadata.tsv`, `sequences_ha.fasta`, etc... under `ingest/ncbi/results/`.
+This results in the files `metadata.tsv`, `sequences_ha.fasta`, etc... under `ncbi/results/`.
+
+#### Ingest from Andersen lab's avian-influenza repo
+
+Ingest publicly available consensus sequences and metadata from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
+Only run this workflow as needed to see the latest available data in the repo.
+It does not merge or deduplicate the data with the data from the NCBI GenBank workflow.
+
+```sh
+nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml
+```
+
+The results will be available in `andersen-lab/results/`.
+
+#### Upload to S3
+
+To run both NCBI GenBank and Andersen Lab ingests _and_ upload results to S3,
+run the following command:
+
+```sh
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    . \
+    upload_all_ncbi \
+    --configfile build-configs/ncbi/defaults/config.yaml
+```
+
+The workflow compresses and uploads the local files to S3 at corresponding paths
+under `s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi` and
+`s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab`.
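+
+As a quick check of what was uploaded, the compressed files can be pulled back
+down and decompressed (a minimal sketch, assuming the AWS CLI is configured and
+`zstd` is installed; the local output filename is arbitrary):
+
+```sh
+# Fetch the compressed NCBI metadata from the bucket and decompress it locally
+aws s3 cp s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi/metadata.tsv.zst - \
+  | zstd -cd > ncbi-metadata.tsv
+```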

 ### Ingest and upload data from fauna to S3
@@ -53,17 +85,6 @@ nextstrain build \
     . upload_all
 ```

-### Ingest from Andersen lab's avian-influenza repo
-
-Ingest publicly available consensus sequences and metadata from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
-Only run this workflow as needed to see the latest available data in the repo.
-It does not merge or deduplicate the data with the fauna data used in the default ingest workflow.
-
-```sh
-nextstrain build . merge_andersen_segment_metadata
-```
-
-The results will be available in `andersen-lab/results/`.


 ## Configuration
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 6630803..eb3839c 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -3,8 +3,11 @@ path_to_fauna = '../fauna'
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "defaults/config.yaml"

+SUPPORTED_DATA_SOURCES = ["fauna", "ncbi", "andersen-lab"]
+
 wildcard_constraints:
-    segment = "|".join(config["segments"])
+    segment = "|".join(config["segments"]),
+    data_source = "|".join(SUPPORTED_DATA_SOURCES)

 rule all:
     # As of 2024-05-16 the default ingest only ingests data from fauna
@@ -18,8 +21,9 @@ rule upload_all:
         sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]),
         metadata="fauna/s3/metadata.done",

-include: "rules/upload_from_fauna.smk"
-include: "rules/ingest_andersen_lab.smk"
+include: "rules/ingest_fauna.smk"
+include: "rules/merge_segment_metadata.smk"
+include: "rules/upload_to_s3.smk"

 # Allow users to import custom rules provided via the config.
 if "custom_rules" in config:
diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index e3255d2..1b2c62b 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -5,13 +5,37 @@ workflow and defines its default outputs.
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "build-configs/ncbi/defaults/config.yaml" +# Sanity check that the requested segments match our ncbi_segments map +assert all(segment in config["ncbi_segments"].keys() for segment in config["segments"]) + +NCBI_DATA_SOURCES = ["ncbi", "andersen-lab"] + rule ingest_ncbi: input: expand([ "ncbi/results/sequences_{segment}.fasta", - ], segment=config["ncbi_segments"].keys()), + ], segment=config["segments"]), "ncbi/results/metadata.tsv", + +rule ingest_andersen_lab: + input: + expand([ + "andersen-lab/results/sequences_{segment}.fasta", + ], segment=config["segments"]), + "andersen-lab/results/metadata.tsv", + + +# Uploads all results for NCBI and Andersen Lab ingests +rule upload_all_ncbi: + input: + expand([ + "{data_source}/s3/sequences_{segment}.done", + "{data_source}/s3/metadata.done", + ], data_source=NCBI_DATA_SOURCES, segment=config["segments"]), + + # Include file paths are relative this Snakefile +include: "rules/ingest_andersen_lab.smk" include: "rules/fetch_from_ncbi.smk" include: "rules/curate.smk" diff --git a/ingest/scripts/curate_andersen_lab_data.py b/ingest/build-configs/ncbi/bin/curate_andersen_lab_data old mode 100644 new mode 100755 similarity index 99% rename from ingest/scripts/curate_andersen_lab_data.py rename to ingest/build-configs/ncbi/bin/curate_andersen_lab_data index a7b62d6..5cc636a --- a/ingest/scripts/curate_andersen_lab_data.py +++ b/ingest/build-configs/ncbi/bin/curate_andersen_lab_data @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ Curate the metadata that originated from Andersen Lab's avian-influenza repo . diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml index 299ecae..af71ec9 100644 --- a/ingest/build-configs/ncbi/defaults/config.yaml +++ b/ingest/build-configs/ncbi/defaults/config.yaml @@ -126,3 +126,9 @@ curate: - gisaid_clade - h5_clade - genbank_accession + +s3_dst: + ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi + andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab + +cloudfront_domain: data.nextstrain.org diff --git a/ingest/build-configs/ncbi/rules/curate.smk b/ingest/build-configs/ncbi/rules/curate.smk index cafd961..0e025ce 100644 --- a/ingest/build-configs/ncbi/rules/curate.smk +++ b/ingest/build-configs/ncbi/rules/curate.smk @@ -148,22 +148,3 @@ rule subset_metadata: tsv-select -H -f {params.metadata_fields} \ {input.metadata} > {output.subset_metadata} """ - - -rule merge_ncbi_segment_metadata: - """ - Add a column "n_segments" which reports how many segments - have sequence data (no QC performed). 
- """ - input: - segments = expand("ncbi/data/metadata_{segment}.tsv", segment=config["ncbi_segments"]), - metadata = "ncbi/data/metadata_ha.tsv", - output: - metadata = "ncbi/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk similarity index 84% rename from ingest/rules/ingest_andersen_lab.smk rename to ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk index 023d547..25dfb51 100644 --- a/ingest/rules/ingest_andersen_lab.smk +++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk @@ -83,7 +83,7 @@ rule curate_metadata: """ augur curate normalize-strings \ --metadata {input.metadata} \ - | python3 ./scripts/curate_andersen_lab_data.py \ + | ./build-configs/ncbi/bin/curate_andersen_lab_data \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.geolocation_rules} \ | augur curate passthru \ @@ -99,7 +99,7 @@ rule match_metadata_and_segment_fasta: metadata = "andersen-lab/data/metadata.tsv", fasta = "andersen-lab/data/{segment}.fasta" output: - metadata = "andersen-lab/results/metadata_{segment}.tsv", + metadata = "andersen-lab/data/metadata_{segment}.tsv", fasta = "andersen-lab/results/sequences_{segment}.fasta" log: "andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt", @@ -118,21 +118,3 @@ rule match_metadata_and_segment_fasta: --output-seq-field sequence \ 2> {log} """ - -rule merge_andersen_segment_metadata: - """ - Add a column "n_segments" which reports how many segments - have sequence data (no QC performed). - """ - input: - segments = expand("andersen-lab/results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "andersen-lab/results/metadata_ha.tsv", - output: - metadata = "andersen-lab/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index c46739f..cdbcfef 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -8,4 +8,5 @@ segments: - mp - ns -s3_dst: "s3://nextstrain-data-private/files/workflows/avian-flu" +s3_dst: + fauna: s3://nextstrain-data-private/files/workflows/avian-flu diff --git a/ingest/rules/ingest_fauna.smk b/ingest/rules/ingest_fauna.smk new file mode 100644 index 0000000..e74ead4 --- /dev/null +++ b/ingest/rules/ingest_fauna.smk @@ -0,0 +1,41 @@ +from pathlib import Path + + +rule download_segment: + output: + sequences = "fauna/data/{segment}.fasta", + params: + fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", + output_dir = lambda wildcards, output: Path(output.sequences).parent, + output_fstem = lambda wildcards, output: Path(output.sequences).stem, + benchmark: + "fauna/benchmarks/download_segment_{segment}.txt" + shell: + """ + python3 {path_to_fauna}/vdb/download.py \ + --database vdb \ + --virus avian_flu \ + --fasta_fields {params.fasta_fields} \ + --select locus:{wildcards.segment} \ + --path {params.output_dir} \ + --fstem {params.output_fstem} + """ + +rule parse_segment: + input: + sequences = "fauna/data/{segment}.fasta", + output: + sequences = "fauna/results/sequences_{segment}.fasta", + metadata = 
"fauna/data/metadata_{segment}.tsv", + params: + fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", + prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" + shell: + """ + augur parse \ + --sequences {input.sequences} \ + --output-sequences {output.sequences} \ + --output-metadata {output.metadata} \ + --fields {params.fasta_fields} \ + --prettify-fields {params.prettify_fields} + """ diff --git a/ingest/rules/merge_segment_metadata.smk b/ingest/rules/merge_segment_metadata.smk new file mode 100644 index 0000000..d421167 --- /dev/null +++ b/ingest/rules/merge_segment_metadata.smk @@ -0,0 +1,26 @@ +""" +This part of the workflow handles how we merge the metadata for each segment +into a central metadata file. +""" + + +rule merge_segment_metadata: + """ + For each subtype's HA metadata file add a column "n_segments" which reports + how many segments have sequence data (no QC performed). This will force the + download & parsing of all segments for a given subtype. Note that this does + not currently consider the prescribed min lengths (see min_length function) + for each segment, but that would be a nice improvement. + """ + input: + segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "{data_source}/data/metadata_ha.tsv", + output: + metadata = "{data_source}/results/metadata.tsv", + shell: + """ + python scripts/add_segment_counts.py \ + --segments {input.segments} \ + --metadata {input.metadata} \ + --output {output.metadata} + """ diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk deleted file mode 100644 index d072738..0000000 --- a/ingest/rules/upload_from_fauna.smk +++ /dev/null @@ -1,92 +0,0 @@ -from pathlib import Path - - -rule download_segment: - output: - sequences = "fauna/data/{segment}.fasta", - params: - fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", - output_dir = lambda wildcards, output: Path(output.sequences).parent, - output_fstem = lambda wildcards, output: Path(output.sequences).stem, - benchmark: - "fauna/benchmarks/download_segment_{segment}.txt" - shell: - """ - python3 {path_to_fauna}/vdb/download.py \ - --database vdb \ - --virus avian_flu \ - --fasta_fields {params.fasta_fields} \ - --select locus:{wildcards.segment} \ - --path {params.output_dir} \ - --fstem {params.output_fstem} - """ - -rule parse_segment: - input: - sequences = "fauna/data/{segment}.fasta", - output: - sequences = "fauna/results/sequences_{segment}.fasta", - metadata = "fauna/results/metadata_{segment}.tsv", - params: - fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", - prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" - shell: - """ - augur parse \ - --sequences {input.sequences} \ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} \ - --fields {params.fasta_fields} \ - --prettify-fields {params.prettify_fields} - """ - -rule merge_segment_metadata: - """ - For each subtype's HA metadata file add a column "n_segments" which reports - how many segments have sequence data (no QC performed). 
-    download & parsing of all segments for a given subtype. Note that this does
-    not currently consider the prescribed min lengths (see min_length function)
-    for each segment, but that would be a nice improvement.
-    """
-    input:
-        segments = expand("fauna/results/metadata_{segment}.tsv", segment=config["segments"]),
-        metadata = "fauna/results/metadata_ha.tsv",
-    output:
-        metadata = "fauna/results/metadata.tsv",
-    shell:
-        """
-        python scripts/add_segment_counts.py \
-            --segments {input.segments} \
-            --metadata {input.metadata} \
-            --output {output.metadata}
-        """
-
-rule upload_sequences:
-    input:
-        sequences="fauna/results/sequences_{segment}.fasta",
-    output:
-        flag=touch("fauna/s3/sequences_{segment}.done"),
-    params:
-        s3_dst=config["s3_dst"],
-    shell:
-        """
-        zstd -c {input.sequences:q} \
-            | aws s3 cp \
-                - \
-                {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst
-        """
-
-rule upload_metadata:
-    input:
-        metadata="fauna/results/metadata.tsv",
-    output:
-        flag=touch("fauna/s3/metadata.done"),
-    params:
-        s3_dst=config["s3_dst"],
-    shell:
-        """
-        zstd -c {input.metadata:q} \
-            | aws s3 cp \
-                - \
-                {params.s3_dst:q}/metadata.tsv.zst
-        """
diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk
new file mode 100644
index 0000000..5fb123c
--- /dev/null
+++ b/ingest/rules/upload_to_s3.smk
@@ -0,0 +1,39 @@
+"""
+This part of the workflow handles uploading files to AWS S3.
+"""
+
+
+rule upload_sequences:
+    input:
+        sequences="{data_source}/results/sequences_{segment}.fasta",
+    output:
+        flag="{data_source}/s3/sequences_{segment}.done",
+    params:
+        s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.sequences:q} \
+            {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
+        """
+
+
+rule upload_metadata:
+    input:
+        metadata="{data_source}/results/metadata.tsv",
+    output:
+        flag="{data_source}/s3/metadata.done",
+    params:
+        s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
+    shell:
+        """
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.metadata:q} \
+            {params.s3_dst:q}/metadata.tsv.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
+        """
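+
+
+# For illustration only, a rough sketch of how upload_sequences expands under
+# the build-configs/ncbi defaults (s3_dst and cloudfront_domain defined earlier
+# in this change) for data_source="ncbi" and segment="ha":
+#
+#   ./vendored/upload-to-s3 \
+#       --quiet \
+#       ncbi/results/sequences_ha.fasta \
+#       s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi/ha/sequences.fasta.zst \
+#       data.nextstrain.org 2>&1 | tee ncbi/s3/sequences_ha.done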