From 4c044eaec4b42ab8a21a6cb8d68f8382a5af4dad Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 13:38:34 -0700 Subject: [PATCH 1/9] ingest: Centralize rules for merging segment metadata All three data sources go through the same process of merging metadata per segment into a single metadata TSV. Deduplicate the rules and make sure we are merging the data the same way so that the final output metadata has the same format across all data sources. I suspect the process for merging the metadata will grow over time as we add QC checks, so I've pulled this out into a completely separate rules file. --- ingest/Snakefile | 6 ++++- ingest/build-configs/ncbi/rules/curate.smk | 19 ---------------- ingest/rules/ingest_andersen_lab.smk | 20 +---------------- ingest/rules/merge_segment_metadata.smk | 26 ++++++++++++++++++++++ ingest/rules/upload_from_fauna.smk | 23 +------------------ 5 files changed, 33 insertions(+), 61 deletions(-) create mode 100644 ingest/rules/merge_segment_metadata.smk diff --git a/ingest/Snakefile b/ingest/Snakefile index 6630803..35c3fea 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -3,8 +3,11 @@ path_to_fauna = '../fauna' # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "defaults/config.yaml" +SUPPORTED_DATA_SOURCES = ["fauna", "ncbi", "andersen-lab"] + wildcard_constraints: - segment = "|".join(config["segments"]) + segment = "|".join(config["segments"]), + data_source = "|".join(SUPPORTED_DATA_SOURCES) rule all: # As of 2024-05-16 the default ingest only ingests data from fauna @@ -20,6 +23,7 @@ rule upload_all: include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" +include: "rules/merge_segment_metadata.smk" # Allow users to import custom rules provided via the config. if "custom_rules" in config: diff --git a/ingest/build-configs/ncbi/rules/curate.smk b/ingest/build-configs/ncbi/rules/curate.smk index cafd961..0e025ce 100644 --- a/ingest/build-configs/ncbi/rules/curate.smk +++ b/ingest/build-configs/ncbi/rules/curate.smk @@ -148,22 +148,3 @@ rule subset_metadata: tsv-select -H -f {params.metadata_fields} \ {input.metadata} > {output.subset_metadata} """ - - -rule merge_ncbi_segment_metadata: - """ - Add a column "n_segments" which reports how many segments - have sequence data (no QC performed). 
- """ - input: - segments = expand("ncbi/data/metadata_{segment}.tsv", segment=config["ncbi_segments"]), - metadata = "ncbi/data/metadata_ha.tsv", - output: - metadata = "ncbi/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/rules/ingest_andersen_lab.smk index 023d547..51bbacd 100644 --- a/ingest/rules/ingest_andersen_lab.smk +++ b/ingest/rules/ingest_andersen_lab.smk @@ -99,7 +99,7 @@ rule match_metadata_and_segment_fasta: metadata = "andersen-lab/data/metadata.tsv", fasta = "andersen-lab/data/{segment}.fasta" output: - metadata = "andersen-lab/results/metadata_{segment}.tsv", + metadata = "andersen-lab/data/metadata_{segment}.tsv", fasta = "andersen-lab/results/sequences_{segment}.fasta" log: "andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt", @@ -118,21 +118,3 @@ rule match_metadata_and_segment_fasta: --output-seq-field sequence \ 2> {log} """ - -rule merge_andersen_segment_metadata: - """ - Add a column "n_segments" which reports how many segments - have sequence data (no QC performed). - """ - input: - segments = expand("andersen-lab/results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "andersen-lab/results/metadata_ha.tsv", - output: - metadata = "andersen-lab/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ diff --git a/ingest/rules/merge_segment_metadata.smk b/ingest/rules/merge_segment_metadata.smk new file mode 100644 index 0000000..d421167 --- /dev/null +++ b/ingest/rules/merge_segment_metadata.smk @@ -0,0 +1,26 @@ +""" +This part of the workflow handles how we merge the metadata for each segment +into a central metadata file. +""" + + +rule merge_segment_metadata: + """ + For each subtype's HA metadata file add a column "n_segments" which reports + how many segments have sequence data (no QC performed). This will force the + download & parsing of all segments for a given subtype. Note that this does + not currently consider the prescribed min lengths (see min_length function) + for each segment, but that would be a nice improvement. 
+ """ + input: + segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "{data_source}/data/metadata_ha.tsv", + output: + metadata = "{data_source}/results/metadata.tsv", + shell: + """ + python scripts/add_segment_counts.py \ + --segments {input.segments} \ + --metadata {input.metadata} \ + --output {output.metadata} + """ diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index d072738..c4dce80 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -26,7 +26,7 @@ rule parse_segment: sequences = "fauna/data/{segment}.fasta", output: sequences = "fauna/results/sequences_{segment}.fasta", - metadata = "fauna/results/metadata_{segment}.tsv", + metadata = "fauna/data/metadata_{segment}.tsv", params: fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" @@ -40,27 +40,6 @@ rule parse_segment: --prettify-fields {params.prettify_fields} """ -rule merge_segment_metadata: - """ - For each subtype's HA metadata file add a column "n_segments" which reports - how many segments have sequence data (no QC performed). This will force the - download & parsing of all segments for a given subtype. Note that this does - not currently consider the prescribed min lengths (see min_length function) - for each segment, but that would be a nice improvement. - """ - input: - segments = expand("fauna/results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "fauna/results/metadata_ha.tsv", - output: - metadata = "fauna/results/metadata.tsv", - shell: - """ - python scripts/add_segment_counts.py \ - --segments {input.segments} \ - --metadata {input.metadata} \ - --output {output.metadata} - """ - rule upload_sequences: input: sequences="fauna/results/sequences_{segment}.fasta", From da5d9e05c7657133d7c577413946a8c53026f6e9 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 14:27:46 -0700 Subject: [PATCH 2/9] ingest: Separate upload rules from fauna ingest Use wildcards to make the upload rules data source agnostic. Nest each data source's wildcard name under the `s3_dst` config param so that it's easier to upload to different S3 URLs. This will allow us to run NCBI and Andersen lab ingests in parallel when we eventually want to join their data. --- ingest/Snakefile | 1 + .../build-configs/ncbi/defaults/config.yaml | 3 ++ ingest/defaults/config.yaml | 4 ++- ingest/rules/upload_from_fauna.smk | 30 ---------------- ingest/rules/upload_to_s3.smk | 35 +++++++++++++++++++ 5 files changed, 42 insertions(+), 31 deletions(-) create mode 100644 ingest/rules/upload_to_s3.smk diff --git a/ingest/Snakefile b/ingest/Snakefile index 35c3fea..3d04b62 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -24,6 +24,7 @@ rule upload_all: include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" include: "rules/merge_segment_metadata.smk" +include: "rules/upload_to_s3.smk" # Allow users to import custom rules provided via the config. 
if "custom_rules" in config: diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml index 299ecae..1049618 100644 --- a/ingest/build-configs/ncbi/defaults/config.yaml +++ b/ingest/build-configs/ncbi/defaults/config.yaml @@ -126,3 +126,6 @@ curate: - gisaid_clade - h5_clade - genbank_accession + +s3_dst: + ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index c46739f..1d2ebf5 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -8,4 +8,6 @@ segments: - mp - ns -s3_dst: "s3://nextstrain-data-private/files/workflows/avian-flu" +s3_dst: + fauna: s3://nextstrain-data-private/files/workflows/avian-flu + andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index c4dce80..e74ead4 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -39,33 +39,3 @@ rule parse_segment: --fields {params.fasta_fields} \ --prettify-fields {params.prettify_fields} """ - -rule upload_sequences: - input: - sequences="fauna/results/sequences_{segment}.fasta", - output: - flag=touch("fauna/s3/sequences_{segment}.done"), - params: - s3_dst=config["s3_dst"], - shell: - """ - zstd -c {input.sequences:q} \ - | aws s3 cp \ - - \ - {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst - """ - -rule upload_metadata: - input: - metadata="fauna/results/metadata.tsv", - output: - flag=touch("fauna/s3/metadata.done"), - params: - s3_dst=config["s3_dst"], - shell: - """ - zstd -c {input.metadata:q} \ - | aws s3 cp \ - - \ - {params.s3_dst:q}/metadata.tsv.zst - """ diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk new file mode 100644 index 0000000..67dceb7 --- /dev/null +++ b/ingest/rules/upload_to_s3.smk @@ -0,0 +1,35 @@ +""" +This part of the workflow handles uploading files to AWS S3. 
+""" + + +rule upload_sequences: + input: + sequences="{data_source}/results/sequences_{segment}.fasta", + output: + flag=touch("{data_source}/s3/sequences_{segment}.done"), + params: + s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source], + shell: + """ + zstd -c {input.sequences:q} \ + | aws s3 cp \ + - \ + {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst + """ + + +rule upload_metadata: + input: + metadata="{data_source}/results/metadata.tsv", + output: + flag=touch("{data_source}/s3/metadata.done"), + params: + s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source], + shell: + """ + zstd -c {input.metadata:q} \ + | aws s3 cp \ + - \ + {params.s3_dst:q}/metadata.tsv.zst + """ From f5b610c3f5f181ca1d0a1dd1d11a30c824be013d Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 15:17:55 -0700 Subject: [PATCH 3/9] ingest: Rename fauna rule file to reflect reality --- ingest/Snakefile | 2 +- ingest/rules/{upload_from_fauna.smk => ingest_fauna.smk} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename ingest/rules/{upload_from_fauna.smk => ingest_fauna.smk} (100%) diff --git a/ingest/Snakefile b/ingest/Snakefile index 3d04b62..1e3cc0d 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -21,7 +21,7 @@ rule upload_all: sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]), metadata="fauna/s3/metadata.done", -include: "rules/upload_from_fauna.smk" +include: "rules/ingest_fauna.smk" include: "rules/ingest_andersen_lab.smk" include: "rules/merge_segment_metadata.smk" include: "rules/upload_to_s3.smk" diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/ingest_fauna.smk similarity index 100% rename from ingest/rules/upload_from_fauna.smk rename to ingest/rules/ingest_fauna.smk From 04b6a95eb34cc3f7c11ae4beb0dc2eeac8043c6f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 31 May 2024 16:08:50 -0700 Subject: [PATCH 4/9] ingest: Move Andersen Lab under build-configs/ncbi/ We are planning to run this in parallel with the NCBI ingest and eventually merge their data. They will also share more config params with each other than with the default fauna ingest. Andersen Lab is using NCBI SRA data, so I think it makes sense to be under the NCBI umbrella. --- ingest/README.md | 2 +- ingest/Snakefile | 1 - ingest/build-configs/ncbi/Snakefile | 10 ++++++++++ .../ncbi/bin/curate_andersen_lab_data} | 1 + ingest/build-configs/ncbi/defaults/config.yaml | 1 + .../ncbi}/rules/ingest_andersen_lab.smk | 2 +- ingest/defaults/config.yaml | 1 - 7 files changed, 14 insertions(+), 4 deletions(-) rename ingest/{scripts/curate_andersen_lab_data.py => build-configs/ncbi/bin/curate_andersen_lab_data} (99%) mode change 100644 => 100755 rename ingest/{ => build-configs/ncbi}/rules/ingest_andersen_lab.smk (98%) diff --git a/ingest/README.md b/ingest/README.md index a69d222..e85812b 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -60,7 +60,7 @@ Only run this workflow as needed to see the latest available data in the repo. It does not merge or deduplicate the data with the fauna data used in the default ingest workflow. ```sh -nextstrain build . merge_andersen_segment_metadata +nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml ``` The results will be available in `andersen-lab/results/`. 
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 1e3cc0d..eb3839c 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -22,7 +22,6 @@ rule upload_all:
         metadata="fauna/s3/metadata.done",
 
 include: "rules/ingest_fauna.smk"
-include: "rules/ingest_andersen_lab.smk"
 include: "rules/merge_segment_metadata.smk"
 include: "rules/upload_to_s3.smk"
 
diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index e3255d2..881a2e9 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -12,6 +12,16 @@ rule ingest_ncbi:
         ], segment=config["ncbi_segments"].keys()),
         "ncbi/results/metadata.tsv",
 
+
+rule ingest_andersen_lab:
+    input:
+        expand([
+            "andersen-lab/results/sequences_{segment}.fasta",
+        ], segment=config["segments"]),
+        "andersen-lab/results/metadata.tsv",
+
+
+# Include file paths are relative to this Snakefile
+include: "rules/ingest_andersen_lab.smk"
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
diff --git a/ingest/scripts/curate_andersen_lab_data.py b/ingest/build-configs/ncbi/bin/curate_andersen_lab_data
old mode 100644
new mode 100755
similarity index 99%
rename from ingest/scripts/curate_andersen_lab_data.py
rename to ingest/build-configs/ncbi/bin/curate_andersen_lab_data
index a7b62d6..5cc636a
--- a/ingest/scripts/curate_andersen_lab_data.py
+++ b/ingest/build-configs/ncbi/bin/curate_andersen_lab_data
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 Curate the metadata that originated from Andersen Lab's avian-influenza repo
 <https://github.com/andersen-lab/avian-influenza>.
diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml
index 1049618..ad7ee87 100644
--- a/ingest/build-configs/ncbi/defaults/config.yaml
+++ b/ingest/build-configs/ncbi/defaults/config.yaml
@@ -129,3 +129,4 @@ curate:
 
 s3_dst:
   ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
+  andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab
diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
similarity index 98%
rename from ingest/rules/ingest_andersen_lab.smk
rename to ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
index 51bbacd..25dfb51 100644
--- a/ingest/rules/ingest_andersen_lab.smk
+++ b/ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
@@ -83,7 +83,7 @@ rule curate_metadata:
         """
         augur curate normalize-strings \
             --metadata {input.metadata} \
-            | python3 ./scripts/curate_andersen_lab_data.py \
+            | ./build-configs/ncbi/bin/curate_andersen_lab_data \
             | ./vendored/apply-geolocation-rules \
                 --geolocation-rules {input.geolocation_rules} \
             | augur curate passthru \
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 1d2ebf5..cdbcfef 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -10,4 +10,3 @@ segments:
 
 s3_dst:
   fauna: s3://nextstrain-data-private/files/workflows/avian-flu
-  andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab

From d1ebb406810840c8925df0256a0bd33f956432d4 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 16:41:46 -0700
Subject: [PATCH 5/9] ingest/build-configs/ncbi: Use `segments` config param
 for default rule

Use the `segments` param to determine the default outputs for
`ingest_ncbi` to match the `merge_segment_metadata` rule.

Adds a sanity check that the requested segments are represented in the
`ncbi_segments` map.
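As an aside, a slightly friendlier variant of this sanity check could report exactly which requested segments are missing from the map — a hypothetical alternative, not what this patch implements:

```python
# Hypothetical variant of the patch's sanity check, run in the same Snakemake
# config context: report which requested segments are missing from the
# ncbi_segments map instead of failing with a bare AssertionError.
missing = [segment for segment in config["segments"] if segment not in config["ncbi_segments"]]
assert not missing, f"Segments missing from ncbi_segments: {missing}"
```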
---
 ingest/build-configs/ncbi/Snakefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index 881a2e9..14b209f 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -5,11 +5,15 @@ workflow and defines its default outputs.
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "build-configs/ncbi/defaults/config.yaml"
 
+# Sanity check that the requested segments match our ncbi_segments map
+assert all(segment in config["ncbi_segments"].keys() for segment in config["segments"])
+
+
 rule ingest_ncbi:
     input:
         expand([
             "ncbi/results/sequences_{segment}.fasta",
-        ], segment=config["ncbi_segments"].keys()),
+        ], segment=config["segments"]),
         "ncbi/results/metadata.tsv",
 

From 2c0cbcbb08c714f154dbd8af87ae5e53110aa1b4 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 16:54:01 -0700
Subject: [PATCH 6/9] ingest/build-configs/ncbi: Add target `upload_all_ncbi`

The new target `upload_all_ncbi` will run and upload all files for the
NCBI and Andersen lab ingests.

I didn't see a need for individual rules to upload each data source
(i.e. upload_ncbi and upload_andersen_lab) but they can be added in the
future as needed.
---
 ingest/build-configs/ncbi/Snakefile | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/ingest/build-configs/ncbi/Snakefile b/ingest/build-configs/ncbi/Snakefile
index 14b209f..1b2c62b 100644
--- a/ingest/build-configs/ncbi/Snakefile
+++ b/ingest/build-configs/ncbi/Snakefile
@@ -8,6 +8,7 @@ configfile: "build-configs/ncbi/defaults/config.yaml"
 # Sanity check that the requested segments match our ncbi_segments map
 assert all(segment in config["ncbi_segments"].keys() for segment in config["segments"])
 
+NCBI_DATA_SOURCES = ["ncbi", "andersen-lab"]
 
 rule ingest_ncbi:
     input:
@@ -25,6 +26,15 @@ rule ingest_andersen_lab:
         "andersen-lab/results/metadata.tsv",
 
 
+# Uploads all results for NCBI and Andersen Lab ingests
+rule upload_all_ncbi:
+    input:
+        expand([
+            "{data_source}/s3/sequences_{segment}.done",
+            "{data_source}/s3/metadata.done",
+        ], data_source=NCBI_DATA_SOURCES, segment=config["segments"]),
+
+
 # Include file paths are relative to this Snakefile
 include: "rules/ingest_andersen_lab.smk"
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"

From 1bfeb779bceb3305f4b94f7441f7562e3563c2d9 Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 17:09:04 -0700
Subject: [PATCH 7/9] ingest/README: Update with instructions for public
 sources
---
 ingest/README.md | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/ingest/README.md b/ingest/README.md
index e85812b..5881825 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -12,14 +12,46 @@ This workflow requires the Nextstrain CLI's Docker runtime which includes [fauna
 > NOTE: All command examples assume you are within the `ingest` directory.
 > If running commands from the outer `avian-flu` directory, replace the `.` with `ingest`.
 
-### Ingest data from NCBI GenBank
+### Ingest and upload data from public sources to S3
+
+#### Ingest NCBI GenBank
 
 To download, parse and curate data from NCBI GenBank run the following command.
 
 ```sh
 nextstrain build . ingest_ncbi --configfile build-configs/ncbi/defaults/config.yaml
 ```
 
+This results in the files `metadata.tsv`, `sequences_ha.fasta`, etc. under `ncbi/results/`.
+
+#### Ingest from Andersen lab's avian-influenza repo
+
+Ingest publicly available consensus sequences and metadata from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
+Only run this workflow as needed to see the latest available data in the repo.
+It does not merge or deduplicate the data with the NCBI GenBank workflow.
+
+```sh
+nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml
+```
+
+The results will be available in `andersen-lab/results/`.
+
+#### Upload to S3
+
+To run both the NCBI GenBank and Andersen Lab ingests _and_ upload results to S3,
+run the following command:
+
+```sh
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    . \
+    upload_all_ncbi \
+    --configfile build-configs/ncbi/defaults/config.yaml
+```
+
+The workflow compresses the local files and uploads them to S3 at the corresponding paths
+under `s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi` and
+`s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab`.
 
 ### Ingest and upload data from fauna to S3
 
@@ -53,17 +85,6 @@ nextstrain build \
 . upload_all
 ```
 
-### Ingest from Andersen lab's avian-influenza repo
-
-Ingest publicly available consensus sequences and metadata from Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
-Only run this workflow as needed to see the latest available data in the repo.
-It does not merge or deduplicate the data with the fauna data used in the default ingest workflow.
-
-```sh
-nextstrain build . ingest_andersen_lab --configfile build-configs/ncbi/defaults/config.yaml
-```
-
-The results will be available in `andersen-lab/results/`.
 
 ## Configuration

From 600da7725c1660c2f204c40a82261d2253a10dce Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 15:35:59 -0700
Subject: [PATCH 8/9] ingest: Switch to vendored/upload-to-s3

Replace the `aws s3 cp` commands with the vendored/upload-to-s3 script
to use a couple of its built-in features:
- CloudFront invalidation
- add Metadata.sha256sum to be able to track file changes
---
 .../build-configs/ncbi/defaults/config.yaml |  2 ++
 ingest/rules/upload_to_s3.smk               | 24 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/ingest/build-configs/ncbi/defaults/config.yaml b/ingest/build-configs/ncbi/defaults/config.yaml
index ad7ee87..af71ec9 100644
--- a/ingest/build-configs/ncbi/defaults/config.yaml
+++ b/ingest/build-configs/ncbi/defaults/config.yaml
@@ -130,3 +130,5 @@ curate:
 s3_dst:
   ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
   andersen-lab: s3://nextstrain-data/files/workflows/avian-flu/h5n1/andersen-lab
+
+cloudfront_domain: data.nextstrain.org
diff --git a/ingest/rules/upload_to_s3.smk b/ingest/rules/upload_to_s3.smk
index 67dceb7..5fb123c 100644
--- a/ingest/rules/upload_to_s3.smk
+++ b/ingest/rules/upload_to_s3.smk
@@ -7,15 +7,17 @@ rule upload_sequences:
     input:
         sequences="{data_source}/results/sequences_{segment}.fasta",
     output:
-        flag=touch("{data_source}/s3/sequences_{segment}.done"),
+        flag="{data_source}/s3/sequences_{segment}.done",
     params:
         s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
    shell:
         """
-        zstd -c {input.sequences:q} \
-            | aws s3 cp \
-                - \
-                {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.sequences:q} \
+            {params.s3_dst:q}/{wildcards.segment}/sequences.fasta.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
         """
@@ -23,13 +25,15 @@ rule upload_metadata:
     input:
         metadata="{data_source}/results/metadata.tsv",
     output:
-        flag=touch("{data_source}/s3/metadata.done"),
+        flag="{data_source}/s3/metadata.done",
     params:
         s3_dst=lambda wildcards: config["s3_dst"][wildcards.data_source],
+        cloudfront_domain=config.get("cloudfront_domain", ""),
     shell:
         """
-        zstd -c {input.metadata:q} \
-            | aws s3 cp \
-                - \
-                {params.s3_dst:q}/metadata.tsv.zst
+        ./vendored/upload-to-s3 \
+            --quiet \
+            {input.metadata:q} \
+            {params.s3_dst:q}/metadata.tsv.zst \
+            {params.cloudfront_domain} 2>&1 | tee {output.flag}
         """
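A side note on the sha256sum metadata mentioned above: once a file has been uploaded this way, the checksum can be read back from S3 to detect changes. A minimal sketch using boto3, assuming `vendored/upload-to-s3` stores the digest under a `sha256sum` metadata key as the commit message describes, and that the object below has already been uploaded:

```python
import boto3

# Read back the sha256sum metadata attached by vendored/upload-to-s3.
# Bucket and key follow the s3_dst values configured in this patch series.
s3 = boto3.client("s3")
head = s3.head_object(
    Bucket="nextstrain-data",
    Key="files/workflows/avian-flu/h5n1/ncbi/metadata.tsv.zst",
)
print(head["Metadata"].get("sha256sum"))
```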
From e86aa9731dd707a34c8581c446579c8623ce75ce Mon Sep 17 00:00:00 2001
From: Jover Lee
Date: Fri, 31 May 2024 17:40:15 -0700
Subject: [PATCH 9/9] Update Snakefile.genome build to accept S3_SRC for
 public data
---
 README.md        | 9 +++++++--
 Snakefile.genome | 8 ++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ad756e4..63f0ebf 100755
--- a/README.md
+++ b/README.md
@@ -69,10 +69,15 @@ Specifically, the files needed are `ingest/results/metadata.tsv` and `ingest/res
 Run full genome builds with the following command.
 
 ``` bash
-nextstrain build . --snakefile Snakefile.genome --config local_ingest=True ingest_source=ncbi
+nextstrain build \
+    --env AWS_ACCESS_KEY_ID \
+    --env AWS_SECRET_ACCESS_KEY \
+    . \
+    --snakefile Snakefile.genome \
+    --config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
 ```
 
-Currently this is only set up for the "h5n1-cattle-outbreak" build using locally ingested NCBI data,
+Currently this is only set up for the "h5n1-cattle-outbreak" build using NCBI data,
 and the build is restricted to a set of strains where we think there's no reassortment,
 with outgroups excluded in `config/dropped_strains_h5n1-cattle-outbreak.txt`.
 Output files will be placed in `results/h5n1-cattle-outbreak/genome`.
diff --git a/Snakefile.genome b/Snakefile.genome
index 97de351..bcd02d9 100644
--- a/Snakefile.genome
+++ b/Snakefile.genome
@@ -1,7 +1,11 @@
 include: "rules/common.smk"
 
-assert LOCAL_INGEST == True and INGEST_SOURCE == "ncbi", \
-    "Full genome build is only set up for local ingest from 'ncbi'."
+if LOCAL_INGEST:
+    assert INGEST_SOURCE == "ncbi", \
+        "Full genome build is only set up for local ingest from 'ncbi'."
+else:
+    assert S3_SRC.startswith("s3://nextstrain-data/"), \
+        "Full genome build is only set up for data from the public S3 bucket"
 
 import json
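For context on how `Snakefile.genome` can reference `LOCAL_INGEST`, `INGEST_SOURCE`, and `S3_SRC`: those constants come from `rules/common.smk`, which is not shown in this series. A rough, hypothetical sketch of how that file might map the `--config` options used in the README commands onto these constants (the config key names come from the commands above; the defaults are guesses):

```python
# Hypothetical sketch of rules/common.smk (not part of this patch series).
# Maps the --config options used in the README onto the constants that
# Snakefile.genome asserts against.
LOCAL_INGEST = bool(config.get("local_ingest", False))
INGEST_SOURCE = config.get("ingest_source", "fauna")
S3_SRC = config.get("s3_src", "")
```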