ingest: simplify NCBI Datasets fields config

Instead of hard-coding the list of NCBI Datasets fields in the workflow, just provide the list via the default config. This makes it easy to customize which fields to include and makes it very obvious that the field_map config for the curation pipeline is changing the names of these NCBI fields. This includes a change in the `format_ncbi_dataset_report` rule to use the provided fields as the header so that we do not have to do a separate renaming of the NCBI column names back to the computer-friendly mnemonics.
nextstrain · Nov 28, 2023 · d9751bb · d9751bb
1 parent 5ac694b
commit d9751bb
Show file tree

Hide file tree

Showing 2 changed files with 44 additions and 59 deletions.
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -10,8 +10,27 @@ entrez_search_term: ""
 # Required to fetch from NCBI Datasets
 ncbi_taxon_id: ""
 
-# Optional fields to add to the NCBI Datasets output
-ncbi_dataset_fields: []
+# The list of NCBI Datasets fields to include from NCBI Datasets output
+# These need to be the mnemonics of the NCBI Datasets fields; see docs for full list of fields
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - sra-accs
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - biosample-acc
+  - submitter-names
+  - submitter-affiliation
+  - submitter-country
 
 # Config parameters related to the curate pipeline
 curate:
@@ -23,26 +42,25 @@ curate:
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "config/geolocation_rules.tsv"
   # List of field names to change where the key is the original field name and the value is the new field name
-  # This is the first step in the pipeline, so any references to field names
-  # in the configs below should use the new field names
-  # The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names.
+  # The original field names should match the ncbi_datasets_fields provided above.
+  # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
-    Source database: database
-    Isolate Collection date: date
-    Release date: date_released
-    Update date: date_updated
-    Accession: accession
-    Isolate Lineage: strain
-    Geographic Region: region
-    Geographic Location: location
-    Submitter Names: authors
-    Submitter Affiliation: institution
-    SRA Accessions: sra_accessions
-    Length: length
-    Host Name: host
-    Isolate Lineage source: sample_type
-    BioSample accession: biosample_accession
-    Submitter Country: submitter_country
+    accession: accession
+    sourcedb: database
+    sra-accs: sra_accessions
+    isolate-lineage: strain
+    geo-region: region
+    geo-location: location
+    isolate-collection-date: date
+    release-date: date_released
+    update-date: date_updated
+    length: length
+    host-name: host
+    isolate-lineage-source: sample_type
+    biosample-acc: biosample_accessions
+    submitter-names: authors
+    submitter-affiliation: institution
+    submitter-country: submitter_country
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: '^.+$'

diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
@@ -67,55 +67,22 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    Additional *provided_fields* will be appended to the end of the list.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "sra-accs",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "biosample-acc",
-        "submitter-names",
-        "submitter-affiliation",
-        "submitter-country",
-    ]
-    return ",".join(fields + provided_fields)
-
-
 rule format_ncbi_dataset_report:
     input:
         dataset_package="data/ncbi_dataset.zip",
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics(
-            config.get("ncbi_dataset_fields", [])
-        ),
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -139,7 +106,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column Accession \
+            --seq-id-column accession \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \