Add a parameter to augur parse to specify a different record ID for output sequences FASTA #1403
nextstrain · Feb 12, 2024 · dd8a1cb · dd8a1cb
2 parents 499f0e9 + 9d96cad
commit dd8a1cb
Show file tree

Hide file tree

Showing 5 changed files with 2,245 additions and 7 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -6,6 +6,9 @@
 
 * filter: Added a new option `--query-columns` that allows specifying what columns are used in `--query` along with the expected data types. If unspecified, automatic detection of columns and types is attempted. [#1294][] (@victorlin)
 * `augur.io.read_metadata`: A new optional `columns` argument allows specifying a subset of columns to load. The default behavior still loads all columns, so this is not a breaking change. [#1294][] (@victorlin)
+* `augur parse`: A new optional `--output-id-field` argument allows the user to select any ID field for the produced FASTA file (e.g. 'accession' instead of 'name' or 'strain'). [#1403][] (@j23414)
+  * When no `--output-id-field` is given and the data has both `name` and `strain` fields, continue to preferentially use `name` over `strain` as the sequence ID field, but emit a deprecation warning that the order will be switched to prefer `strain` over `name` in the future to be consistent with the rest of Augur.
+  * Added entry to [DEPRECATED.md](./DEPRECATED.md).
 
 ### Bug Fixes
 
@@ -20,6 +23,7 @@
 [#1294]: https://github.com/nextstrain/augur/pull/1294
 [#1389]: https://github.com/nextstrain/augur/pull/1389
 [#1410]: https://github.com/nextstrain/augur/pull/1410
+[#1403]: https://github.com/nextstrain/augur/pull/1403
 
 ## 24.1.0 (30 January 2024)
 

diff --git a/DEPRECATED.md b/DEPRECATED.md
@@ -4,6 +4,14 @@ These features are deprecated, which means they are no longer maintained and
 will go away in a future major version of Augur. They are currently still
 available for backwards compatibility, but should not be used in new code.
 
+## `augur parse` preference of `name` over `strain` as the sequence ID field
+
+*Deprecated in February 2024. Planned to be reordered June 2024 or after.*
+
+Currently, `augur parse` checks for a 'name' field and then a 'strain' field to use as a sequence ID. This order will be changed to check for a 'strain' field first and then a 'name' field, to be more consistent with the rest of Augur.
+
+Users who have both 'name' and 'strain' fields in their data and want to keep using the 'name' field should add the following `augur parse` parameter: `--output-id-field 'name'`.
+
 ## `augur.utils.read_strains`
 
 *Deprecated in December 2023. Planned for removal March 2024 or after.*

diff --git a/augur/parse.py b/augur/parse.py
@@ -9,6 +9,8 @@
 from .dates import get_numerical_date_from_value
 from .errors import AugurError
 
+PARSE_DEFAULT_ID_COLUMNS = ("name", "strain")
+
 forbidden_characters = str.maketrans(
     {' ': None,
      '(': '_',
@@ -133,8 +135,6 @@ def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettif
             dayfirst=fix_dates_format=='dayfirst'
         )
 
-    metadata["strain"] = sequence.id
-
     return sequence, metadata
 
 
@@ -143,6 +143,8 @@ def register_parser(parent_subparsers):
     parser.add_argument('--sequences', '-s', required=True, help="sequences in fasta or VCF format")
     parser.add_argument('--output-sequences', required=True, help="output sequences file")
     parser.add_argument('--output-metadata', required=True, help="output metadata file")
+    parser.add_argument('--output-id-field', required=False,
+                        help=f"The record field to use as the sequence identifier in the FASTA output. If not provided, this will use the first available of {PARSE_DEFAULT_ID_COLUMNS}. If none of those are available, this will use the first field in the fasta header.")
     parser.add_argument('--fields', required=True, nargs='+', help="fields in fasta header")
     parser.add_argument('--prettify-fields', nargs='+', help="apply string prettifying operations (underscores to spaces, capitalization, etc) to specified metadata fields")
     parser.add_argument('--separator', default='|', help="separator of fasta header")
@@ -162,12 +164,20 @@ def run(args):
     # field to index the dictionary and the data frame
     meta_data = {}
 
-    if 'name' in args.fields:
-        strain_key = 'name'
-    elif 'strain' in args.fields:
-        strain_key = 'strain'
+    strain_key = None
+    if args.output_id_field:
+        if args.output_id_field not in args.fields:
+            raise AugurError(f"Output id field '{args.output_id_field}' not found in fields {args.fields}.")
+        strain_key = args.output_id_field
     else:
-        strain_key = args.fields[0]
+        for possible_id in PARSE_DEFAULT_ID_COLUMNS:
+            if possible_id in args.fields:
+                strain_key = possible_id
+                if possible_id == "name" and "strain" in args.fields:
+                    print("DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').\nUsers who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'", file=sys.stderr)
+                break
+        if not strain_key:
+            strain_key = args.fields[0]
 
     # loop over sequences, parse fasta header of each sequence
     with open_file(args.output_sequences, "wt") as handle:

diff --git a/tests/functional/parse.t b/tests/functional/parse.t
@@ -15,6 +15,7 @@ This should fail.
   .* (re)
   .* (re)
   .* (re)
+  .* (re)
   augur parse: error: the following arguments are required: --fields
   [2]
 
@@ -32,6 +33,83 @@ Parse Zika sequences into sequences and metadata.
   $ diff -u "parse/metadata.tsv" "$TMP/metadata.tsv"
   $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"
 
+Parse Zika sequences into sequences and metadata using a different metadata field as record id (e.g. accession)
+
+  $ ${AUGUR} parse \
+  >   --sequences parse/zika.fasta \
+  >   --output-sequences "$TMP/sequences.fasta" \
+  >   --output-metadata "$TMP/metadata.tsv" \
+  >   --output-id-field accession \
+  >   --fields strain virus accession date region country division city db segment authors url title journal paper_url \
+  >   --prettify-fields region country division city \
+  >   --fix-dates monthfirst
+
+  $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta"
+  $ diff -u "parse/metadata.tsv" "$TMP/metadata.tsv"
+  $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"
+
+Try to parse Zika sequences with an --output-id-field value that is not one of the --fields (e.g. a misspelled field name).
+This should fail.
+
+  $ ${AUGUR} parse \
+  >   --sequences parse/zika.fasta \
+  >   --output-sequences "$TMP/sequences.fasta" \
+  >   --output-metadata "$TMP/metadata.tsv" \
+  >   --output-id-field notexist \
+  >   --fields strain virus accession date region country division city db segment authors url title journal paper_url \
+  >   --prettify-fields region country division city \
+  >   --fix-dates monthfirst
+  ERROR: Output id field 'notexist' not found in fields ['strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url'].
+  [2]
+
+Parse Zika sequences into sequences and metadata; the preferred default id is 'name', then 'strain', then the first field.
+
+  $ ${AUGUR} parse \
+  >   --sequences parse/zika.fasta \
+  >   --output-sequences "$TMP/sequences.fasta" \
+  >   --output-metadata "$TMP/metadata.tsv" \
+  >   --fields strain virus name date region country division city db segment authors url title journal paper_url \
+  >   --prettify-fields region country division city \
+  >   --fix-dates monthfirst
+  DEPRECATED: The default search order for the ID field will be changing from ('name', 'strain') to ('strain', 'name').
+  Users who prefer to keep using 'name' instead of 'strain' should use the parameter: --output-id-field 'name'
+
+  $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta"
+  $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"
+
+Parse Zika sequences into sequences and metadata when there is no 'name' field.
+This should use the 2nd entry in PARSE_DEFAULT_ID_COLUMNS ('name', 'strain') instead.
+
+  $ ${AUGUR} parse \
+  >   --sequences parse/zika.fasta \
+  >   --output-sequences "$TMP/sequences.fasta" \
+  >   --output-metadata "$TMP/metadata.tsv" \
+  >   --fields col1 virus strain date region country division city db segment authors url title journal paper_url \
+  >   --prettify-fields region country division city \
+  >   --fix-dates monthfirst
+
+  $ diff -u "parse/sequences_other.fasta" "$TMP/sequences.fasta"
+  $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"
+
+Parse Zika sequences into sequences and metadata when no --output-id-field is provided and none of the fields match PARSE_DEFAULT_ID_COLUMNS (i.e. ('name', 'strain')).
+This should use the first field as the id field and the metadata should not have an extra strain or name column.
+
+  $ ${AUGUR} parse \
+  >   --sequences parse/zika.fasta \
+  >   --output-sequences "$TMP/sequences.fasta" \
+  >   --output-metadata "$TMP/metadata.tsv" \
+  >   --fields col1 virus col3 date region country division city db segment authors url title journal paper_url \
+  >   --prettify-fields region country division city \
+  >   --fix-dates monthfirst
+
+  $ diff -u "parse/sequences.fasta" "$TMP/sequences.fasta"
+  $ diff "parse/metadata.tsv" "$TMP/metadata.tsv" | tr '>' '+' | tr '<' '-'
+  1c1
+  - strain\tvirus\taccession\tdate\tregion\tcountry\tdivision\tcity\tdb\tsegment\tauthors\turl\ttitle\tjournal\tpaper_url (esc)
+  ---
+  + col1\tvirus\tcol3\tdate\tregion\tcountry\tdivision\tcity\tdb\tsegment\tauthors\turl\ttitle\tjournal\tpaper_url (esc)
+  $ rm -f "$TMP/sequences.fasta" "$TMP/metadata.tsv"
+
 Parse compressed Zika sequences into sequences and metadata.
 
   $ ${AUGUR} parse \