Merge pull request #1146 from nextstrain/curate-format-dates

Curate format dates
nextstrain · Jul 7, 2023 · b03442c · b03442c
2 parents edf06e6 + b4d390d
commit b03442c
Show file tree

Hide file tree

Showing 15 changed files with 443 additions and 81 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,12 +5,14 @@
 ### Features
 
 * export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to customize the possible metadata ID columns. Previously, this was only available in `augur filter`. [#1240][] (@victorlin)
+* Add new sub-subcommand `augur curate format-dates`. The `format-dates` command formats date fields to the ISO 8601 date format (YYYY-MM-DD), where incomplete dates are masked with `XX` (e.g. 2023 -> 2023-XX-XX). [#1146][] (@joverlee521)
 
 ### Bug fixes
 
 * parse: Fix a bug where `--fix-dates` was always applied, with a default of `--fix-dates=monthfirst`. Now, running without `--fix-dates` will leave dates as-is. [#1247][] (@victorlin)
 * `augur.io.open_file`: Previously, the docs described a type restriction on `path_or_buffer` but it was not enforced. It has been updated to allow all I/O classes, and is enforced at run-time. [#1250][] (@victorlin)
 
+[#1146]: https://github.com/nextstrain/augur/pull/1146
 [#1240]: https://github.com/nextstrain/augur/pull/1240
 [#1247]: https://github.com/nextstrain/augur/issues/1247
 [#1250]: https://github.com/nextstrain/augur/pull/1250

diff --git a/augur/argparse_.py b/augur/argparse_.py
@@ -4,6 +4,20 @@
 from argparse import Action, ArgumentDefaultsHelpFormatter
 
 
+# Include this in an argument help string to suppress the automatic appending
+# of the default value by argparse.ArgumentDefaultsHelpFormatter.  This works
+# because the automatic appending is conditional on the presence of %(default),
+# so we include it but then format it as a zero-length string .0s.  🙃
+#
+# Another solution would be to add an extra attribute to the argument (the
+# argparse.Action instance) and then subclass ArgumentDefaultsHelpFormatter to
+# condition on that new attribute, but that seems more brittle.
+#
+# Copied from the Nextstrain CLI repo
+# https://github.com/nextstrain/cli/blob/017c53805e8317951327d24c04184615cc400b09/nextstrain/cli/argparse.py#L13-L21
+SKIP_AUTO_DEFAULT_IN_HELP = "%(default).0s"
+
+
 def add_default_command(parser):
     """
     Sets the default command to run when none is provided.

diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
@@ -12,13 +12,14 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import normalize_strings, passthru
+from . import format_dates, normalize_strings, passthru
 
 
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
 SUBCOMMANDS = [
     passthru,
     normalize_strings,
+    format_dates,
 ]
 
 
@@ -63,12 +64,14 @@ def create_shared_parser():
         help="The name to use for the sequence field when joining sequences from a FASTA file.")
 
     shared_inputs.add_argument("--unmatched-reporting",
-        choices=[ method.value for method in DataErrorMethod ],
-        default=DataErrorMethod.ERROR_FIRST.value,
+        type=DataErrorMethod.argtype,
+        choices=list(DataErrorMethod),
+        default=DataErrorMethod.ERROR_FIRST,
         help="How unmatched records from combined metadata/FASTA input should be reported.")
     shared_inputs.add_argument("--duplicate-reporting",
-        choices=[ method.value for method in DataErrorMethod ],
-        default=DataErrorMethod.ERROR_FIRST.value,
+        type=DataErrorMethod.argtype,
+        choices=list(DataErrorMethod),
+        default=DataErrorMethod.ERROR_FIRST,
         help="How should duplicate records be reported.")
 
     shared_outputs = shared_parser.add_argument_group(
@@ -142,8 +145,8 @@ def run(args):
                 args.fasta,
                 args.seq_id_column,
                 args.seq_field,
-                DataErrorMethod(args.unmatched_reporting),
-                DataErrorMethod(args.duplicate_reporting))
+                args.unmatched_reporting,
+                args.duplicate_reporting)
         except InvalidDelimiter:
             raise AugurError(
                 f"Could not determine the delimiter of {args.metadata!r}. "
@@ -152,7 +155,7 @@ def run(args):
             )
     elif args.metadata:
         try:
-            records = read_table_to_dict(args.metadata, args.metadata_delimiters, DataErrorMethod(args.duplicate_reporting), args.id_column)
+            records = read_table_to_dict(args.metadata, args.metadata_delimiters, args.duplicate_reporting, args.id_column)
         except InvalidDelimiter:
             raise AugurError(
                 f"Could not determine the delimiter of {args.metadata!r}. "

diff --git a/augur/curate/format_dates.py b/augur/curate/format_dates.py
@@ -0,0 +1,211 @@
+"""
+Format date fields to ISO 8601 dates (YYYY-MM-DD), where incomplete dates
+are masked with 'XX' (e.g. 2023 -> 2023-XX-XX).
+"""
+import re
+from datetime import datetime
+
+from augur.argparse_ import SKIP_AUTO_DEFAULT_IN_HELP
+from augur.errors import AugurError
+from augur.io.print import print_err
+from augur.types import DataErrorMethod
+from .format_dates_directives import YEAR_DIRECTIVES, YEAR_MONTH_DIRECTIVES, YEAR_MONTH_DAY_DIRECTIVES
+
+
+def register_parser(parent_subparsers):
+    """
+    Create the parser for the ``format-dates`` subcommand.
+
+    Parameters
+    ----------
+    parent_subparsers
+        Subparsers object of the parent command. Besides ``add_parser``, it
+        must provide a ``shared_parser`` attribute, which is used as a parent
+        of the new parser.
+
+    Returns
+    -------
+    The parser created for the ``format-dates`` subcommand.
+    """
+    parser = parent_subparsers.add_parser("format-dates",
+        parents=[parent_subparsers.shared_parser],
+        help=__doc__)
+
+    # Both options are read unconditionally by run(), so they are marked as
+    # required to have argparse reject an invocation that omits them up front,
+    # rather than failing later on a ``None`` value.
+    required = parser.add_argument_group(title="REQUIRED")
+    required.add_argument("--date-fields", nargs="+", required=True,
+        help="List of date field names in the record that need to be standardized.")
+    required.add_argument("--expected-date-formats", nargs="+", required=True,
+        help="Expected date formats that are currently in the provided date fields, " +
+             "defined by standard format codes as listed at " +
+             "https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes. " +
+             "If a date string matches multiple formats, it will be parsed as the first matched format in the provided order.")
+
+    optional = parser.add_argument_group(title="OPTIONAL")
+    optional.add_argument("--failure-reporting",
+        type=DataErrorMethod.argtype,
+        choices=list(DataErrorMethod),
+        default=DataErrorMethod.ERROR_FIRST,
+        help="How should failed date formatting be reported.")
+    # SKIP_AUTO_DEFAULT_IN_HELP keeps the help formatter from appending its own
+    # "(default: True)" for the store_false flag; the default is spelled out
+    # by hand instead.
+    optional.add_argument("--no-mask-failure", dest="mask_failure",
+        action="store_false",
+        help="Do not mask dates with 'XXXX-XX-XX' and return original date string if date formatting failed. " +
+             f"(default: False{SKIP_AUTO_DEFAULT_IN_HELP})")
+
+    return parser
+
+
+def directive_is_included(potential_directives, date_format):
+    """
+    Checks if any of the directives in *potential_directives* is included
+    in *date_format* string.
+
+    Each element of *potential_directives* is a tuple, and all directives
+    within the tuple must be included in *date_format* for it to match.
+
+    Escaped percent signs ('%%') are treated as literal text, so '%%Y' does
+    not count as the '%Y' directive while '%%%Y' (a literal '%' followed by
+    '%Y') does.
+
+    Parameters
+    ----------
+    potential_directives: set[tuple[str, ...]]
+        Set of potential directives to check
+    date_format: str
+        Date format string to check for directives
+
+    Returns
+    -------
+    bool:
+        Whether the provided *date_format* includes any of the *potential_directives*
+
+
+    >>> potential_directives = {('%y', '%b', '%d'), ('%y', '%B', '%d'), ('%y', '%m', '%d'),}
+    >>> directive_is_included(potential_directives, '%G-%V-%A')
+    False
+    >>> directive_is_included(potential_directives, '%y-%m')
+    False
+    >>> directive_is_included(potential_directives, '%%y-%m-%d')
+    False
+    >>> directive_is_included(potential_directives, '%%%y-%m-%d')
+    True
+    >>> directive_is_included(potential_directives, '%y-%m-%d')
+    True
+    >>> directive_is_included(potential_directives, '%y-%m-%dT%H:%M:%SZ')
+    True
+    """
+    # Tokenize the format from left to right into two-character '%x' tokens so
+    # that an escaped percent sign ('%%') is consumed as a unit. Merely checking
+    # that a directive is not preceded by '%' would wrongly reject '%%%y'.
+    included_directives = set(re.findall(r"%.", date_format, flags=re.DOTALL))
+
+    return any(
+        all(sub_directive in included_directives for sub_directive in directive)
+        for directive in potential_directives
+    )
+
+
+def format_date(date_string, expected_formats):
+    """
+    Format *date_string* to ISO 8601 date (YYYY-MM-DD) by trying to parse it
+    as one of the provided *expected_formats*.
+
+    The formats are tried in the provided order and the first one that
+    successfully parses *date_string* determines the result.
+
+    Parameters
+    ----------
+    date_string: str
+        Date string to format
+    expected_formats: list[str]
+        List of expected formats for the provided date string
+
+    Returns
+    -------
+    str or None:
+        Formatted date string or None if the parsing of the date string failed.
+        If *date_string* is an incomplete date, the date is masked with 'XX'.
+        Dates without year will be formatted as 'XXXX-XX-XX', even if month/day are known.
+        Dates without month will be formatted as 'YYYY-XX-XX', even if day is known.
+        Dates without day will be formatted as 'YYYY-MM-XX'.
+        Years are always zero-padded to four digits.
+
+
+    >>> expected_formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d']
+    >>> format_date("01-01", expected_formats)
+    'XXXX-XX-XX'
+    >>> format_date("2020", expected_formats)
+    '2020-XX-XX'
+    >>> format_date("0999", expected_formats)
+    '0999-XX-XX'
+    >>> format_date("2020-01", expected_formats)
+    '2020-01-XX'
+    >>> format_date("2020-1-15", expected_formats)
+    '2020-01-15'
+    >>> format_date("2020-1-1", expected_formats)
+    '2020-01-01'
+    >>> format_date("2020-01-15", expected_formats)
+    '2020-01-15'
+    >>> format_date("2020-01-15T00:00:00Z", expected_formats)
+    '2020-01-15'
+    >>> format_date("not a date", expected_formats) is None
+    True
+    """
+
+    for date_format in expected_formats:
+        try:
+            parsed_date = datetime.strptime(date_string, date_format)
+        except ValueError:
+            continue
+
+        # Default to date masked as 'XXXX-XX-XX' so we don't return incorrect dates
+        year_string = 'XXXX'
+        month_string = day_string = 'XX'
+
+        # Zero-pad every field to its ISO 8601 width. This includes the year,
+        # since years before 1000 would otherwise have fewer than four digits.
+        parsed_year_string = f"{parsed_date.year:04d}"
+        parsed_month_string = f"{parsed_date.month:02d}"
+        parsed_day_string = f"{parsed_date.day:02d}"
+
+        # If directives for all year,month,day fields are included in date_format,
+        # then use all of the parsed field strings
+        if directive_is_included(YEAR_MONTH_DAY_DIRECTIVES, date_format):
+            year_string = parsed_year_string
+            month_string = parsed_month_string
+            day_string = parsed_day_string
+
+        # If only year and month directives are included in date_format,
+        # then only use the parsed year and month field strings
+        elif directive_is_included(YEAR_MONTH_DIRECTIVES, date_format):
+            year_string = parsed_year_string
+            month_string = parsed_month_string
+
+        # If only a year directive is included in date_format, then only use
+        # the parsed year field string
+        elif directive_is_included(YEAR_DIRECTIVES, date_format):
+            year_string = parsed_year_string
+
+        return f"{year_string}-{month_string}-{day_string}"
+
+    return None
+
+
+def run(args, records):
+    """
+    Format the date fields of each record and yield the updated records.
+
+    This is a generator: each input record is copied, the fields listed in
+    ``args.date_fields`` are formatted with :func:`format_date` using
+    ``args.expected_date_formats``, and the copy is yielded. Fields that are
+    missing or empty are left untouched.
+
+    When a date string cannot be formatted, the field is replaced with
+    'XXXX-XX-XX' if ``args.mask_failure`` is set and otherwise keeps its
+    original value. The failure is then reported according to
+    ``args.failure_reporting``:
+
+    * ``SILENT``: nothing is reported.
+    * ``ERROR_FIRST``: an error is raised at the first failure.
+    * ``WARN``: a warning is printed for each failure, plus a summary of all
+      failures once every record has been processed.
+    * ``ERROR_ALL``: an error listing all failures is raised once every record
+      has been processed.
+
+    Parameters
+    ----------
+    args
+        Parsed command line arguments; reads ``date_fields``,
+        ``expected_date_formats``, ``failure_reporting`` and ``mask_failure``.
+    records
+        Iterable of records. Each record must support ``copy()``, ``get()``
+        and item assignment (presumably dicts -- confirm against the caller).
+
+    Yields
+    ------
+    A copy of each record with its date fields formatted.
+
+    Raises
+    ------
+    AugurError
+        On the first failure with ``ERROR_FIRST``, or after all records have
+        been processed with ``ERROR_ALL``.
+    ValueError
+        If failures were collected under a reporting method that the final
+        summary does not handle.
+    """
+    failures = []
+    failure_reporting = args.failure_reporting
+    for index, record in enumerate(records):
+        # Work on a copy so the caller's record is never modified
+        record = record.copy()
+        # Records are identified in messages by their zero-based position in the input
+        record_id = index
+
+        for field in args.date_fields:
+            date_string = record.get(field)
+
+            # Nothing to format for missing or empty fields
+            if not date_string:
+                continue
+
+            formatted_date_string = format_date(date_string, args.expected_date_formats)
+            if formatted_date_string is None:
+                # Mask failed date formatting before processing error methods
+                # to ensure failures are masked even when failures are "silent"
+                if args.mask_failure:
+                    record[field] = "XXXX-XX-XX"
+
+                if failure_reporting is DataErrorMethod.SILENT:
+                    continue
+
+                failure_message = f"Unable to format date string {date_string!r} in field {field!r} of record {record_id!r}."
+                if failure_reporting is DataErrorMethod.ERROR_FIRST:
+                    raise AugurError(failure_message)
+
+                if failure_reporting is DataErrorMethod.WARN:
+                    print_err(f"WARNING: {failure_message}")
+
+                # Keep track of failures for final summary
+                failures.append((record_id, field, date_string))
+            else:
+                record[field] = formatted_date_string
+
+        yield record
+
+    # The summary below only runs once the generator has been fully consumed,
+    # i.e. after every record has been yielded.
+    if failure_reporting is not DataErrorMethod.SILENT and failures:
+        failure_message = (
+            "Unable to format dates for the following (record, field, date string):\n" + \
+            '\n'.join(map(repr, failures))
+        )
+        if failure_reporting is DataErrorMethod.ERROR_ALL:
+            raise AugurError(failure_message)
+
+        elif failure_reporting is DataErrorMethod.WARN:
+            print_err(f"WARNING: {failure_message}")
+
+        else:
+            # Guards against reporting methods that collect failures but have
+            # no summary handling here
+            raise ValueError(f"Encountered unhandled failure reporting method: {failure_reporting!r}")
diff --git a/augur/curate/format_dates_directives.py b/augur/curate/format_dates_directives.py
@@ -0,0 +1,28 @@
+from itertools import product
+
+# Building blocks: sets of strptime directives, grouped by the part of a date
+# that they provide. They are combined into the directive sets defined below.
+year = {'%y', '%Y'}
+month = {'%b', '%B', '%m'}
+day = {'%d'}
+# '%j' is the day of the year, which determines both the month and the day
+month_and_day = {'%j'}
+# Week number of the year ('%U' counts weeks from Sunday, '%W' from Monday)
+week = {'%U', '%W'}
+day_of_week = {'%A', '%a', '%w', '%u'}
+
+# Each of the sets below contains tuples of directives; a tuple lists the
+# directives that together provide the described level of date completeness.
+
+# Set of directives that can be converted to complete date with year, month, and day
+YEAR_MONTH_DAY_DIRECTIVES = (
+    # Locale's full date representation
+    {('%c',),('%x',)} |
+    # Dates with ISO 8601 week dates for year ('%G' is NOT interchangeable with '%Y'), ISO 8601 week ('%V'), and weekdays
+    {('%G', '%V', '%A'),('%G', '%V', '%a'),('%G', '%V', '%w'),('%G', '%V', '%u')} |
+    # Dates with year, week, and weekday
+    set(product(year, week, day_of_week)) |
+    # Dates with year and day of the year
+    set(product(year, month_and_day)) |
+    # Dates with year, month, and day
+    set(product(year, month, day))
+)
+
+# Set of directives that can be converted to incomplete dates, missing the day
+YEAR_MONTH_DIRECTIVES = set(product(year, month))
+
+# Set of directives that can be converted to incomplete dates, missing the month and day
+# (product() over a single set yields 1-tuples, so the elements are tuples
+# just like in the sets above)
+YEAR_DIRECTIVES = set(product(year))
diff --git a/docs/usage/cli/curate/format-dates.rst b/docs/usage/cli/curate/format-dates.rst
@@ -0,0 +1,9 @@
+============
+format-dates
+============
+
+.. argparse::
+    :module: augur
+    :func: make_parser
+    :prog: augur
+    :path: curate format-dates
diff --git a/docs/usage/cli/curate/index.rst b/docs/usage/cli/curate/index.rst
@@ -17,5 +17,6 @@ We will continue to add more subcommands as we identify other common data curati
     :maxdepth: 1
 
     normalize-strings
+    format-dates
     passthru
 
diff --git a/tests/functional/curate/cram/_setup.sh b/tests/functional/curate/cram/_setup.sh
@@ -0,0 +1 @@
+export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"