Skip to content

Commit

Permalink
Merge pull request #1146 from nextstrain/curate-format-dates
Browse files Browse the repository at this point in the history
Curate format dates
  • Loading branch information
joverlee521 committed Jul 7, 2023
2 parents edf06e6 + b4d390d commit b03442c
Show file tree
Hide file tree
Showing 15 changed files with 443 additions and 81 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
### Features

* export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to customize the possible metadata ID columns. Previously, this was only available in `augur filter`. [#1240][] (@victorlin)
* Add new sub-subcommand augur curate format-dates. The format-dates command is intended to be used to format date fields to ISO 8601 date format (YYYY-MM-DD), where incomplete dates are masked with `XX` (e.g. 2023 -> 2023-XX-XX). [#1146][] (@joverlee521)

### Bug fixes

* parse: Fix a bug where `--fix-dates` was always applied, with a default of `--fix-dates=monthfirst`. Now, running without `--fix-dates` will leave dates as-is. [#1247][] (@victorlin)
* `augur.io.open_file`: Previously, the docs described a type restriction on `path_or_buffer` but it was not enforced. It has been updated to allow all I/O classes, and is enforced at run-time. [#1250][] (@victorlin)

[#1146]: https://github.com/nextstrain/augur/pull/1146
[#1240]: https://github.com/nextstrain/augur/pull/1240
[#1247]: https://github.com/nextstrain/augur/issues/1247
[#1250]: https://github.com/nextstrain/augur/pull/1250
Expand Down
14 changes: 14 additions & 0 deletions augur/argparse_.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@
from argparse import Action, ArgumentDefaultsHelpFormatter


# Include this in an argument help string to suppress the automatic appending
# of the default value by argparse.ArgumentDefaultsHelpFormatter. This works
# because the automatic appending is conditional on the presence of %(default),
# so we include it but format it with the `.0s` spec, which renders it as a
# zero-length string. 🙃
#
# Another solution would be to add an extra attribute to the argument (the
# argparse.Action instance) and then subclass ArgumentDefaultsHelpFormatter to
# condition on that new attribute, but that seems more brittle.
#
# Copied from the Nextstrain CLI repo
# https://github.com/nextstrain/cli/blob/017c53805e8317951327d24c04184615cc400b09/nextstrain/cli/argparse.py#L13-L21
SKIP_AUTO_DEFAULT_IN_HELP = "%(default).0s"


def add_default_command(parser):
"""
Sets the default command to run when none is provided.
Expand Down
19 changes: 11 additions & 8 deletions augur/curate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
from augur.io.sequences import write_records_to_fasta
from augur.types import DataErrorMethod
from . import normalize_strings, passthru
from . import format_dates, normalize_strings, passthru


SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
SUBCOMMANDS = [
passthru,
normalize_strings,
format_dates,
]


Expand Down Expand Up @@ -63,12 +64,14 @@ def create_shared_parser():
help="The name to use for the sequence field when joining sequences from a FASTA file.")

shared_inputs.add_argument("--unmatched-reporting",
choices=[ method.value for method in DataErrorMethod ],
default=DataErrorMethod.ERROR_FIRST.value,
type=DataErrorMethod.argtype,
choices=list(DataErrorMethod),
default=DataErrorMethod.ERROR_FIRST,
help="How unmatched records from combined metadata/FASTA input should be reported.")
shared_inputs.add_argument("--duplicate-reporting",
choices=[ method.value for method in DataErrorMethod ],
default=DataErrorMethod.ERROR_FIRST.value,
type=DataErrorMethod.argtype,
choices=list(DataErrorMethod),
default=DataErrorMethod.ERROR_FIRST,
help="How should duplicate records be reported.")

shared_outputs = shared_parser.add_argument_group(
Expand Down Expand Up @@ -142,8 +145,8 @@ def run(args):
args.fasta,
args.seq_id_column,
args.seq_field,
DataErrorMethod(args.unmatched_reporting),
DataErrorMethod(args.duplicate_reporting))
args.unmatched_reporting,
args.duplicate_reporting)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand All @@ -152,7 +155,7 @@ def run(args):
)
elif args.metadata:
try:
records = read_table_to_dict(args.metadata, args.metadata_delimiters, DataErrorMethod(args.duplicate_reporting), args.id_column)
records = read_table_to_dict(args.metadata, args.metadata_delimiters, args.duplicate_reporting, args.id_column)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
211 changes: 211 additions & 0 deletions augur/curate/format_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""
Format date fields to ISO 8601 dates (YYYY-MM-DD), where incomplete dates
are masked with 'XX' (e.g. 2023 -> 2023-XX-XX).
"""
import re
from datetime import datetime

from augur.argparse_ import SKIP_AUTO_DEFAULT_IN_HELP
from augur.errors import AugurError
from augur.io.print import print_err
from augur.types import DataErrorMethod
from .format_dates_directives import YEAR_DIRECTIVES, YEAR_MONTH_DIRECTIVES, YEAR_MONTH_DAY_DIRECTIVES


def register_parser(parent_subparsers):
    """
    Register the ``format-dates`` subcommand on *parent_subparsers*.

    The new parser inherits the options of ``parent_subparsers.shared_parser``
    and uses this module's docstring as its help text.

    Parameters
    ----------
    parent_subparsers
        Sub-parsers action of the parent command; it must expose a
        ``shared_parser`` attribute.

    Returns
    -------
    The newly created parser for ``format-dates``.
    """
    parser = parent_subparsers.add_parser("format-dates",
        parents=[parent_subparsers.shared_parser],
        help=__doc__)

    # Both options below are iterated unconditionally when the command runs,
    # so they are enforced at parse time. Without `required=True` an omitted
    # option is left as None and the command fails later with an opaque
    # TypeError instead of a usage error.
    required = parser.add_argument_group(title="REQUIRED")
    required.add_argument("--date-fields", nargs="+", required=True,
        help="List of date field names in the record that need to be standardized.")
    required.add_argument("--expected-date-formats", nargs="+", required=True,
        help="Expected date formats that are currently in the provided date fields, " +
             "defined by standard format codes as listed at " +
             "https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes. " +
             "If a date string matches multiple formats, it will be parsed as the first matched format in the provided order.")

    optional = parser.add_argument_group(title="OPTIONAL")
    optional.add_argument("--failure-reporting",
        type=DataErrorMethod.argtype,
        choices=list(DataErrorMethod),
        default=DataErrorMethod.ERROR_FIRST,
        help="How should failed date formatting be reported.")
    # `store_false` with dest="mask_failure": masking stays enabled unless the
    # flag is passed. The help text spells out the default by hand and uses
    # SKIP_AUTO_DEFAULT_IN_HELP to stop argparse from appending its own.
    optional.add_argument("--no-mask-failure", dest="mask_failure",
        action="store_false",
        help="Do not mask dates with 'XXXX-XX-XX' and return original date string if date formatting failed. " +
             f"(default: False{SKIP_AUTO_DEFAULT_IN_HELP})")

    return parser


def directive_is_included(potential_directives, date_format):
    """
    Checks if any of the directives in *potential_directives* is included
    in *date_format* string.

    If an element within *potential_directives* is a tuple, then all directives
    within the tuple must be included in *date_format*.

    Parameters
    ----------
    potential_directives: set[tuple[str, ...]]
        Set of potential directives to check
    date_format: str
        Date format string to check for directives

    Returns
    -------
    bool:
        Whether the provided *date_format* includes any of the *potential_directives*

    >>> potential_directives = {('%y', '%b', '%d'), ('%y', '%B', '%d'), ('%y', '%m', '%d'),}
    >>> directive_is_included(potential_directives, '%G-%V-%A')
    False
    >>> directive_is_included(potential_directives, '%y-%m')
    False
    >>> directive_is_included(potential_directives, '%%y-%m-%d')
    False
    >>> directive_is_included(potential_directives, '%y-%m-%d')
    True
    >>> directive_is_included(potential_directives, '%y-%m-%dT%H:%M:%SZ')
    True
    >>> directive_is_included(potential_directives, '%%%y-%m-%d')
    True
    """
    # Tokenize the format left to right in '%' + character pairs, so that an
    # escaped percent ('%%') consumes both characters. A plain "not preceded
    # by '%'" lookbehind misreads '%%%y' (literal '%' followed by the '%y'
    # directive) as an escaped directive.
    present_directives = {
        f"%{character}"
        for character in re.findall(r"%(.)", date_format, flags=re.DOTALL)
    }
    # '%%' is a literal percent sign, never a date directive.
    present_directives.discard("%%")

    return any(
        all(sub_directive in present_directives for sub_directive in directive)
        for directive in potential_directives
    )


def format_date(date_string, expected_formats):
    """
    Format *date_string* to ISO 8601 date (YYYY-MM-DD) by trying to parse it
    as one of the provided *expected_formats*.

    Parameters
    ----------
    date_string: str
        Date string to format
    expected_formats: list[str]
        List of expected formats for the provided date string

    Returns
    -------
    str or None:
        Formatted date string or None if the parsing of the date string failed.
        If *date_string* is an incomplete date, the date is masked with 'XX'.
        Dates without year will be formatted as 'XXXX-XX-XX', even if month/day are known.
        Dates without month will be formatted as 'YYYY-XX-XX', even if day is known.
        Dates without day will be formatted as 'YYYY-MM-XX'.

    >>> expected_formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d']
    >>> format_date("01-01", expected_formats)
    'XXXX-XX-XX'
    >>> format_date("2020", expected_formats)
    '2020-XX-XX'
    >>> format_date("0999", expected_formats)
    '0999-XX-XX'
    >>> format_date("2020-01", expected_formats)
    '2020-01-XX'
    >>> format_date("2020-1-15", expected_formats)
    '2020-01-15'
    >>> format_date("2020-1-1", expected_formats)
    '2020-01-01'
    >>> format_date("2020-01-15", expected_formats)
    '2020-01-15'
    >>> format_date("2020-01-15T00:00:00Z", expected_formats)
    '2020-01-15'
    """

    for date_format in expected_formats:
        try:
            parsed_date = datetime.strptime(date_string, date_format)
        except ValueError:
            # Not this format; try the next one in the provided order.
            continue

        # Default to date masked as 'XXXX-XX-XX' so we don't return incorrect dates.
        # strptime fills in default values for fields missing from the format,
        # so parsed fields are only trusted when the format contains them.
        year_string = 'XXXX'
        month_string = day_string = 'XX'

        # Zero-pad every field so years before 1000 still render as YYYY.
        parsed_year_string = f"{parsed_date.year:04d}"
        parsed_month_string = f"{parsed_date.month:02d}"
        parsed_day_string = f"{parsed_date.day:02d}"

        # If directives for all year, month, and day fields are included in
        # date_format, then use all of the parsed field strings
        if directive_is_included(YEAR_MONTH_DAY_DIRECTIVES, date_format):
            year_string = parsed_year_string
            month_string = parsed_month_string
            day_string = parsed_day_string

        # If only directives for year and month are included in date_format,
        # then only use the parsed year and month field strings
        elif directive_is_included(YEAR_MONTH_DIRECTIVES, date_format):
            year_string = parsed_year_string
            month_string = parsed_month_string

        # If only directives for year are included in date_format, then only
        # use the parsed year field string
        elif directive_is_included(YEAR_DIRECTIVES, date_format):
            year_string = parsed_year_string

        return f"{year_string}-{month_string}-{day_string}"

    return None


def run(args, records):
    """
    Yield a copy of each record in *records* with the fields listed in
    ``args.date_fields`` formatted by :func:`format_date`.

    Empty or missing date fields are left untouched. When a date string
    cannot be formatted, the field is replaced with 'XXXX-XX-XX' if
    ``args.mask_failure`` is set, and the failure is then handled according
    to ``args.failure_reporting`` (a ``DataErrorMethod``): ignored, raised
    immediately, warned about, or collected for a summary that is raised or
    printed once all records have been yielded.

    Raises
    ------
    AugurError
        On the first failure (``ERROR_FIRST``) or after all records with a
        summary of every failure (``ERROR_ALL``).
    ValueError
        If failures were collected under an unhandled reporting method.
    """
    reporting = args.failure_reporting
    unformatted = []

    # Records are identified by their position in the input
    for record_id, original in enumerate(records):
        record = original.copy()

        for field in args.date_fields:
            date_string = record.get(field)
            if not date_string:
                continue

            formatted = format_date(date_string, args.expected_date_formats)
            if formatted is not None:
                record[field] = formatted
                continue

            # Mask before consulting the reporting method so that failures
            # are masked even when they are reported "silently"
            if args.mask_failure:
                record[field] = "XXXX-XX-XX"

            if reporting is DataErrorMethod.SILENT:
                continue

            failure_message = f"Unable to format date string {date_string!r} in field {field!r} of record {record_id!r}."
            if reporting is DataErrorMethod.ERROR_FIRST:
                raise AugurError(failure_message)

            if reporting is DataErrorMethod.WARN:
                print_err(f"WARNING: {failure_message}")

            # Remember the failure for the final summary
            unformatted.append((record_id, field, date_string))

        yield record

    if reporting is DataErrorMethod.SILENT or not unformatted:
        return

    summary = (
        "Unable to format dates for the following (record, field, date string):\n"
        + '\n'.join(map(repr, unformatted))
    )

    if reporting is DataErrorMethod.ERROR_ALL:
        raise AugurError(summary)

    if reporting is DataErrorMethod.WARN:
        print_err(f"WARNING: {summary}")
        return

    raise ValueError(f"Encountered unhandled failure reporting method: {reporting!r}")
28 changes: 28 additions & 0 deletions augur/curate/format_dates_directives.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from itertools import product

# Individual strptime directives, grouped by the date component they encode
year = {'%Y', '%y'}
month = {'%m', '%b', '%B'}
day = {'%d'}
month_and_day = {'%j'}
week = {'%W', '%U'}
day_of_week = {'%a', '%A', '%u', '%w'}

# Directive combinations that can be converted to a complete date with year,
# month, and day. Each element is a tuple of directives that must all be present.
YEAR_MONTH_DAY_DIRECTIVES = set().union(
    # Locale's full date representation
    [('%c',), ('%x',)],
    # ISO 8601 week dates: ISO year ('%G' is NOT interchangeable with '%Y'),
    # ISO week ('%V'), and a weekday
    product(['%G'], ['%V'], ['%A', '%a', '%w', '%u']),
    # Year, week of the year, and weekday
    product(year, week, day_of_week),
    # Year and day of the year
    product(year, month_and_day),
    # Year, month, and day of the month
    product(year, month, day),
)

# Directive combinations that yield an incomplete date, missing the day
YEAR_MONTH_DIRECTIVES = {
    (year_directive, month_directive)
    for year_directive in year
    for month_directive in month
}

# Directive combinations that yield an incomplete date, missing the month and day
YEAR_DIRECTIVES = {(year_directive,) for year_directive in year}
9 changes: 9 additions & 0 deletions docs/usage/cli/curate/format-dates.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
============
format-dates
============

.. argparse::
:module: augur
:func: make_parser
:prog: augur
:path: curate format-dates
1 change: 1 addition & 0 deletions docs/usage/cli/curate/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ We will continue to add more subcommands as we identify other common data curati
:maxdepth: 1

normalize-strings
format-dates
passthru

1 change: 1 addition & 0 deletions tests/functional/curate/cram/_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"

0 comments on commit b03442c

Please sign in to comment.