nextstrain · rneher · Jun 10, 2022 · Jun 9, 2022 · Jun 9, 2022 · tsibley
diff --git a/ingest/bin/transform-date-fields b/ingest/bin/transform-date-fields
@@ -9,14 +9,20 @@ from datetime import datetime
 from sys import stderr, stdin, stdout
 
 
-def format_date(date_string: str, expected_formats: set) -> str:
+def format_date(date_string: str, expected_formats: list) -> str:
     """
     Originally from nextstrain/ncov-ingest
 
     Format *date_string* to ISO 8601 date (YYYY-MM-DD).
     If *date_string* does not match *expected_formats*, return *date_string*.
+    If *date_string* is missing the year, return masked date 'XXXX-XX-XX'.
+    If *date_string* is an incomplete date (i.e. missing month or day), then
+    missing values are masked with 'XX'.
 
-    >>> expected_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ'}
+    >>> expected_formats = ['%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d']
+
+    >>> format_date("01-01", expected_formats)
+    'XXXX-XX-XX'
 
     >>> format_date("2020", expected_formats)
     '2020-XX-XX'
@@ -36,14 +42,79 @@ def format_date(date_string: str, expected_formats: set) -> str:
     >>> format_date("2020-01-15T00:00:00Z", expected_formats)
     '2020-01-15'
     """
+    # Potential directives that datetime accepts that can return the correct year, month, day fields
+    # see https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
+    #
+    # Allows us to check if year/month/day are included in the date format so we
+    # know when to mask incomplete dates with 'XX'
+    all_field_directives = {'%c', '%x',
+        ('%G', '%V', '%A'), ('%G', '%V', '%a'), ('%G', '%V', '%w'), ('%G', '%V', '%u')
+    }
+    month_and_day_directives = {'%j',
+        ('%U', '%A'), ('%U', '%a'), ('%U', '%w'), ('%U', '%u'),
+        ('%W', '%A'), ('%W', '%a'), ('%W', '%w'), ('%W', '%u')
+    }
+    year_directives = {'%y', '%Y'}
+    month_directives = {'%b', '%B', '%m'}
+    day_directives = {'%d'}
+
+    def directive_is_included(potential_directives: set, date_format: str) -> bool:
+        """
+        Checks if any of the directives in *potential_directives* is included
+        in *date_format* string.
+
+        If an element within *potential_directives* is a tuple, then all directives
+        within the tuple must be included in *date_format*.
+        """
+        return any(
+            (
+                (isinstance(directive, str) and directive in date_format) or
+                (isinstance(directive, tuple) and all(sub_directive in date_format for sub_directive in directive))
+            )
+            for directive in potential_directives
+        )
+
     for date_format in expected_formats:
         try:
             parsed_date = datetime.strptime(date_string, date_format)
         except ValueError:
             continue
-        year_string = str(parsed_date.year)
-        month_string = str(parsed_date.month).zfill(2) if date_string.count('-') >= 1 else 'XX'
-        day_string = str(parsed_date.day).zfill(2) if date_string.count('-') == 2 else 'XX'
+
+        # Default to date masked as 'XXXX-XX-XX' so we don't return incorrect dates
+        year_string = 'XXXX'
+        month_string = day_string = 'XX'
+
+        parsed_year_string = str(parsed_date.year)
+        parsed_month_string = str(parsed_date.month).zfill(2)
+        parsed_day_string = str(parsed_date.day).zfill(2)
+
+        # If a directive for ALL fields is included in date format,
+        # then use all of the parsed field strings
+        if (directive_is_included(all_field_directives, date_format)):
+            year_string = parsed_year_string
+            month_string = parsed_month_string
+            day_string = parsed_day_string
+
+        # If no directive covering ALL fields is included, then check whether
+        # a year directive was included in date format
+        elif(directive_is_included(year_directives, date_format)):
+            year_string = parsed_year_string
+
+            # Only check for month and day directives if year is included
+            # Check if directive for BOTH month and day is included in date format
+            if (directive_is_included(month_and_day_directives, date_format)):
+                month_string = parsed_month_string
+                day_string = parsed_day_string
+
+            # If no directive for BOTH month and day is included, then check
+            # whether a month directive was included in date format
+            elif(directive_is_included(month_directives, date_format)):
+                month_string = parsed_month_string
+
+                # Only check for day directives if month is included
+                if(directive_is_included(day_directives, date_format)):
+                    day_string = parsed_day_string
+
         return f"{year_string}-{month_string}-{day_string}"
 
     if date_string:
@@ -64,11 +135,12 @@ if __name__ == '__main__':
     parser.add_argument("--date-fields", nargs="+",
         help="List of date field names in the NDJSON record that need to be standardized.")
     parser.add_argument("--expected-date-formats", nargs="+",
-        help="Expected date formats that are currently in the provided date fields")
+        help="Expected date formats that are currently in the provided date fields. " +
+             "If a date string matches multiple formats, it will be parsed as the first format in the list.")
 
     args = parser.parse_args()
 
-    expected_formats = set(args.expected_date_formats)
+    expected_formats = args.expected_date_formats
 
     for record in stdin:
         record = json.loads(record)

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -15,6 +15,8 @@ transform:
   # List of date fields to standardize
   date_fields: ['date', 'date_submitted']
   # Expected date formats present in date fields
+  # These date formats should use directives expected by datetime
+  # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
   expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
   # Titlecase rules
   titlecase: