Skip to content

Commit

Permalink
Merge pull request #1252 from nextstrain/victorlin/read-metadata-dtypes
Browse files Browse the repository at this point in the history
Use less dtype inference when reading metadata into DataFrames
  • Loading branch information
victorlin committed Jan 24, 2024
2 parents 44dc4b6 + 7e81765 commit cae562a
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 13 deletions.
11 changes: 11 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@

## __NEXT__

### Features

* `augur.io.read_metadata`: A new optional `dtype` argument allows custom data types for all columns. Automatic type inference still happens by default, so this is not a breaking change. [#1252][] (@victorlin)


### Bug Fixes

* filter, frequencies, refine: Speed up reading of the metadata file. [#1252][] (@victorlin)
* traits: Previously, columns with only numeric values were treated as numerical data. These are now treated as categorical data for discrete trait analysis. [#1252][] (@victorlin)

[#1252]: https://github.com/nextstrain/augur/pull/1252

## 24.0.0 (22 January 2024)

Expand Down
2 changes: 2 additions & 0 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def run(args):
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
Expand Down Expand Up @@ -320,6 +321,7 @@ def run(args):
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
dtype="string",
)
for metadata in metadata_reader:
# Recalculate groups for subsampling as we loop through the
Expand Down
9 changes: 8 additions & 1 deletion augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,14 @@ def format_frequencies(freq):

def run(args):
try:
metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters, id_columns=args.metadata_id_columns)
# TODO: load only the ID, date, and --weights-attribute columns when
# read_metadata supports loading a subset of all columns.
metadata = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
38 changes: 28 additions & 10 deletions augur/io/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class InvalidDelimiter(Exception):
pass


def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAULT_ID_COLUMNS, chunk_size=None):
def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAULT_ID_COLUMNS, chunk_size=None, dtype=None):
r"""Read metadata from a given filename and into a pandas `DataFrame` or
`TextFileReader` object.
Expand All @@ -40,7 +40,9 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAU
Only one id column will be inferred.
chunk_size : int
Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory.
dtype : dict or str
Data types to apply to columns in metadata. If unspecified, pandas data type inference will be used.
See documentation for an argument of the same name to `pandas.read_csv()`.
Returns
-------
pandas.DataFrame or `pandas.io.parsers.TextFileReader`
Expand Down Expand Up @@ -107,15 +109,31 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAU
else:
index_col = id_columns_present[0]

# If we found a valid column to index the DataFrame, specify that column and
# also tell pandas that the column should be treated like a string instead
# of having its type inferred. This latter argument allows users to provide
# numerical ids that don't get converted to numbers by pandas.
# If we found a valid column to index the DataFrame, specify that column.
kwargs["index_col"] = index_col
kwargs["dtype"] = {
index_col: "string",
METADATA_DATE_COLUMN: "string"
}

if dtype is None:
dtype = {}

if isinstance(dtype, dict):
# Avoid reading numerical IDs as integers.
dtype[index_col] = "string"

# Avoid reading year-only dates as integers.
dtype[METADATA_DATE_COLUMN] = "string"

elif isinstance(dtype, str):
if dtype != "string":
raise AugurError(f"""
dtype='{dtype}' converts values in all columns to be of type
'{dtype}'. However, values in columns '{index_col}' and
'{METADATA_DATE_COLUMN}' must be treated as strings in Augur.
Specify dtype as a dict per column instead.
""")
else:
raise AugurError(f"Unsupported value for dtype: '{dtype}'")

kwargs["dtype"] = dtype

return pd.read_csv(
metadata_file,
Expand Down
6 changes: 5 additions & 1 deletion augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,10 +214,14 @@ def run(args):
print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr)
return 1
try:
# TODO: load only the ID and date columns when read_metadata
# supports loading a subset of all columns.
metadata = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
id_columns=args.metadata_id_columns,
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
6 changes: 5 additions & 1 deletion augur/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ def run(args):
traits = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
id_columns=args.metadata_id_columns,

# Read all columns as string for discrete trait analysis
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
25 changes: 25 additions & 0 deletions tests/functional/filter/cram/filter-query-numerical.t
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,28 @@ The 'category' column will fail when used with a numerical comparison.
'>=' not supported between instances of 'str' and 'float'
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
[2]
Create another metadata file for testing.
$ cat >metadata.tsv <<~~
> strain metric1 metric2
> SEQ1 4 5
> SEQ2 5 9
> SEQ3 6 10
> ~~
Use a Pandas query to filter by a numerical value.
This relies on having proper data types associated with the columns. If < is
comparing strings, it's likely that SEQ3 will be dropped or errors arise.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "metric1 > 4 & metric1 < metric2" \
> --output-strains filtered_strains.txt
1 strains were dropped during filtering
\t1 of these were filtered out by the query: "metric1 > 4 & metric1 < metric2" (esc)
2 strains passed all filters
$ sort filtered_strains.txt
SEQ2
SEQ3

0 comments on commit cae562a

Please sign in to comment.