Skip to content

Commit

Permalink
Merge pull request #1252 from nextstrain/victorlin/read-metadata-dtypes
Browse files Browse the repository at this point in the history
Use less dtype inference when reading metadata into DataFrames
  • Loading branch information
victorlin committed Jan 24, 2024
2 parents 44dc4b6 + 7e81765 commit cae562a
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 13 deletions.
11 changes: 11 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@

## __NEXT__

### Features

* `augur.io.read_metadata`: A new optional `dtype` argument allows custom data types for all columns. Automatic type inference still happens by default, so this is not a breaking change. [#1252][] (@victorlin)


### Bug Fixes

* filter, frequencies, refine: Speed up reading of the metadata file. [#1252][] (@victorlin)
* traits: Previously, columns with only numeric values were treated as numerical data. These are now treated as categorical data for discrete trait analysis. [#1252][] (@victorlin)

[#1252]: https://github.com/nextstrain/augur/pull/1252

## 24.0.0 (22 January 2024)

Expand Down
2 changes: 2 additions & 0 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def run(args):
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
Expand Down Expand Up @@ -320,6 +321,7 @@ def run(args):
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
dtype="string",
)
for metadata in metadata_reader:
# Recalculate groups for subsampling as we loop through the
Expand Down
9 changes: 8 additions & 1 deletion augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,14 @@ def format_frequencies(freq):

def run(args):
try:
metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters, id_columns=args.metadata_id_columns)
# TODO: load only the ID, date, and --weights-attribute columns when
# read_metadata supports loading a subset of all columns.
metadata = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
38 changes: 28 additions & 10 deletions augur/io/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class InvalidDelimiter(Exception):
pass


def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAULT_ID_COLUMNS, chunk_size=None):
def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAULT_ID_COLUMNS, chunk_size=None, dtype=None):
r"""Read metadata from a given filename and into a pandas `DataFrame` or
`TextFileReader` object.
Expand All @@ -40,7 +40,9 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAU
Only one id column will be inferred.
chunk_size : int
Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory.
dtype : dict or str
Data types to apply to columns in metadata. If unspecified, pandas data type inference will be used.
See documentation for an argument of the same name to `pandas.read_csv()`.
Returns
-------
pandas.DataFrame or `pandas.io.parsers.TextFileReader`
Expand Down Expand Up @@ -107,15 +109,31 @@ def read_metadata(metadata_file, delimiters=DEFAULT_DELIMITERS, id_columns=DEFAU
else:
index_col = id_columns_present[0]

# If we found a valid column to index the DataFrame, specify that column and
# also tell pandas that the column should be treated like a string instead
# of having its type inferred. This latter argument allows users to provide
# numerical ids that don't get converted to numbers by pandas.
# If we found a valid column to index the DataFrame, specify that column.
kwargs["index_col"] = index_col
kwargs["dtype"] = {
index_col: "string",
METADATA_DATE_COLUMN: "string"
}

if dtype is None:
dtype = {}

if isinstance(dtype, dict):
# Avoid reading numerical IDs as integers.
dtype[index_col] = "string"

# Avoid reading year-only dates as integers.
dtype[METADATA_DATE_COLUMN] = "string"

elif isinstance(dtype, str):
if dtype != "string":
raise AugurError(f"""
dtype='{dtype}' converts values in all columns to be of type
'{dtype}'. However, values in columns '{index_col}' and
'{METADATA_DATE_COLUMN}' must be treated as strings in Augur.
Specify dtype as a dict per column instead.
""")
else:
raise AugurError(f"Unsupported value for dtype: '{dtype}'")

kwargs["dtype"] = dtype

return pd.read_csv(
metadata_file,
Expand Down
6 changes: 5 additions & 1 deletion augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,10 +214,14 @@ def run(args):
print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr)
return 1
try:
# TODO: load only the ID and date columns when read_metadata
# supports loading a subset of all columns.
metadata = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
id_columns=args.metadata_id_columns,
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
6 changes: 5 additions & 1 deletion augur/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ def run(args):
traits = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
id_columns=args.metadata_id_columns,

# Read all columns as string for discrete trait analysis
dtype="string",
)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
25 changes: 25 additions & 0 deletions tests/functional/filter/cram/filter-query-numerical.t
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,28 @@ The 'category' column will fail when used with a numerical comparison.
'>=' not supported between instances of 'str' and 'float'
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
[2]
Create another metadata file for testing.
$ cat >metadata.tsv <<~~
> strain metric1 metric2
> SEQ1 4 5
> SEQ2 5 9
> SEQ3 6 10
> ~~
Use a Pandas query to filter by a numerical value.
This relies on having proper data types associated with the columns. If < is
comparing strings, it's likely that SEQ3 will be dropped or errors arise.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "metric1 > 4 & metric1 < metric2" \
> --output-strains filtered_strains.txt
1 strains were dropped during filtering
\t1 of these were filtered out by the query: "metric1 > 4 & metric1 < metric2" (esc)
2 strains passed all filters
$ sort filtered_strains.txt
SEQ2
SEQ3

0 comments on commit cae562a

Please sign in to comment.