Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter: Use intermediate columns for grouping #1070

Merged
merged 3 commits into from Oct 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES.md
Expand Up @@ -6,7 +6,12 @@

* filter: Add support to group by ISO week (`--group-by week`) during subsampling. [#1067][] (@victorlin)

### Bug Fixes

* filter: Fixed unintended behavior in which grouping by `day` would "work" when used with `month` and/or `year`. Grouping by `day` is now ignored in that case. [#1070][] (@victorlin)

[#1067]: https://github.com/nextstrain/augur/pull/1067
[#1070]: https://github.com/nextstrain/augur/pull/1070

## 18.0.0 (21 September 2022)

Expand Down
64 changes: 46 additions & 18 deletions augur/filter.py
Expand Up @@ -14,6 +14,7 @@
import random
import re
import sys
import uuid
from tempfile import NamedTemporaryFile
from typing import Collection

Expand Down Expand Up @@ -1026,58 +1027,84 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
metadata.drop(col, axis=1, inplace=True)

if 'date' not in metadata:
# set generated columns to 'unknown'
# Set generated columns to 'unknown'.
print(f"WARNING: A 'date' column could not be found to group-by {sorted(generated_columns_requested)}.", file=sys.stderr)
print(f"Filtering by group may behave differently than expected!", file=sys.stderr)
df_dates = pd.DataFrame({col: 'unknown' for col in GROUP_BY_GENERATED_COLUMNS}, index=metadata.index)
metadata = pd.concat([metadata, df_dates], axis=1)
else:
# replace date with year/month/day as nullable ints
date_cols = ['year', 'month', 'day']
# Create a DataFrame with year/month/day columns as nullable ints.
# These columns are prefixed to note temporary usage. They are used
# to generate other columns, and will be discarded at the end.
temp_prefix = str(uuid.uuid4())
temp_date_cols = [f'{temp_prefix}year', f'{temp_prefix}month', f'{temp_prefix}day']
df_dates = metadata['date'].str.split('-', n=2, expand=True)
df_dates = df_dates.set_axis(date_cols[:len(df_dates.columns)], axis=1)
missing_date_cols = set(date_cols) - set(df_dates.columns)
df_dates = df_dates.set_axis(temp_date_cols[:len(df_dates.columns)], axis=1)
missing_date_cols = set(temp_date_cols) - set(df_dates.columns)
for col in missing_date_cols:
df_dates[col] = pd.NA
for col in date_cols:
for col in temp_date_cols:
df_dates[col] = pd.to_numeric(df_dates[col], errors='coerce').astype(pd.Int64Dtype())

# Extend metadata with generated date columns
# Drop the 'date' column since it should not be used for grouping.
metadata = pd.concat([metadata.drop('date', axis=1), df_dates], axis=1)
if 'year' in generated_columns_requested:
# skip ambiguous years
df_skip = metadata[metadata['year'].isnull()]
metadata.dropna(subset=['year'], inplace=True)
# Skip ambiguous years.
df_skip = metadata[metadata[f'{temp_prefix}year'].isnull()]
metadata.dropna(subset=[f'{temp_prefix}year'], inplace=True)
for strain in df_skip.index:
skipped_strains.append({
"strain": strain,
"filter": "skip_group_by_with_ambiguous_year",
"kwargs": "",
})

# Make a generated 'year' column available for grouping.
metadata['year'] = metadata[f'{temp_prefix}year']

if 'month' in generated_columns_requested:
# skip ambiguous months
df_skip = metadata[metadata['month'].isnull()]
metadata.dropna(subset=['month'], inplace=True)
# Skip ambiguous months.
df_skip = metadata[metadata[f'{temp_prefix}month'].isnull()]
metadata.dropna(subset=[f'{temp_prefix}month'], inplace=True)
for strain in df_skip.index:
skipped_strains.append({
"strain": strain,
"filter": "skip_group_by_with_ambiguous_month",
"kwargs": "",
})
# month = (year, month)
metadata['month'] = list(zip(metadata['year'], metadata['month']))

# Make a generated 'month' column available for grouping.
metadata['month'] = list(zip(
metadata[f'{temp_prefix}year'],
metadata[f'{temp_prefix}month']
))

if 'week' in generated_columns_requested:
# skip ambiguous days
df_skip = metadata[metadata['day'].isnull()]
metadata.dropna(subset=['day'], inplace=True)
# Skip ambiguous days.
df_skip = metadata[metadata[f'{temp_prefix}day'].isnull()]
metadata.dropna(subset=[f'{temp_prefix}day'], inplace=True)
for strain in df_skip.index:
skipped_strains.append({
"strain": strain,
"filter": "skip_group_by_with_ambiguous_day",
"kwargs": "",
})

# Make a generated 'week' column available for grouping.
# Note that week = (year, week) from the date.isocalendar().
# Do not combine the raw year with the ISO week number alone,
# since raw year ≠ ISO year.
metadata['week'] = metadata.apply(lambda row: get_iso_year_week(row['year'], row['month'], row['day']), axis=1)
metadata['week'] = metadata.apply(lambda row: get_iso_year_week(
row[f'{temp_prefix}year'],
row[f'{temp_prefix}month'],
row[f'{temp_prefix}day']
), axis=1
)

# Drop the internally used columns.
for col in temp_date_cols:
metadata.drop(col, axis=1, inplace=True)

unknown_groups = group_by_set - set(metadata.columns)
if unknown_groups:
Expand All @@ -1086,6 +1113,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
for group in unknown_groups:
metadata[group] = 'unknown'

# Finally, determine groups.
group_by_strain = dict(zip(metadata.index, metadata[group_by].apply(tuple, axis=1)))
return group_by_strain, skipped_strains

Expand Down