Skip to content

Commit

Permalink
Merge pull request #485 from nextstrain/allow_max_seqs_arg
Browse files Browse the repository at this point in the history
Allow use of `--subsample-max-sequences` in `filter`
  • Loading branch information
huddlej committed Sep 15, 2020
2 parents 67d0fd4 + 5a44a97 commit d69d324
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 24 deletions.
47 changes: 47 additions & 0 deletions Snakefile
@@ -1,11 +1,58 @@
import copy
from os import environ
from socket import getfqdn
from getpass import getuser
from snakemake.logging import logger
from snakemake.utils import validate
import time

# Store the user's configuration prior to loading defaults, so we can check for
# reused subsampling scheme names in the user's config. We need to make a deep
# copy because Snakemake will deep merge the subsampling dictionary later,
# modifying the values of a reference or shallow copy. Note that this loading of
# the user's config prior to the defaults below depends on the order Snakemake
# loads its configfiles. Specifically, the order of config loading is:
#
# 1. First, configfile arguments are loaded and config is built from these [1].
# 2. Then, config arguments are loaded and override existing config keys [2].
# 3. Then, the Snakefile is parsed and configfile directive inside the Snakefile is processed [3].
# When configfile is loaded from the directive in the Snakefile, the config
# dictionary is deep merged with the files [4] from the externally provided
# config files. This is the only place the deep merge happens using the
# update_config function [5].
#
# [1] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L466-L471
# [2] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L476-L477
# [3] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L551-L553
# [4] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/workflow.py#L1088-L1094
# [5] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/utils.py#L455-L476
user_subsampling = copy.deepcopy(config.get("subsampling", {}))

configfile: "defaults/parameters.yaml"
validate(config, schema="workflow/schemas/config.schema.yaml")

# Check for overlapping subsampling schemes in user and default
# configurations. For now, issue a deprecation warning, so users know they
# should rename their subsampling schemes. In the future, this reuse of the same
# name will cause an error.
subsampling_config = config.get("subsampling", {})
overlapping_schemes = []
for scheme_name, scheme in user_subsampling.items():
if scheme_name in subsampling_config and subsampling_config.get(scheme_name) != scheme:
overlapping_schemes.append(scheme_name)

if len(overlapping_schemes) > 0:
logger.warning(f"WARNING: The following subsampling scheme(s) have the same name as a default scheme in this workflow but different definitions:")
logger.warning("")
for scheme in overlapping_schemes:
logger.warning(f" - {scheme}")
logger.warning("")
logger.warning(" This means Snakemake will merge your scheme with the default scheme and may produce unexpected behavior.")
logger.warning(f" To avoid errors in your workflow, rename your schemes with unique names (e.g., 'custom_{overlapping_schemes[0]}')")
logger.warning(" In future versions of this workflow, overlapping subsampling scheme names will produce an error.")
logger.warning("")
time.sleep(5)

# default build if none specified in config
if "builds" not in config:
config["builds"] = {
Expand Down
39 changes: 37 additions & 2 deletions docs/customizing-analysis.md
Expand Up @@ -62,10 +62,45 @@ We implement hierarchical subsampling by producing multiple samples at different
A build can specify any number of such samples which can be flexibly restricted to particular meta data fields and subsampled from groups with particular properties.
When specifying subsampling in this way, we'll first take sequences from the 'focal' area, and the select samples from other geographical areas.
Read further for information on how we select these samples.
Here, we'll look at the advanced example (`./my_profiles/example_advanced_customization`) file to explain some of the options.

In this example, we'll look at a subsampling scheme which defines a `canton`.
When specifying how many sequences you want in a subsampling level (for example, from a country or a region), you can do this using either `seq_per_group` or `max_sequences` - these work with the `group_by` argument.
For example, `switzerland` subsampling rules in the advanced example looks like this:
```yaml
switzerland:
# Focal samples for country
country:
group_by: "division year month"
max_sequences: 1500
exclude: "--exclude-where 'country!={country}'"
# Contextual samples from country's region
region:
group_by: "country year month"
seq_per_group: 20
exclude: "--exclude-where 'country={country}' 'region!={region}'"
priorities:
type: "proximity"
focus: "country"
# Contextual samples from the rest of the world,
# excluding the current region to avoid resampling.
global:
group_by: "country year month"
seq_per_group: 10
exclude: "--exclude-where 'region={region}'"
priorities:
type: "proximity"
focus: "country"
```

For `country`-level sampling above, we specify that we want a maximum of 1,500 sequences from the country in question (here, Switzerland).
Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total.

Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20.
This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken).

Now we'll look at a subsampling scheme which defines a multi-`canton` build.
Cantons are regional divisions in Switzerland - below 'country,' but above 'location' (often city-level).
In the advanced example (`./my_profiles/example_advanced_customization`), we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world.
In the advanced example, we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world.

For cantons this looks like this:
```yaml
Expand Down
4 changes: 2 additions & 2 deletions my_profiles/example/builds.yaml
Expand Up @@ -16,7 +16,7 @@ builds:
# with a build name that will produce the following URL fragment on Nextstrain/auspice:
# /ncov/north-america/usa/washington/king-county
north-america_usa_washington_king-county: # name of the build; this can be anything
subsampling_scheme: county # use a custom subsampling scheme defined below
subsampling_scheme: custom-county # use a custom subsampling scheme defined below
region: North America
country: USA
division: Washington
Expand Down Expand Up @@ -60,7 +60,7 @@ builds:

# Define custom subsampling logic for county-level builds.
subsampling:
county:
custom-county:
# Focal samples for location
focal:
group_by: "year month"
Expand Down
4 changes: 2 additions & 2 deletions my_profiles/example_advanced_customization/builds.yaml
Expand Up @@ -38,7 +38,7 @@ subsampling:
# Focal samples for country
country:
group_by: "division year month"
seq_per_group: 200
max_sequences: 1500
exclude: "--exclude-where 'country!={country}'"
# Contextual samples from country's region
region:
Expand Down Expand Up @@ -93,7 +93,7 @@ subsampling:
focus: "division"

# This build will take from 3 cantons - we have a sample rule for each,
# rather than just one division that's focal build
# rather than just one division that's focal build
lac-leman:
# focal samples
geneva:
Expand Down
20 changes: 10 additions & 10 deletions nextstrain_profiles/nextstrain/builds.yaml
Expand Up @@ -8,30 +8,30 @@ custom_rules:
# (They override the defaults)
builds:
global:
subsampling_scheme: region_global
subsampling_scheme: nextstrain_region_global
auspice_config: "nextstrain_profiles/nextstrain/global_auspice_config.json"
africa:
subsampling_scheme: region
subsampling_scheme: nextstrain_region
region: Africa
auspice_config: "nextstrain_profiles/nextstrain/africa_auspice_config.json"
asia:
subsampling_scheme: region
subsampling_scheme: nextstrain_region
region: Asia
auspice_config: "nextstrain_profiles/nextstrain/asia_auspice_config.json"
europe:
subsampling_scheme: region_grouped_by_country
subsampling_scheme: nextstrain_region_grouped_by_country
region: Europe
auspice_config: "nextstrain_profiles/nextstrain/europe_auspice_config.json"
north-america:
subsampling_scheme: region
subsampling_scheme: nextstrain_region
region: North America
auspice_config: "nextstrain_profiles/nextstrain/north-america_auspice_config.json"
oceania:
subsampling_scheme: region
subsampling_scheme: nextstrain_region
region: Oceania
auspice_config: "nextstrain_profiles/nextstrain/oceania_auspice_config.json"
south-america:
subsampling_scheme: region
subsampling_scheme: nextstrain_region
region: South America
auspice_config: "nextstrain_profiles/nextstrain/south-america_auspice_config.json"

Expand Down Expand Up @@ -105,7 +105,7 @@ slack_channel: "#ncov-gisaid-updates"

subsampling:
# Custom subsampling logic for regions
region:
nextstrain_region:
# Focal samples for region
region:
group_by: "division year month"
Expand All @@ -121,14 +121,14 @@ subsampling:
focus: "region"

# Custom subsampling logic for global region.
region_global:
nextstrain_region_global:
global:
group_by: "country year month"
seq_per_group: 16

# Custom subsampling for regions like Europe where grouping by country
# is the smallest resolution requied
region_grouped_by_country:
nextstrain_region_grouped_by_country:
# Focal samples for region
region:
group_by: "country year month"
Expand Down
2 changes: 1 addition & 1 deletion workflow/envs/nextstrain.yaml
Expand Up @@ -19,6 +19,6 @@ dependencies:
- pip
- pip:
- awscli==1.18.45
- nextstrain-augur==9.0.0
- nextstrain-augur==10.0.2
- nextstrain-cli==1.16.5
- rethinkdb==2.3.0.post6
46 changes: 39 additions & 7 deletions workflow/snakemake_rules/main_workflow.smk
Expand Up @@ -281,9 +281,22 @@ def _get_subsampling_settings(wildcards):
subsampling_settings = config["subsampling"][subsampling_scheme]

if hasattr(wildcards, "subsample"):
return subsampling_settings[wildcards.subsample]
else:
return subsampling_settings
subsampling_settings = subsampling_settings[wildcards.subsample]

# If users have supplied both `max_sequences` and `seq_per_group`, we
# throw an error instead of assuming the user prefers one setting over
# another by default.
if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. If you didn't define both of these settings, this conflict could be caused by using the same subsampling scheme name as a default scheme. In this case, rename your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}') and run the workflow again.")

# If users have supplied neither `max_sequences` nor `seq_per_group`, we
# throw an error because the subsampling rule will still group by one or
# more fields and the lack of limits on this grouping could produce
# unexpected behavior.
if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"):
raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.")

return subsampling_settings


def get_priorities(wildcards):
Expand Down Expand Up @@ -317,8 +330,17 @@ def _get_specific_subsampling_setting(setting, optional=False):
# build's region, country, division, etc. as needed for subsampling.
build = config["builds"][wildcards.build_name]
value = value.format(**build)
else:
elif value is not None:
# If is 'seq_per_group' or 'max_sequences' build subsampling setting,
# need to return the 'argument' for augur
if setting == 'seq_per_group':
value = f"--sequences-per-group {value}"
elif setting == 'max_sequences':
value = f"--subsample-max-sequences {value}"

return value
else:
value = ""

# Check format strings that haven't been resolved.
if re.search(r'\{.+\}', value):
Expand All @@ -331,7 +353,15 @@ def _get_specific_subsampling_setting(setting, optional=False):
rule subsample:
message:
"""
Subsample all sequences into a {wildcards.subsample} set for build '{wildcards.build_name}' with {params.sequences_per_group} per {params.group_by}
Subsample all sequences by '{wildcards.subsample}' scheme for build '{wildcards.build_name}' with the following parameters:
- group by: {params.group_by}
- sequences per group: {params.sequences_per_group}
- subsample max sequences: {params.subsample_max_sequences}
- exclude: {params.exclude_argument}
- include: {params.include_argument}
- query: {params.query_argument}
- priority: {params.priority_argument}
"""
input:
sequences = rules.mask.output.alignment,
Expand All @@ -342,7 +372,8 @@ rule subsample:
sequences = "results/{build_name}/sample-{subsample}.fasta"
params:
group_by = _get_specific_subsampling_setting("group_by"),
sequences_per_group = _get_specific_subsampling_setting("seq_per_group"),
sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True),
subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True),
exclude_argument = _get_specific_subsampling_setting("exclude", optional=True),
include_argument = _get_specific_subsampling_setting("include", optional=True),
query_argument = _get_specific_subsampling_setting("query", optional=True),
Expand All @@ -359,7 +390,8 @@ rule subsample:
{params.query_argument} \
{params.priority_argument} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
{params.sequences_per_group} \
{params.subsample_max_sequences} \
--output {output.sequences} 2>&1 | tee {log}
"""

Expand Down

0 comments on commit d69d324

Please sign in to comment.