Merge pull request #485 from nextstrain/allow_max_seqs_arg

Allow use of `--subsample-max-sequences` in `filter`
nextstrain · Sep 15, 2020 · d69d324 · d69d324
2 parents 67d0fd4 + 5a44a97
commit d69d324
Show file tree

Hide file tree

Showing 7 changed files with 138 additions and 24 deletions.
diff --git a/Snakefile b/Snakefile
@@ -1,11 +1,58 @@
+import copy
 from os import environ
 from socket import getfqdn
 from getpass import getuser
+from snakemake.logging import logger
 from snakemake.utils import validate
+import time
+
+# Store the user's configuration prior to loading defaults, so we can check for
+# reused subsampling scheme names in the user's config. We need to make a deep
+# copy because Snakemake will deep merge the subsampling dictionary later,
+# modifying the values of a reference or shallow copy. Note that this loading of
+# the user's config prior to the defaults below depends on the order Snakemake
+# loads its configfiles. Specifically, the order of config loading is:
+#
+# 1. First, configfile arguments are loaded and config is built from these [1].
+# 2. Then, config arguments are loaded and override existing config keys [2].
+# 3. Then, the Snakefile is parsed and configfile directive inside the Snakefile is processed [3].
+#    When configfile is loaded from the directive in the Snakefile, the config
+#    dictionary is deep merged with the files [4] from the externally provided
+#    config files. This is the only place the deep merge happens using the
+#    update_config function [5].
+#
+# [1] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L466-L471
+# [2] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L476-L477
+# [3] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L551-L553
+# [4] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/workflow.py#L1088-L1094
+# [5] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/utils.py#L455-L476
+user_subsampling = copy.deepcopy(config.get("subsampling", {}))
 
 configfile: "defaults/parameters.yaml"
 validate(config, schema="workflow/schemas/config.schema.yaml")
 
+# Check for overlapping subsampling schemes in user and default
+# configurations. For now, issue a deprecation warning, so users know they
+# should rename their subsampling schemes. In the future, this reuse of the same
+# name will cause an error.
+subsampling_config = config.get("subsampling", {})
+overlapping_schemes = []
+for scheme_name, scheme in user_subsampling.items():
+    if scheme_name in subsampling_config and subsampling_config.get(scheme_name) != scheme:
+        overlapping_schemes.append(scheme_name)
+
+if len(overlapping_schemes) > 0:
+    logger.warning(f"WARNING: The following subsampling scheme(s) have the same name as a default scheme in this workflow but different definitions:")
+    logger.warning("")
+    for scheme in overlapping_schemes:
+        logger.warning(f"  - {scheme}")
+    logger.warning("")
+    logger.warning("  This means Snakemake will merge your scheme with the default scheme and may produce unexpected behavior.")
+    logger.warning(f"  To avoid errors in your workflow, rename your schemes with unique names (e.g., 'custom_{overlapping_schemes[0]}')")
+    logger.warning("  In future versions of this workflow, overlapping subsampling scheme names will produce an error.")
+    logger.warning("")
+    time.sleep(5)
+
 # default build if none specified in config
 if "builds" not in config:
     config["builds"] = {

diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md
@@ -62,10 +62,45 @@ We implement hierarchical subsampling by producing multiple samples at different
 A build can specify any number of such samples which can be flexibly restricted to particular meta data fields and subsampled from groups with particular properties.
 When specifying subsampling in this way, we'll first take sequences from the 'focal' area, and the select samples from other geographical areas.
 Read further for information on how we select these samples.
+Here, we'll look at the advanced example (`./my_profiles/example_advanced_customization`) file to explain some of the options.
 
-In this example, we'll look at a subsampling scheme which defines a `canton`.
+When specifying how many sequences you want in a subsampling level (for example, from a country or a region), you can do this using either `seq_per_group` or `max_sequences` - these work with the `group_by` argument.
+For example, `switzerland` subsampling rules in the advanced example looks like this:
+```yaml
+switzerland:
+  # Focal samples for country
+  country:
+    group_by: "division year month"
+    max_sequences: 1500
+    exclude: "--exclude-where 'country!={country}'"
+  # Contextual samples from country's region
+  region:
+    group_by: "country year month"
+    seq_per_group: 20
+    exclude: "--exclude-where 'country={country}' 'region!={region}'"
+    priorities:
+      type: "proximity"
+      focus: "country"
+  # Contextual samples from the rest of the world,
+  # excluding the current region to avoid resampling.
+  global:
+    group_by: "country year month"
+    seq_per_group: 10
+    exclude: "--exclude-where 'region={region}'"
+    priorities:
+      type: "proximity"
+      focus: "country"
+```
+
+For `country`-level sampling above, we specify that we want a maximum of 1,500 sequences from the country in question (here, Switzerland).
+Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total.
+
+Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20.
+This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken).
+
+Now we'll look at a subsampling scheme which defines a multi-`canton` build.
 Cantons are regional divisions in Switzerland - below 'country,' but above 'location' (often city-level).
-In the advanced example (`./my_profiles/example_advanced_customization`), we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world.
+In the advanced example, we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world.
 
 For cantons this looks like this:
 ```yaml

diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml
@@ -16,7 +16,7 @@ builds:
   # with a build name that will produce the following URL fragment on Nextstrain/auspice:
   # /ncov/north-america/usa/washington/king-county
   north-america_usa_washington_king-county: # name of the build; this can be anything
-    subsampling_scheme: county # use a custom subsampling scheme defined below
+    subsampling_scheme: custom-county # use a custom subsampling scheme defined below
     region: North America
     country: USA
     division: Washington
@@ -60,7 +60,7 @@ builds:
 
 # Define custom subsampling logic for county-level builds.
 subsampling:
-  county:
+  custom-county:
     # Focal samples for location
     focal:
       group_by: "year month"

diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml
@@ -38,7 +38,7 @@ subsampling:
     # Focal samples for country
     country:
       group_by: "division year month"
-      seq_per_group: 200
+      max_sequences: 1500
       exclude: "--exclude-where 'country!={country}'"
     # Contextual samples from country's region
     region:
@@ -93,7 +93,7 @@ subsampling:
         focus: "division"
 
   # This build will take from 3 cantons - we have a sample rule for each,
-  # rather than just one division that's focal build 
+  # rather than just one division that's focal build
   lac-leman:
     # focal samples
     geneva:

diff --git a/nextstrain_profiles/nextstrain/builds.yaml b/nextstrain_profiles/nextstrain/builds.yaml
@@ -8,30 +8,30 @@ custom_rules:
 # (They override the defaults)
 builds:
   global:
-    subsampling_scheme: region_global
+    subsampling_scheme: nextstrain_region_global
     auspice_config: "nextstrain_profiles/nextstrain/global_auspice_config.json"
   africa:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: Africa
     auspice_config: "nextstrain_profiles/nextstrain/africa_auspice_config.json"
   asia:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: Asia
     auspice_config: "nextstrain_profiles/nextstrain/asia_auspice_config.json"
   europe:
-    subsampling_scheme: region_grouped_by_country
+    subsampling_scheme: nextstrain_region_grouped_by_country
     region: Europe
     auspice_config: "nextstrain_profiles/nextstrain/europe_auspice_config.json"
   north-america:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: North America
     auspice_config: "nextstrain_profiles/nextstrain/north-america_auspice_config.json"
   oceania:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: Oceania
     auspice_config: "nextstrain_profiles/nextstrain/oceania_auspice_config.json"
   south-america:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: South America
     auspice_config: "nextstrain_profiles/nextstrain/south-america_auspice_config.json"
 
@@ -105,7 +105,7 @@ slack_channel: "#ncov-gisaid-updates"
 
 subsampling:
   # Custom subsampling logic for regions
-  region:
+  nextstrain_region:
     # Focal samples for region
     region:
       group_by: "division year month"
@@ -121,14 +121,14 @@ subsampling:
         focus: "region"
 
   # Custom subsampling logic for global region.
-  region_global:
+  nextstrain_region_global:
     global:
       group_by: "country year month"
       seq_per_group: 16
 
   # Custom subsampling for regions like Europe where grouping by country
   # is the smallest resolution requied
-  region_grouped_by_country:
+  nextstrain_region_grouped_by_country:
     # Focal samples for region
     region:
       group_by: "country year month"

diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
@@ -19,6 +19,6 @@ dependencies:
 - pip
 - pip:
   - awscli==1.18.45
-  - nextstrain-augur==9.0.0
+  - nextstrain-augur==10.0.2
   - nextstrain-cli==1.16.5
   - rethinkdb==2.3.0.post6
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -281,9 +281,22 @@ def _get_subsampling_settings(wildcards):
     subsampling_settings = config["subsampling"][subsampling_scheme]
 
     if hasattr(wildcards, "subsample"):
-        return subsampling_settings[wildcards.subsample]
-    else:
-        return subsampling_settings
+        subsampling_settings = subsampling_settings[wildcards.subsample]
+
+        # If users have supplied both `max_sequences` and `seq_per_group`, we
+        # throw an error instead of assuming the user prefers one setting over
+        # another by default.
+        if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
+            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. If you didn't define both of these settings, this conflict could be caused by using the same subsampling scheme name as a default scheme. In this case, rename your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}') and run the workflow again.")
+
+        # If users have supplied neither `max_sequences` nor `seq_per_group`, we
+        # throw an error because the subsampling rule will still group by one or
+        # more fields and the lack of limits on this grouping could produce
+        # unexpected behavior.
+        if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"):
+            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.")
+
+    return subsampling_settings
 
 
 def get_priorities(wildcards):
@@ -317,8 +330,17 @@ def _get_specific_subsampling_setting(setting, optional=False):
             # build's region, country, division, etc. as needed for subsampling.
             build = config["builds"][wildcards.build_name]
             value = value.format(**build)
-        else:
+        elif value is not None:
+            # If is 'seq_per_group' or 'max_sequences' build subsampling setting,
+            # need to return the 'argument' for augur
+            if setting == 'seq_per_group':
+                value = f"--sequences-per-group {value}"
+            elif setting == 'max_sequences':
+                value = f"--subsample-max-sequences {value}"
+
             return value
+        else:
+            value = ""
 
         # Check format strings that haven't been resolved.
         if re.search(r'\{.+\}', value):
@@ -331,7 +353,15 @@ def _get_specific_subsampling_setting(setting, optional=False):
 rule subsample:
     message:
         """
-        Subsample all sequences into a {wildcards.subsample} set for build '{wildcards.build_name}' with {params.sequences_per_group} per {params.group_by}
+        Subsample all sequences by '{wildcards.subsample}' scheme for build '{wildcards.build_name}' with the following parameters:
+
+         - group by: {params.group_by}
+         - sequences per group: {params.sequences_per_group}
+         - subsample max sequences: {params.subsample_max_sequences}
+         - exclude: {params.exclude_argument}
+         - include: {params.include_argument}
+         - query: {params.query_argument}
+         - priority: {params.priority_argument}
         """
     input:
         sequences = rules.mask.output.alignment,
@@ -342,7 +372,8 @@ rule subsample:
         sequences = "results/{build_name}/sample-{subsample}.fasta"
     params:
         group_by = _get_specific_subsampling_setting("group_by"),
-        sequences_per_group = _get_specific_subsampling_setting("seq_per_group"),
+        sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True),
+        subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True),
         exclude_argument = _get_specific_subsampling_setting("exclude", optional=True),
         include_argument = _get_specific_subsampling_setting("include", optional=True),
         query_argument = _get_specific_subsampling_setting("query", optional=True),
@@ -359,7 +390,8 @@ rule subsample:
             {params.query_argument} \
             {params.priority_argument} \
             --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
+            {params.sequences_per_group} \
+            {params.subsample_max_sequences} \
             --output {output.sequences} 2>&1 | tee {log}
         """