From ad980d4304445a1c0329915678aabefb6767ef69 Mon Sep 17 00:00:00 2001
From: Emma Hodcroft <emmahodcroft@gmail.com>
Date: Fri, 21 Aug 2020 16:44:32 +0200
Subject: [PATCH 01/21] allow use of --subsample-max-sequences filter arg

---
 workflow/snakemake_rules/main_workflow.smk | 23 +++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 260812226..f52ae25a2 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -281,9 +281,14 @@ def _get_subsampling_settings(wildcards):
     subsampling_settings = config["subsampling"][subsampling_scheme]
 
     if hasattr(wildcards, "subsample"):
-        return subsampling_settings[wildcards.subsample]
-    else:
-        return subsampling_settings
+        subsampling_settings = subsampling_settings[wildcards.subsample]
+    
+    # If users have supplied 'max_sequences' argument, assume they wish to override
+    # our default 'seq_per_group' from 'parameters.yaml' - so remove the 'seq_per_group'
+    if 'max_sequences' in subsampling_settings and 'seq_per_group' in subsampling_settings:
+        subsampling_settings.pop('seq_per_group')
+
+    return subsampling_settings
 
 
 def get_priorities(wildcards):
@@ -318,6 +323,12 @@ def _get_specific_subsampling_setting(setting, optional=False):
             build = config["builds"][wildcards.build_name]
             value = value.format(**build)
         else:
+            # If is 'seq_per_group' or 'max_sequences' build subsampling setting,
+            # need to return the 'argument' for augur
+            if setting == 'seq_per_group':
+                value = "--sequences-per-group " + str(value)
+            elif setting == 'max_sequences':
+                value = "--subsample-max-sequences " + str(value)
             return value
 
         # Check format strings that haven't been resolved.
@@ -342,7 +353,8 @@ rule subsample:
         sequences = "results/{build_name}/sample-{subsample}.fasta"
     params:
         group_by = _get_specific_subsampling_setting("group_by"),
-        sequences_per_group = _get_specific_subsampling_setting("seq_per_group"),
+        sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True),
+        subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True),
         exclude_argument = _get_specific_subsampling_setting("exclude", optional=True),
         include_argument = _get_specific_subsampling_setting("include", optional=True),
         query_argument = _get_specific_subsampling_setting("query", optional=True),
@@ -359,7 +371,8 @@ rule subsample:
             {params.query_argument} \
             {params.priority_argument} \
             --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
+            {params.sequences_per_group} \
+            {params.subsample_max_sequences} \
             --output {output.sequences} 2>&1 | tee {log}
         """
 

From a4875dd30f0c83ba57a5095387e3601b67900140 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 31 Aug 2020 19:38:23 -0700
Subject: [PATCH 02/21] Require augur 10.0.0 in conda environments

Adds support for augur filter's max sequences argument introduced in version
10.0.0 and used by the ncov workflow.
---
 workflow/envs/nextstrain.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
index 7ac5e3704..640a525cb 100644
--- a/workflow/envs/nextstrain.yaml
+++ b/workflow/envs/nextstrain.yaml
@@ -19,6 +19,6 @@ dependencies:
 - pip
 - pip:
   - awscli==1.18.45
-  - nextstrain-augur==9.0.0
+  - nextstrain-augur==10.0.0
   - nextstrain-cli==1.16.5
   - rethinkdb==2.3.0.post6

From 76234e0cd7b5f80cb820a71f0aceaeeaf63e3dce Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 31 Aug 2020 20:08:10 -0700
Subject: [PATCH 03/21] Throw errors for improperly defined mutually
 exclusive/required args

Produces an informative error message when the user either:
 - defines both `sequences_per_group` and `subsample_max_sequences`
 - defines neither `sequences_per_group` nor `subsample_max_sequences`

The first error addresses the mutually exclusive nature of the two arguments and
alerts the user that they need to change a specific subsampling scheme to fix
the issue.

The second error addresses the required natures of the arguments and alerts the
user that they need to define one of the two subsampling limits.
---
 workflow/snakemake_rules/main_workflow.smk | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index f52ae25a2..99c212028 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -282,11 +282,19 @@ def _get_subsampling_settings(wildcards):
 
     if hasattr(wildcards, "subsample"):
         subsampling_settings = subsampling_settings[wildcards.subsample]
-    
-    # If users have supplied 'max_sequences' argument, assume they wish to override
-    # our default 'seq_per_group' from 'parameters.yaml' - so remove the 'seq_per_group'
-    if 'max_sequences' in subsampling_settings and 'seq_per_group' in subsampling_settings:
-        subsampling_settings.pop('seq_per_group')
+
+    # If users have supplied both `max_sequences` and `seq_per_group`, we throw
+    # an error instead of assuming the user prefers one setting over another by
+    # default.
+    if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
+        raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive.")
+
+    # If users have supplied neither `max_sequences` nor `seq_per_group`, we
+    # throw an error because the subsampling rule will still group by one or
+    # more fields and the lack of limits on this grouping could produce
+    # unexpected behavior.
+    if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"):
+        raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.")
 
     return subsampling_settings
 

From e5d401896957374fcdca0a6fd582ec99ca54f5a4 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 31 Aug 2020 20:11:14 -0700
Subject: [PATCH 04/21] Allow users to define empty keys in subsampling schemes

Checks whether subsampling values are explicitly undefined (equal to None) and
only attempts to process those values that are not None. If a None value is
passed, we convert it to an empty string, so it does not appear in the
subsampling shell command.

These changes allow users to explicitly state that they do not define a specific
subsampling key. This functionality is important because of the way Snakemake's
config data structures are deep merged. It is possible for the default values of
a subsampling scheme to conflict with a user's own custom subsampling scheme of
the same name. To avoid conflicts of mutually exclusive keys, users can override
the default keys with empty values.
---
 workflow/snakemake_rules/main_workflow.smk | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 99c212028..5ce12e200 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -330,14 +330,17 @@ def _get_specific_subsampling_setting(setting, optional=False):
             # build's region, country, division, etc. as needed for subsampling.
             build = config["builds"][wildcards.build_name]
             value = value.format(**build)
-        else:
+        elif value is not None:
             # If is 'seq_per_group' or 'max_sequences' build subsampling setting,
             # need to return the 'argument' for augur
             if setting == 'seq_per_group':
                 value = "--sequences-per-group " + str(value)
             elif setting == 'max_sequences':
                 value = "--subsample-max-sequences " + str(value)
+
             return value
+        else:
+            value = ""
 
         # Check format strings that haven't been resolved.
         if re.search(r'\{.+\}', value):

From 737513dba2f39a02f079b87b9eca96de38663752 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 31 Aug 2020 20:21:05 -0700
Subject: [PATCH 05/21] Explicitly define max_sequence entries to surface
 mutual exclusion

Adds an empty `max_sequences` key to our default subsampling rules and adds
comments to the first pair of keys in the default YAMLs to describe their
required and mutually exclusive behavior.
---
 defaults/parameters.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
index 7731d9376..e89f2292c 100644
--- a/defaults/parameters.yaml
+++ b/defaults/parameters.yaml
@@ -122,12 +122,20 @@ subsampling:
     # Focal samples for region
     region:
       group_by: "division year month"
+      # augur filter's `seq_per_group` and `max_sequences` are mutually
+      # exclusive, but one of these arguments is required for each subsampling
+      # scheme. To override these defaults in your own subsampling schemes of
+      # the same name, provide a value for the limit you want to use and an
+      # empty entry for the limit you don't use. The entry below demonstrates
+      # this approach.
       seq_per_group: 48
+      max_sequences:
       exclude: "--exclude-where 'region!={region}'"
     # Contextual samples for region from the rest of the world
     global:
       group_by: "country year month"
       seq_per_group: 4
+      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -138,6 +146,7 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 20
+      max_sequences:
 
   # Custom subsampling logic for regions like Europe where grouping by country
   # is the smallest resolution required.
@@ -146,11 +155,13 @@ subsampling:
     region:
       group_by: "country year month"
       seq_per_group: 64
+      max_sequences:
       exclude: "--exclude-where 'region!={region}'"
     # Contextual samples for region from the rest of the world
     global:
       group_by: "country year month"
       seq_per_group: 6
+      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -162,11 +173,13 @@ subsampling:
     country:
       group_by: "division year month"
       seq_per_group: 200
+      max_sequences:
       exclude: "--exclude-where 'country!={country}'"
     # Contextual samples from country's region
     region:
       group_by: "country year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'country={country}' 'region!={region}'"
       priorities:
         type: "proximity"
@@ -176,6 +189,7 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -187,11 +201,13 @@ subsampling:
     division:
       group_by: "year month"
       seq_per_group: 300
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'"
     # Contextual samples from division's country
     country:
       group_by: "division year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'"
       priorities:
         type: "proximity"
@@ -200,6 +216,7 @@ subsampling:
     region:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country={country}'"
       priorities:
         type: "proximity"
@@ -209,6 +226,7 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 5
+      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -220,11 +238,13 @@ subsampling:
     focal:
       group_by: "year month"
       seq_per_group: 300
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}' 'location!={location}'"
     # Samples that are genetically related to the focal samples
     related:
       group_by: "country year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'location={location}'"
       priorities:
         type: "proximity"
@@ -233,4 +253,5 @@ subsampling:
     contextual:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'location={location}'"

From 5440eb482d3f5a929bd422e47bfe638888a56591 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 31 Aug 2020 20:32:05 -0700
Subject: [PATCH 06/21] Update the message for subsample rule to reflect
 different arguments

Clarifies that the subsample wildcard refers to a specific named subsampling
scheme and explicitly lists all of the parameters used by a given subsampling
rule. This approach errs on the side of verbosity to provide a better sense of
what each subsampling rule is actually doing than looking at the augur filter
command itself.
---
 workflow/snakemake_rules/main_workflow.smk | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 5ce12e200..ea9fa4aac 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -353,7 +353,15 @@ def _get_specific_subsampling_setting(setting, optional=False):
 rule subsample:
     message:
         """
-        Subsample all sequences into a {wildcards.subsample} set for build '{wildcards.build_name}' with {params.sequences_per_group} per {params.group_by}
+        Subsample all sequences by '{wildcards.subsample}' scheme for build '{wildcards.build_name}' with the following parameters:
+
+         - group by: {params.group_by}
+         - sequences per group: {params.sequences_per_group}
+         - subsample max sequences: {params.subsample_max_sequences}
+         - exclude: {params.exclude_argument}
+         - include: {params.include_argument}
+         - query: {params.query_argument}
+         - priority: {params.priority_argument}
         """
     input:
         sequences = rules.mask.output.alignment,

From 3c9a4d55c99f500752378f437d7d9de869a69ed6 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 31 Aug 2020 20:35:02 -0700
Subject: [PATCH 07/21] Use f-strings for subsampling limits instead of str
 function

Improves readability of code, generally, by using f-strings.
---
 workflow/snakemake_rules/main_workflow.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index ea9fa4aac..e58b1c75a 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -334,9 +334,9 @@ def _get_specific_subsampling_setting(setting, optional=False):
             # If is 'seq_per_group' or 'max_sequences' build subsampling setting,
             # need to return the 'argument' for augur
             if setting == 'seq_per_group':
-                value = "--sequences-per-group " + str(value)
+                value = f"--sequences-per-group {value}"
             elif setting == 'max_sequences':
-                value = "--subsample-max-sequences " + str(value)
+                value = f"--subsample-max-sequences {value}"
 
             return value
         else:

From de865bc010cd0aa78c1258fe72665e63610fdfce Mon Sep 17 00:00:00 2001
From: Emma Hodcroft <emmahodcroft@gmail.com>
Date: Tue, 1 Sep 2020 14:14:23 +0200
Subject: [PATCH 08/21] add max_sequences empty to example build files

---
 my_profiles/example/builds.yaml               |  9 +++++++++
 .../builds.yaml                               | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml
index 8730a2cd9..916f6aa84 100644
--- a/my_profiles/example/builds.yaml
+++ b/my_profiles/example/builds.yaml
@@ -64,7 +64,14 @@ subsampling:
     # Focal samples for location
     focal:
       group_by: "year month"
+      # augur filter's `seq_per_group` and `max_sequences` are mutually
+      # exclusive, but one of these arguments is required for each subsampling
+      # scheme. To override these defaults in your own subsampling schemes of
+      # the same name, provide a value for the limit you want to use and an
+      # empty entry for the limit you don't use. The entry below demonstrates
+      # this approach.
       seq_per_group: 300
+      max_sequences:
       # Use augur filter's query argument to filter strains with pandas-style logic expressions.
       # This can be easier to read than the corresponding filter by exclusion.
       query: --query "(country == '{country}') & (division == '{division}') & (location == '{location}')"
@@ -73,6 +80,7 @@ subsampling:
     related:
       group_by: "country year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'location={location}'"
       priorities:
         type: "proximity"
@@ -82,6 +90,7 @@ subsampling:
     contextual:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'location={location}'"
 
 # Here, you can specify what type of auspice_config you want to use
diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml
index 76fceb79f..96dc7ab95 100644
--- a/my_profiles/example_advanced_customization/builds.yaml
+++ b/my_profiles/example_advanced_customization/builds.yaml
@@ -34,16 +34,24 @@ builds:
 
 subsampling:
   # Here we use the default subsampling logic for countries
+  # augur filter's `seq_per_group` and `max_sequences` are mutually
+  # exclusive, but one of these arguments is required for each subsampling
+  # scheme. To override these defaults in your own subsampling schemes of
+  # the same name, provide a value for the limit you want to use and an
+  # empty entry for the limit you don't use. The entry below demonstrates
+  # this approach.
   switzerland:
     # Focal samples for country
     country:
       group_by: "division year month"
       seq_per_group: 200
+      max_sequences:
       exclude: "--exclude-where 'country!={country}'"
     # Contextual samples from country's region
     region:
       group_by: "country year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'country={country}' 'region!={region}'"
       priorities:
         type: "proximity"
@@ -53,6 +61,7 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -65,11 +74,13 @@ subsampling:
     division:
       group_by: "year month"
       seq_per_group: 300
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'"
     # Contextual samples from division's country
     country:
       group_by: "division year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'"
       priorities:
         type: "proximity"
@@ -78,6 +89,7 @@ subsampling:
     region:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country={country}'"
       priorities:
         type: "proximity"
@@ -87,6 +99,7 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 5
+      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -99,26 +112,31 @@ subsampling:
     geneva:
       group_by: "year month"
       seq_per_group: 300
+      max_sequences:
       exclude: "--exclude-where 'division!=geneva'"
     vaud:
       group_by: "year month"
       seq_per_group: 300
+      max_sequences:
       exclude: "--exclude-where 'division!=vaud'"
     valais:
       group_by: "year month"
       seq_per_group: 300
+      max_sequences:
       exclude: "--exclude-where 'division!=valais'"
 
     # Contextual samples from the country
     country:
       group_by: "division year month"
       seq_per_group: 20
+      max_sequences:
       exclude: "--exclude-where 'country!=switzerland'"
 
     # Contextual samples from division's region
     region:
       group_by: "country year month"
       seq_per_group: 10
+      max_sequences:
       exclude: "--exclude-where 'region!=europe'"
       priorities:
         type: "proximity"
@@ -128,6 +146,7 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 5
+      max_sequences:
       exclude: "--exclude-where 'region=europe'"
       priorities:
         type: "proximity"

From 6c353dcf4f60ab4197000496b297b2b67acba98d Mon Sep 17 00:00:00 2001
From: Emma Hodcroft <emmahodcroft@gmail.com>
Date: Tue, 1 Sep 2020 14:26:54 +0200
Subject: [PATCH 09/21] add max_sequences example to file & docs

---
 docs/customizing-analysis.md                  | 50 ++++++++++++++++++-
 .../builds.yaml                               |  4 +-
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md
index 219072755..5b7e3c10f 100644
--- a/docs/customizing-analysis.md
+++ b/docs/customizing-analysis.md
@@ -62,10 +62,50 @@ We implement hierarchical subsampling by producing multiple samples at different
 A build can specify any number of such samples which can be flexibly restricted to particular meta data fields and subsampled from groups with particular properties.
 When specifying subsampling in this way, we'll first take sequences from the 'focal' area, and the select samples from other geographical areas.
 Read further for information on how we select these samples.
+Here, we'll look at the advanced example (`./my_profiles/example_advanced_customization`) file to explain some of the options.
 
-In this example, we'll look at a subsampling scheme which defines a `canton`.
+When specifying how many sequences you want in a subsampling level (for example, from a country or a region), you can do this using either `seq_per_group` or `max_sequences` - these work with the `group_by` argument.
+For example, `switzerland` subsampling rules in the advanced example looks like this:
+```yaml
+switzerland:
+  # Focal samples for country
+  country:
+    group_by: "division year month"
+    seq_per_group:
+    max_sequences: 1500
+    exclude: "--exclude-where 'country!={country}'"
+  # Contextual samples from country's region
+  region:
+    group_by: "country year month"
+    seq_per_group: 20
+    max_sequences:
+    exclude: "--exclude-where 'country={country}' 'region!={region}'"
+    priorities:
+      type: "proximity"
+      focus: "country"
+  # Contextual samples from the rest of the world,
+  # excluding the current region to avoid resampling.
+  global:
+    group_by: "country year month"
+    seq_per_group: 10
+    max_sequences:
+    exclude: "--exclude-where 'region={region}'"
+    priorities:
+      type: "proximity"
+      focus: "country"
+```
+
+For `country`-level sampling above, we specify that we want a maximum of 1,500 sequences from the country in question (here, Switzerland).
+Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total.
+
+Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20.
+This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20, all will be taken). 
+
+Since `max_sequences` and `seq_per_group` are mutually exclusive, you should supply a blank entry for whichever you don't want to use, as shown above.
+
+Now we'll look at a subsampling scheme which defines a multi-`canton` build.
 Cantons are regional divisions in Switzerland - below 'country,' but above 'location' (often city-level).
-In the advanced example (`./my_profiles/example_advanced_customization`), we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world.
+In the advanced example, we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world.
 
 For cantons this looks like this:
 ```yaml
@@ -76,26 +116,31 @@ lac-leman:
   geneva:
     group_by: "year month"
     seq_per_group: 300
+    max_sequences:
     exclude: "--exclude-where 'division!=geneva'"
   vaud:
     group_by: "year month"
     seq_per_group: 300
+    max_sequences:
     exclude: "--exclude-where 'division!=vaud'"
   valais:
     group_by: "year month"
     seq_per_group: 300
+    max_sequences:
     exclude: "--exclude-where 'division!=valais'"
 
   # Contextual samples from the country
   country:
     group_by: "division year month"
     seq_per_group: 20
+    max_sequences:
     exclude: "--exclude-where 'country!=switzerland'"
 
   # Contextual samples from division's region
   region:
     group_by: "country year month"
     seq_per_group: 10
+    max_sequences:
     exclude: "--exclude-where 'region!=europe'"
     priorities:
       type: "proximity"
@@ -105,6 +150,7 @@ lac-leman:
   global:
     group_by: "country year month"
     seq_per_group: 5
+    max_sequences:
     exclude: "--exclude-where 'region=europe'"
     priorities:
       type: "proximity"
diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml
index 96dc7ab95..c914a1e6f 100644
--- a/my_profiles/example_advanced_customization/builds.yaml
+++ b/my_profiles/example_advanced_customization/builds.yaml
@@ -44,8 +44,8 @@ subsampling:
     # Focal samples for country
     country:
       group_by: "division year month"
-      seq_per_group: 200
-      max_sequences:
+      seq_per_group:
+      max_sequences: 1500
       exclude: "--exclude-where 'country!={country}'"
     # Contextual samples from country's region
     region:

From 6a6513e7ea0c4da3a38e85680536c2b04a3c66d3 Mon Sep 17 00:00:00 2001
From: Emma Hodcroft <emmahodcroft@gmail.com>
Date: Tue, 1 Sep 2020 16:41:30 +0200
Subject: [PATCH 10/21] give instructions in error for both max_seqs & seq_per

---
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index e58b1c75a..6e9a523fd 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -287,7 +287,7 @@ def _get_subsampling_settings(wildcards):
     # an error instead of assuming the user prefers one setting over another by
     # default.
     if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
-        raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive.")
+        raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.")
 
     # If users have supplied neither `max_sequences` nor `seq_per_group`, we
     # throw an error because the subsampling rule will still group by one or

From 3b61c0923ee3a3aab9d71be9562ee84dcc921bee Mon Sep 17 00:00:00 2001
From: Emma Hodcroft <emmahodcroft@gmail.com>
Date: Tue, 1 Sep 2020 21:07:39 +0200
Subject: [PATCH 11/21] Update docs/customizing-analysis.md

Co-authored-by: Tony Tung <ttung@chanzuckerberg.com>
---
 docs/customizing-analysis.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md
index 5b7e3c10f..c452f3c75 100644
--- a/docs/customizing-analysis.md
+++ b/docs/customizing-analysis.md
@@ -99,7 +99,7 @@ For `country`-level sampling above, we specify that we want a maximum of 1,500 s
 Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total.
 
 Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20.
-This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20, all will be taken). 
+This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken). 
 
 Since `max_sequences` and `seq_per_group` are mutually exclusive, you should supply a blank entry for whichever you don't want to use, as shown above.
 

From 6bab63646a81938b9709230db3a9265fea0367df Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Wed, 2 Sep 2020 09:03:55 -0700
Subject: [PATCH 12/21] Validate settings for a specific subsampling set when
 it's requested

The `_get_subsampling_settings` function returns either a complete subsampling
scheme (consistently of one or more sets of subsampling rules) or a specific
subsampling set. This return value depends on the presence of a `subsample`
wildcard. We only want to apply validation checks to the subsampling sets when
they have been specifically requested. This commit indents the original
validation logic by one level such that it only gets called when the current
Snakemake job has a `subsample` wildcard.
---
 workflow/snakemake_rules/main_workflow.smk | 24 +++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 6e9a523fd..8a5d823d0 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -283,18 +283,18 @@ def _get_subsampling_settings(wildcards):
     if hasattr(wildcards, "subsample"):
         subsampling_settings = subsampling_settings[wildcards.subsample]
 
-    # If users have supplied both `max_sequences` and `seq_per_group`, we throw
-    # an error instead of assuming the user prefers one setting over another by
-    # default.
-    if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
-        raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.")
-
-    # If users have supplied neither `max_sequences` nor `seq_per_group`, we
-    # throw an error because the subsampling rule will still group by one or
-    # more fields and the lack of limits on this grouping could produce
-    # unexpected behavior.
-    if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"):
-        raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.")
+        # If users have supplied both `max_sequences` and `seq_per_group`, we
+        # throw an error instead of assuming the user prefers one setting over
+        # another by default.
+        if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
+            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.")
+
+        # If users have supplied neither `max_sequences` nor `seq_per_group`, we
+        # throw an error because the subsampling rule will still group by one or
+        # more fields and the lack of limits on this grouping could produce
+        # unexpected behavior.
+        if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"):
+            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.")
 
     return subsampling_settings
 

From 4a058156839dd2614aa7c69e183f832e09eebce8 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 8 Sep 2020 15:35:58 -0700
Subject: [PATCH 13/21] Warn users about conflicting subsampling schemes

Prints an informative warning when users have defined a subsampling scheme that
has the same name as one of the default schemes and different settings. The
warning lists all conflicting schemes, explains why the conflict is a problem,
provides a solution to avoid the conflicts, and alerts users that a future
release of the workflow will treat these conflicts as errors.

The use of the sleep function here is a workaround for the fact that Snakemake's
output will quickly scroll past any warning we provide. By sleeping for a few
seconds, users at least have a chance to see that there was a warning and can
scroll back to the read the whole message.
---
 Snakefile | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/Snakefile b/Snakefile
index a3ecfe07b..fa7ba894e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,11 +1,42 @@
+import copy
 from os import environ
 from socket import getfqdn
 from getpass import getuser
+from snakemake.logging import logger
 from snakemake.utils import validate
+import time
+
+# Store the user's configuration prior to loading defaults, so we can check for
+# reused subsampling scheme names in the user's config. We need to make a deep
+# copy because Snakemake will deep merge the subsampling dictionary later,
+# modifying the values of a reference or shallow copy.
+user_subsampling = copy.deepcopy(config.get("subsampling", {}))
 
 configfile: "defaults/parameters.yaml"
 validate(config, schema="workflow/schemas/config.schema.yaml")
 
+# Check for overlapping subsampling schemes in user and default
+# configurations. For now, issue a deprecation warning, so users know they
+# should rename their subsampling schemes. In the future, this reuse of the same
+# name will cause an error.
+subsampling_config = config.get("subsampling", {})
+overlapping_schemes = []
+for scheme_name, scheme in user_subsampling.items():
+    if scheme_name in subsampling_config and subsampling_config.get(scheme_name) != scheme:
+        overlapping_schemes.append(scheme_name)
+
+if len(overlapping_schemes) > 0:
+    logger.warning(f"WARNING: The following subsampling scheme(s) have the same name as a default scheme in this workflow but different definitions:")
+    logger.warning("")
+    for scheme in overlapping_schemes:
+        logger.warning(f"  - {scheme}")
+    logger.warning("")
+    logger.warning("  This means Snakemake will merge your scheme with the default scheme and may produce unexpected behavior.")
+    logger.warning(f"  To avoid errors in your workflow, rename your schemes with unique names (e.g., 'custom_{overlapping_schemes[0]}')")
+    logger.warning("  In future versions of this workflow, overlapping subsampling scheme names will produce an error.")
+    logger.warning("")
+    time.sleep(5)
+
 # default build if none specified in config
 if "builds" not in config:
     config["builds"] = {

From 538376e05316648c42f6bdd1c3a1b2d221bd1b0f Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 11 Sep 2020 14:03:21 -0700
Subject: [PATCH 14/21] Require latest version of augur (10.0.2) with BioPython
 bug fixes

Since we are upgrading augur to 10.*.* for this PR anyway, we might as well use
the latest version that includes bug fixes.
---
 workflow/envs/nextstrain.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
index 640a525cb..dd19dadd4 100644
--- a/workflow/envs/nextstrain.yaml
+++ b/workflow/envs/nextstrain.yaml
@@ -19,6 +19,6 @@ dependencies:
 - pip
 - pip:
   - awscli==1.18.45
-  - nextstrain-augur==10.0.0
+  - nextstrain-augur==10.0.2
   - nextstrain-cli==1.16.5
   - rethinkdb==2.3.0.post6

From 78e29d6d58a8b5a7c8a8602a57ae0af8aed4e158 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 14 Sep 2020 12:43:26 -0700
Subject: [PATCH 15/21] Add documentation for how config loading order occurs

This documentation is for our own team when maintaining this code in the future
and trying to understand why it works. There is also a good chance that
Snakemake will change its implementation in a way that breaks our current
logic. These comments should at least help us track down the changes.
---
 Snakefile | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index fa7ba894e..016e8d352 100644
--- a/Snakefile
+++ b/Snakefile
@@ -9,7 +9,23 @@ import time
 # Store the user's configuration prior to loading defaults, so we can check for
 # reused subsampling scheme names in the user's config. We need to make a deep
 # copy because Snakemake will deep merge the subsampling dictionary later,
-# modifying the values of a reference or shallow copy.
+# modifying the values of a reference or shallow copy. Note that this loading of
+# the user's config prior to the defaults below depends on the order Snakemake
+# loads its configfiles. Specifically, the order of config loading is:
+#
+# 1. First, configfile arguments are loaded and config is built from these [1].
+# 2. Then, config arguments are loaded and override existing config keys [2].
+# 3. Then, the Snakefile is parsed and configfile directive inside the Snakefile is processed [3].
+#    When configfile is loaded from the directive in the Snakefile, the config
+#    dictionary is deep merged with the files [4] from the externally provided
+#    config files. This is the only place the deep merge happens using the
+#    update_config function [5].
+#
+# [1] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L466-L471
+# [2] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L476-L477
+# [3] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L551-L553
+# [4] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/workflow.py#L1088-L1094
+# [5] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/utils.py#L455-L476
 user_subsampling = copy.deepcopy(config.get("subsampling", {}))
 
 configfile: "defaults/parameters.yaml"

From 01005591079c7ab7ea00daf892daea5c41f7948e Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 14 Sep 2020 12:45:36 -0700
Subject: [PATCH 16/21] Remove unnecessary empty subsampling settings from
 example profiles

---
 defaults/parameters.yaml                      | 21 -------------------
 .../builds.yaml                               | 21 +------------------
 2 files changed, 1 insertion(+), 41 deletions(-)

diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
index e89f2292c..7731d9376 100644
--- a/defaults/parameters.yaml
+++ b/defaults/parameters.yaml
@@ -122,20 +122,12 @@ subsampling:
     # Focal samples for region
     region:
       group_by: "division year month"
-      # augur filter's `seq_per_group` and `max_sequences` are mutually
-      # exclusive, but one of these arguments is required for each subsampling
-      # scheme. To override these defaults in your own subsampling schemes of
-      # the same name, provide a value for the limit you want to use and an
-      # empty entry for the limit you don't use. The entry below demonstrates
-      # this approach.
       seq_per_group: 48
-      max_sequences:
       exclude: "--exclude-where 'region!={region}'"
     # Contextual samples for region from the rest of the world
     global:
       group_by: "country year month"
       seq_per_group: 4
-      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -146,7 +138,6 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 20
-      max_sequences:
 
   # Custom subsampling logic for regions like Europe where grouping by country
   # is the smallest resolution required.
@@ -155,13 +146,11 @@ subsampling:
     region:
       group_by: "country year month"
       seq_per_group: 64
-      max_sequences:
       exclude: "--exclude-where 'region!={region}'"
     # Contextual samples for region from the rest of the world
     global:
       group_by: "country year month"
       seq_per_group: 6
-      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -173,13 +162,11 @@ subsampling:
     country:
       group_by: "division year month"
       seq_per_group: 200
-      max_sequences:
       exclude: "--exclude-where 'country!={country}'"
     # Contextual samples from country's region
     region:
       group_by: "country year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'country={country}' 'region!={region}'"
       priorities:
         type: "proximity"
@@ -189,7 +176,6 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -201,13 +187,11 @@ subsampling:
     division:
       group_by: "year month"
       seq_per_group: 300
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'"
     # Contextual samples from division's country
     country:
       group_by: "division year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'"
       priorities:
         type: "proximity"
@@ -216,7 +200,6 @@ subsampling:
     region:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country={country}'"
       priorities:
         type: "proximity"
@@ -226,7 +209,6 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 5
-      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -238,13 +220,11 @@ subsampling:
     focal:
       group_by: "year month"
       seq_per_group: 300
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}' 'location!={location}'"
     # Samples that are genetically related to the focal samples
     related:
       group_by: "country year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'location={location}'"
       priorities:
         type: "proximity"
@@ -253,5 +233,4 @@ subsampling:
     contextual:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'location={location}'"
diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml
index c914a1e6f..f60669423 100644
--- a/my_profiles/example_advanced_customization/builds.yaml
+++ b/my_profiles/example_advanced_customization/builds.yaml
@@ -34,24 +34,16 @@ builds:
 
 subsampling:
   # Here we use the default subsampling logic for countries
-  # augur filter's `seq_per_group` and `max_sequences` are mutually
-  # exclusive, but one of these arguments is required for each subsampling
-  # scheme. To override these defaults in your own subsampling schemes of
-  # the same name, provide a value for the limit you want to use and an
-  # empty entry for the limit you don't use. The entry below demonstrates
-  # this approach.
   switzerland:
     # Focal samples for country
     country:
       group_by: "division year month"
-      seq_per_group:
       max_sequences: 1500
       exclude: "--exclude-where 'country!={country}'"
     # Contextual samples from country's region
     region:
       group_by: "country year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'country={country}' 'region!={region}'"
       priorities:
         type: "proximity"
@@ -61,7 +53,6 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
@@ -74,13 +65,11 @@ subsampling:
     division:
       group_by: "year month"
       seq_per_group: 300
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'"
     # Contextual samples from division's country
     country:
       group_by: "division year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'"
       priorities:
         type: "proximity"
@@ -89,7 +78,6 @@ subsampling:
     region:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'region!={region}' 'country={country}'"
       priorities:
         type: "proximity"
@@ -99,44 +87,38 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 5
-      max_sequences:
       exclude: "--exclude-where 'region={region}'"
       priorities:
         type: "proximity"
         focus: "division"
 
   # This build will take from 3 cantons - we have a sample rule for each,
-  # rather than just one division that's focal build 
+  # rather than just one division that's focal build
   lac-leman:
     # focal samples
     geneva:
       group_by: "year month"
       seq_per_group: 300
-      max_sequences:
       exclude: "--exclude-where 'division!=geneva'"
     vaud:
       group_by: "year month"
       seq_per_group: 300
-      max_sequences:
       exclude: "--exclude-where 'division!=vaud'"
     valais:
       group_by: "year month"
       seq_per_group: 300
-      max_sequences:
       exclude: "--exclude-where 'division!=valais'"
 
     # Contextual samples from the country
     country:
       group_by: "division year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'country!=switzerland'"
 
     # Contextual samples from division's region
     region:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'region!=europe'"
       priorities:
         type: "proximity"
@@ -146,7 +128,6 @@ subsampling:
     global:
       group_by: "country year month"
       seq_per_group: 5
-      max_sequences:
       exclude: "--exclude-where 'region=europe'"
       priorities:
         type: "proximity"

From 91f95b3f7d82f6d7a53bd32ba73793e4a4910d85 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 14 Sep 2020 12:46:15 -0700
Subject: [PATCH 17/21] Use a unique subsampling scheme name for the example
 profile

The original county-level subsampling scheme for the example profile used the
same name as the default "county" scheme we provide. This commit renames that
scheme to "custom-county" to avoid conflicts with the default schemes.
---
 my_profiles/example/builds.yaml | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml
index 916f6aa84..eea133e11 100644
--- a/my_profiles/example/builds.yaml
+++ b/my_profiles/example/builds.yaml
@@ -16,7 +16,7 @@ builds:
   # with a build name that will produce the following URL fragment on Nextstrain/auspice:
   # /ncov/north-america/usa/washington/king-county
   north-america_usa_washington_king-county: # name of the build; this can be anything
-    subsampling_scheme: county # use a custom subsampling scheme defined below
+    subsampling_scheme: custom-county # use a custom subsampling scheme defined below
     region: North America
     country: USA
     division: Washington
@@ -60,18 +60,11 @@ builds:
 
 # Define custom subsampling logic for county-level builds.
 subsampling:
-  county:
+  custom-county:
     # Focal samples for location
     focal:
       group_by: "year month"
-      # augur filter's `seq_per_group` and `max_sequences` are mutually
-      # exclusive, but one of these arguments is required for each subsampling
-      # scheme. To override these defaults in your own subsampling schemes of
-      # the same name, provide a value for the limit you want to use and an
-      # empty entry for the limit you don't use. The entry below demonstrates
-      # this approach.
       seq_per_group: 300
-      max_sequences:
       # Use augur filter's query argument to filter strains with pandas-style logic expressions.
       # This can be easier to read than the corresponding filter by exclusion.
       query: --query "(country == '{country}') & (division == '{division}') & (location == '{location}')"
@@ -80,7 +73,6 @@ subsampling:
     related:
       group_by: "country year month"
       seq_per_group: 20
-      max_sequences:
       exclude: "--exclude-where 'location={location}'"
       priorities:
         type: "proximity"
@@ -90,7 +82,6 @@ subsampling:
     contextual:
       group_by: "country year month"
       seq_per_group: 10
-      max_sequences:
       exclude: "--exclude-where 'location={location}'"
 
 # Here, you can specify what type of auspice_config you want to use

From 1f531cd6b11cebccadde3a6ff35fdebf1408cfa8 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 14 Sep 2020 12:48:03 -0700
Subject: [PATCH 18/21] Print a better explanation for how to resolve
 conflicting subsamples

Instead of recommending that users enter an empty subsampling setting when a
conflict occurs, we ask them to rename their conflicting subsampling scheme to
avoid any future conflicts and unexpected behavior.
---
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 8a5d823d0..829f3ece3 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -287,7 +287,7 @@ def _get_subsampling_settings(wildcards):
         # throw an error instead of assuming the user prefers one setting over
         # another by default.
         if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
-            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.")
+            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be caused by using the same subsampling scheme name as a default scheme. If so, try renaming your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}').")
 
         # If users have supplied neither `max_sequences` nor `seq_per_group`, we
         # throw an error because the subsampling rule will still group by one or

From ded0e42171d255811322cd8c82acafcacf9b1a59 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 14 Sep 2020 12:56:09 -0700
Subject: [PATCH 19/21] Remove references to empty subsampling attributes from
 tutorial

---
 docs/customizing-analysis.md | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md
index c452f3c75..bbace2918 100644
--- a/docs/customizing-analysis.md
+++ b/docs/customizing-analysis.md
@@ -71,14 +71,12 @@ switzerland:
   # Focal samples for country
   country:
     group_by: "division year month"
-    seq_per_group:
     max_sequences: 1500
     exclude: "--exclude-where 'country!={country}'"
   # Contextual samples from country's region
   region:
     group_by: "country year month"
     seq_per_group: 20
-    max_sequences:
     exclude: "--exclude-where 'country={country}' 'region!={region}'"
     priorities:
       type: "proximity"
@@ -88,7 +86,6 @@ switzerland:
   global:
     group_by: "country year month"
     seq_per_group: 10
-    max_sequences:
     exclude: "--exclude-where 'region={region}'"
     priorities:
       type: "proximity"
@@ -99,9 +96,7 @@ For `country`-level sampling above, we specify that we want a maximum of 1,500 s
 Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total.
 
 Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20.
-This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken). 
-
-Since `max_sequences` and `seq_per_group` are mutually exclusive, you should supply a blank entry for whichever you don't want to use, as shown above.
+This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken).
 
 Now we'll look at a subsampling scheme which defines a multi-`canton` build.
 Cantons are regional divisions in Switzerland - below 'country,' but above 'location' (often city-level).
@@ -116,31 +111,26 @@ lac-leman:
   geneva:
     group_by: "year month"
     seq_per_group: 300
-    max_sequences:
     exclude: "--exclude-where 'division!=geneva'"
   vaud:
     group_by: "year month"
     seq_per_group: 300
-    max_sequences:
     exclude: "--exclude-where 'division!=vaud'"
   valais:
     group_by: "year month"
     seq_per_group: 300
-    max_sequences:
     exclude: "--exclude-where 'division!=valais'"
 
   # Contextual samples from the country
   country:
     group_by: "division year month"
     seq_per_group: 20
-    max_sequences:
     exclude: "--exclude-where 'country!=switzerland'"
 
   # Contextual samples from division's region
   region:
     group_by: "country year month"
     seq_per_group: 10
-    max_sequences:
     exclude: "--exclude-where 'region!=europe'"
     priorities:
       type: "proximity"
@@ -150,7 +140,6 @@ lac-leman:
   global:
     group_by: "country year month"
     seq_per_group: 5
-    max_sequences:
     exclude: "--exclude-where 'region=europe'"
     priorities:
       type: "proximity"

From be3984150cec6d1f4069074d41982af07f8636dc Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 14 Sep 2020 13:08:11 -0700
Subject: [PATCH 20/21] Refine error message when user has a subsampling scheme
 conflict

This message asks the user to first check whether they defined conflicting
attributes before checking whether they have a conflicting subsampling scheme
name.
---
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 829f3ece3..c0692d1d0 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -287,7 +287,7 @@ def _get_subsampling_settings(wildcards):
         # throw an error instead of assuming the user prefers one setting over
         # another by default.
         if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"):
-            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be caused by using the same subsampling scheme name as a default scheme. If so, try renaming your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}').")
+            raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. If you didn't define both of these settings, this conflict could be caused by using the same subsampling scheme name as a default scheme. In this case, rename your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}') and run the workflow again.")
 
         # If users have supplied neither `max_sequences` nor `seq_per_group`, we
         # throw an error because the subsampling rule will still group by one or

From 5a44a9792eb05fa5b8f7a002cc30c8b6cbb074ec Mon Sep 17 00:00:00 2001
From: Emma Hodcroft <emmahodcroft@gmail.com>
Date: Tue, 15 Sep 2020 17:20:33 +0200
Subject: [PATCH 21/21] rename nextstrain profiles not to clash with defaults

---
 nextstrain_profiles/nextstrain/builds.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/nextstrain_profiles/nextstrain/builds.yaml b/nextstrain_profiles/nextstrain/builds.yaml
index 27052d345..729872544 100644
--- a/nextstrain_profiles/nextstrain/builds.yaml
+++ b/nextstrain_profiles/nextstrain/builds.yaml
@@ -8,30 +8,30 @@ custom_rules:
 # (They override the defaults)
 builds:
   global:
-    subsampling_scheme: region_global
+    subsampling_scheme: nextstrain_region_global
     auspice_config: "nextstrain_profiles/nextstrain/global_auspice_config.json"
   africa:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: Africa
     auspice_config: "nextstrain_profiles/nextstrain/africa_auspice_config.json"
   asia:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: Asia
     auspice_config: "nextstrain_profiles/nextstrain/asia_auspice_config.json"
   europe:
-    subsampling_scheme: region_grouped_by_country
+    subsampling_scheme: nextstrain_region_grouped_by_country
     region: Europe
     auspice_config: "nextstrain_profiles/nextstrain/europe_auspice_config.json"
   north-america:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: North America
     auspice_config: "nextstrain_profiles/nextstrain/north-america_auspice_config.json"
   oceania:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: Oceania
     auspice_config: "nextstrain_profiles/nextstrain/oceania_auspice_config.json"
   south-america:
-    subsampling_scheme: region
+    subsampling_scheme: nextstrain_region
     region: South America
     auspice_config: "nextstrain_profiles/nextstrain/south-america_auspice_config.json"
 
@@ -105,7 +105,7 @@ slack_channel: "#ncov-gisaid-updates"
 
 subsampling:
   # Custom subsampling logic for regions
-  region:
+  nextstrain_region:
     # Focal samples for region
     region:
       group_by: "division year month"
@@ -121,14 +121,14 @@ subsampling:
         focus: "region"
 
   # Custom subsampling logic for global region.
-  region_global:
+  nextstrain_region_global:
     global:
       group_by: "country year month"
       seq_per_group: 16
 
   # Custom subsampling for regions like Europe where grouping by country
   # is the smallest resolution requied
-  region_grouped_by_country:
+  nextstrain_region_grouped_by_country:
     # Focal samples for region
     region:
       group_by: "country year month"