From ad980d4304445a1c0329915678aabefb6767ef69 Mon Sep 17 00:00:00 2001 From: Emma Hodcroft Date: Fri, 21 Aug 2020 16:44:32 +0200 Subject: [PATCH 01/21] allow use of --subsample-max-sequences filter arg --- workflow/snakemake_rules/main_workflow.smk | 23 +++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 260812226..f52ae25a2 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -281,9 +281,14 @@ def _get_subsampling_settings(wildcards): subsampling_settings = config["subsampling"][subsampling_scheme] if hasattr(wildcards, "subsample"): - return subsampling_settings[wildcards.subsample] - else: - return subsampling_settings + subsampling_settings = subsampling_settings[wildcards.subsample] + + # If users have supplied 'max_sequences' argument, assume they wish to override + # our default 'seq_per_group' from 'parameters.yaml' - so remove the 'seq_per_group' + if 'max_sequences' in subsampling_settings and 'seq_per_group' in subsampling_settings: + subsampling_settings.pop('seq_per_group') + + return subsampling_settings def get_priorities(wildcards): @@ -318,6 +323,12 @@ def _get_specific_subsampling_setting(setting, optional=False): build = config["builds"][wildcards.build_name] value = value.format(**build) else: + # If is 'seq_per_group' or 'max_sequences' build subsampling setting, + # need to return the 'argument' for augur + if setting == 'seq_per_group': + value = "--sequences-per-group " + str(value) + elif setting == 'max_sequences': + value = "--subsample-max-sequences " + str(value) return value # Check format strings that haven't been resolved. @@ -342,7 +353,8 @@ rule subsample: sequences = "results/{build_name}/sample-{subsample}.fasta" params: group_by = _get_specific_subsampling_setting("group_by"), - sequences_per_group = _get_specific_subsampling_setting("seq_per_group"), + sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True), + subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True), exclude_argument = _get_specific_subsampling_setting("exclude", optional=True), include_argument = _get_specific_subsampling_setting("include", optional=True), query_argument = _get_specific_subsampling_setting("query", optional=True), @@ -359,7 +371,8 @@ rule subsample: {params.query_argument} \ {params.priority_argument} \ --group-by {params.group_by} \ - --sequences-per-group {params.sequences_per_group} \ + {params.sequences_per_group} \ + {params.subsample_max_sequences} \ --output {output.sequences} 2>&1 | tee {log} """ From a4875dd30f0c83ba57a5095387e3601b67900140 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 31 Aug 2020 19:38:23 -0700 Subject: [PATCH 02/21] Require augur 10.0.0 in conda environments Adds support for augur filter's max sequences argument introduced in version 10.0.0 and used by the ncov workflow. --- workflow/envs/nextstrain.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml index 7ac5e3704..640a525cb 100644 --- a/workflow/envs/nextstrain.yaml +++ b/workflow/envs/nextstrain.yaml @@ -19,6 +19,6 @@ dependencies: - pip - pip: - awscli==1.18.45 - - nextstrain-augur==9.0.0 + - nextstrain-augur==10.0.0 - nextstrain-cli==1.16.5 - rethinkdb==2.3.0.post6 From 76234e0cd7b5f80cb820a71f0aceaeeaf63e3dce Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 31 Aug 2020 20:08:10 -0700 Subject: [PATCH 03/21] Throw errors for improperly defined mutually exclusive/required args Produces an informative error message when the user either: - defines both `sequences_per_group` and `subsample_max_sequences` - defines neither `sequences_per_group` nor `subsample_max_sequences` The first error addresses the mutually exclusive nature of the two arguments and alerts the user that they need to change a specific subsampling scheme to fix the issue. The second error addresses the required natures of the arguments and alerts the user that they need to define one of the two subsampling limits. --- workflow/snakemake_rules/main_workflow.smk | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index f52ae25a2..99c212028 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -282,11 +282,19 @@ def _get_subsampling_settings(wildcards): if hasattr(wildcards, "subsample"): subsampling_settings = subsampling_settings[wildcards.subsample] - - # If users have supplied 'max_sequences' argument, assume they wish to override - # our default 'seq_per_group' from 'parameters.yaml' - so remove the 'seq_per_group' - if 'max_sequences' in subsampling_settings and 'seq_per_group' in subsampling_settings: - subsampling_settings.pop('seq_per_group') + + # If users have supplied both `max_sequences` and `seq_per_group`, we throw + # an error instead of assuming the user prefers one setting over another by + # default. + if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"): + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive.") + + # If users have supplied neither `max_sequences` nor `seq_per_group`, we + # throw an error because the subsampling rule will still group by one or + # more fields and the lack of limits on this grouping could produce + # unexpected behavior. + if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"): + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.") return subsampling_settings From e5d401896957374fcdca0a6fd582ec99ca54f5a4 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 31 Aug 2020 20:11:14 -0700 Subject: [PATCH 04/21] Allow users to define empty keys in subsampling schemes Checks whether subsampling values are explicitly undefined (equal to None) and only attempts to process those values that are not None. If a None value is passed, we convert it to an empty string, so it does not appear in the subsampling shell command. These changes allow users to explicitly state that they do not define a specific subsampling key. This functionality is important because of the way Snakemake's config data structures are deep merged. It is possible for the default values of a subsampling scheme to conflict with a user's own custom subsampling scheme of the same name. To avoid conflicts of mutually exclusive keys, users can override the default keys with empty values. --- workflow/snakemake_rules/main_workflow.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 99c212028..5ce12e200 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -330,14 +330,17 @@ def _get_specific_subsampling_setting(setting, optional=False): # build's region, country, division, etc. as needed for subsampling. build = config["builds"][wildcards.build_name] value = value.format(**build) - else: + elif value is not None: # If is 'seq_per_group' or 'max_sequences' build subsampling setting, # need to return the 'argument' for augur if setting == 'seq_per_group': value = "--sequences-per-group " + str(value) elif setting == 'max_sequences': value = "--subsample-max-sequences " + str(value) + return value + else: + value = "" # Check format strings that haven't been resolved. if re.search(r'\{.+\}', value): From 737513dba2f39a02f079b87b9eca96de38663752 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 31 Aug 2020 20:21:05 -0700 Subject: [PATCH 05/21] Explicitly define max_sequence entries to surface mutual exclusion Adds an empty `max_sequences` key to our default subsampling rules and adds comments to the first pair of keys in the default YAMLs to describe their required and mutually exclusive behavior. --- defaults/parameters.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 7731d9376..e89f2292c 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -122,12 +122,20 @@ subsampling: # Focal samples for region region: group_by: "division year month" + # augur filter's `seq_per_group` and `max_sequences` are mutually + # exclusive, but one of these arguments is required for each subsampling + # scheme. To override these defaults in your own subsampling schemes of + # the same name, provide a value for the limit you want to use and an + # empty entry for the limit you don't use. The entry below demonstrates + # this approach. seq_per_group: 48 + max_sequences: exclude: "--exclude-where 'region!={region}'" # Contextual samples for region from the rest of the world global: group_by: "country year month" seq_per_group: 4 + max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -138,6 +146,7 @@ subsampling: global: group_by: "country year month" seq_per_group: 20 + max_sequences: # Custom subsampling logic for regions like Europe where grouping by country # is the smallest resolution required. @@ -146,11 +155,13 @@ subsampling: region: group_by: "country year month" seq_per_group: 64 + max_sequences: exclude: "--exclude-where 'region!={region}'" # Contextual samples for region from the rest of the world global: group_by: "country year month" seq_per_group: 6 + max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -162,11 +173,13 @@ subsampling: country: group_by: "division year month" seq_per_group: 200 + max_sequences: exclude: "--exclude-where 'country!={country}'" # Contextual samples from country's region region: group_by: "country year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'country={country}' 'region!={region}'" priorities: type: "proximity" @@ -176,6 +189,7 @@ subsampling: global: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -187,11 +201,13 @@ subsampling: division: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'" # Contextual samples from division's country country: group_by: "division year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'" priorities: type: "proximity" @@ -200,6 +216,7 @@ subsampling: region: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country={country}'" priorities: type: "proximity" @@ -209,6 +226,7 @@ subsampling: global: group_by: "country year month" seq_per_group: 5 + max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -220,11 +238,13 @@ subsampling: focal: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}' 'location!={location}'" # Samples that are genetically related to the focal samples related: group_by: "country year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'location={location}'" priorities: type: "proximity" @@ -233,4 +253,5 @@ subsampling: contextual: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'location={location}'" From 5440eb482d3f5a929bd422e47bfe638888a56591 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 31 Aug 2020 20:32:05 -0700 Subject: [PATCH 06/21] Update the message for subsample rule to reflect different arguments Clarifies that the subsample wildcard refers to a specific named subsampling scheme and explicitly lists all of the parameters used by a given subsampling rule. This approach errs on the side of verbosity to provide a better sense of what each subsampling rule is actually doing than looking at the augur filter command itself. --- workflow/snakemake_rules/main_workflow.smk | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 5ce12e200..ea9fa4aac 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -353,7 +353,15 @@ def _get_specific_subsampling_setting(setting, optional=False): rule subsample: message: """ - Subsample all sequences into a {wildcards.subsample} set for build '{wildcards.build_name}' with {params.sequences_per_group} per {params.group_by} + Subsample all sequences by '{wildcards.subsample}' scheme for build '{wildcards.build_name}' with the following parameters: + + - group by: {params.group_by} + - sequences per group: {params.sequences_per_group} + - subsample max sequences: {params.subsample_max_sequences} + - exclude: {params.exclude_argument} + - include: {params.include_argument} + - query: {params.query_argument} + - priority: {params.priority_argument} """ input: sequences = rules.mask.output.alignment, From 3c9a4d55c99f500752378f437d7d9de869a69ed6 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 31 Aug 2020 20:35:02 -0700 Subject: [PATCH 07/21] Use f-strings for subsampling limits instead of str function Improves readability of code, generally, by using f-strings. --- workflow/snakemake_rules/main_workflow.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index ea9fa4aac..e58b1c75a 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -334,9 +334,9 @@ def _get_specific_subsampling_setting(setting, optional=False): # If is 'seq_per_group' or 'max_sequences' build subsampling setting, # need to return the 'argument' for augur if setting == 'seq_per_group': - value = "--sequences-per-group " + str(value) + value = f"--sequences-per-group {value}" elif setting == 'max_sequences': - value = "--subsample-max-sequences " + str(value) + value = f"--subsample-max-sequences {value}" return value else: From de865bc010cd0aa78c1258fe72665e63610fdfce Mon Sep 17 00:00:00 2001 From: Emma Hodcroft Date: Tue, 1 Sep 2020 14:14:23 +0200 Subject: [PATCH 08/21] add max_sequences empty to example build files --- my_profiles/example/builds.yaml | 9 +++++++++ .../builds.yaml | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml index 8730a2cd9..916f6aa84 100644 --- a/my_profiles/example/builds.yaml +++ b/my_profiles/example/builds.yaml @@ -64,7 +64,14 @@ subsampling: # Focal samples for location focal: group_by: "year month" + # augur filter's `seq_per_group` and `max_sequences` are mutually + # exclusive, but one of these arguments is required for each subsampling + # scheme. To override these defaults in your own subsampling schemes of + # the same name, provide a value for the limit you want to use and an + # empty entry for the limit you don't use. The entry below demonstrates + # this approach. seq_per_group: 300 + max_sequences: # Use augur filter's query argument to filter strains with pandas-style logic expressions. # This can be easier to read than the corresponding filter by exclusion. query: --query "(country == '{country}') & (division == '{division}') & (location == '{location}')" @@ -73,6 +80,7 @@ subsampling: related: group_by: "country year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'location={location}'" priorities: type: "proximity" @@ -82,6 +90,7 @@ subsampling: contextual: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'location={location}'" # Here, you can specify what type of auspice_config you want to use diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml index 76fceb79f..96dc7ab95 100644 --- a/my_profiles/example_advanced_customization/builds.yaml +++ b/my_profiles/example_advanced_customization/builds.yaml @@ -34,16 +34,24 @@ builds: subsampling: # Here we use the default subsampling logic for countries + # augur filter's `seq_per_group` and `max_sequences` are mutually + # exclusive, but one of these arguments is required for each subsampling + # scheme. To override these defaults in your own subsampling schemes of + # the same name, provide a value for the limit you want to use and an + # empty entry for the limit you don't use. The entry below demonstrates + # this approach. switzerland: # Focal samples for country country: group_by: "division year month" seq_per_group: 200 + max_sequences: exclude: "--exclude-where 'country!={country}'" # Contextual samples from country's region region: group_by: "country year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'country={country}' 'region!={region}'" priorities: type: "proximity" @@ -53,6 +61,7 @@ subsampling: global: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -65,11 +74,13 @@ subsampling: division: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'" # Contextual samples from division's country country: group_by: "division year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'" priorities: type: "proximity" @@ -78,6 +89,7 @@ subsampling: region: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'region!={region}' 'country={country}'" priorities: type: "proximity" @@ -87,6 +99,7 @@ subsampling: global: group_by: "country year month" seq_per_group: 5 + max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -99,26 +112,31 @@ subsampling: geneva: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'division!=geneva'" vaud: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'division!=vaud'" valais: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'division!=valais'" # Contextual samples from the country country: group_by: "division year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'country!=switzerland'" # Contextual samples from division's region region: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'region!=europe'" priorities: type: "proximity" @@ -128,6 +146,7 @@ subsampling: global: group_by: "country year month" seq_per_group: 5 + max_sequences: exclude: "--exclude-where 'region=europe'" priorities: type: "proximity" From 6c353dcf4f60ab4197000496b297b2b67acba98d Mon Sep 17 00:00:00 2001 From: Emma Hodcroft Date: Tue, 1 Sep 2020 14:26:54 +0200 Subject: [PATCH 09/21] add max_sequences example to file & docs --- docs/customizing-analysis.md | 50 ++++++++++++++++++- .../builds.yaml | 4 +- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md index 219072755..5b7e3c10f 100644 --- a/docs/customizing-analysis.md +++ b/docs/customizing-analysis.md @@ -62,10 +62,50 @@ We implement hierarchical subsampling by producing multiple samples at different A build can specify any number of such samples which can be flexibly restricted to particular meta data fields and subsampled from groups with particular properties. When specifying subsampling in this way, we'll first take sequences from the 'focal' area, and the select samples from other geographical areas. Read further for information on how we select these samples. +Here, we'll look at the advanced example (`./my_profiles/example_advanced_customization`) file to explain some of the options. -In this example, we'll look at a subsampling scheme which defines a `canton`. +When specifying how many sequences you want in a subsampling level (for example, from a country or a region), you can do this using either `seq_per_group` or `max_sequences` - these work with the `group_by` argument. +For example, `switzerland` subsampling rules in the advanced example looks like this: +```yaml +switzerland: + # Focal samples for country + country: + group_by: "division year month" + seq_per_group: + max_sequences: 1500 + exclude: "--exclude-where 'country!={country}'" + # Contextual samples from country's region + region: + group_by: "country year month" + seq_per_group: 20 + max_sequences: + exclude: "--exclude-where 'country={country}' 'region!={region}'" + priorities: + type: "proximity" + focus: "country" + # Contextual samples from the rest of the world, + # excluding the current region to avoid resampling. + global: + group_by: "country year month" + seq_per_group: 10 + max_sequences: + exclude: "--exclude-where 'region={region}'" + priorities: + type: "proximity" + focus: "country" +``` + +For `country`-level sampling above, we specify that we want a maximum of 1,500 sequences from the country in question (here, Switzerland). +Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total. + +Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20. +This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20, all will be taken). + +Since `max_sequences` and `seq_per_group` are mutually exclusive, you should supply a blank entry for whichever you don't want to use, as shown above. + +Now we'll look at a subsampling scheme which defines a multi-`canton` build. Cantons are regional divisions in Switzerland - below 'country,' but above 'location' (often city-level). -In the advanced example (`./my_profiles/example_advanced_customization`), we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world. +In the advanced example, we'd like to be able to specify a set of neighboring 'cantons' and do focal sampling there, with contextual samples from elsewhere in the country, other countries in the region, and other regions in the world. For cantons this looks like this: ```yaml @@ -76,26 +116,31 @@ lac-leman: geneva: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'division!=geneva'" vaud: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'division!=vaud'" valais: group_by: "year month" seq_per_group: 300 + max_sequences: exclude: "--exclude-where 'division!=valais'" # Contextual samples from the country country: group_by: "division year month" seq_per_group: 20 + max_sequences: exclude: "--exclude-where 'country!=switzerland'" # Contextual samples from division's region region: group_by: "country year month" seq_per_group: 10 + max_sequences: exclude: "--exclude-where 'region!=europe'" priorities: type: "proximity" @@ -105,6 +150,7 @@ lac-leman: global: group_by: "country year month" seq_per_group: 5 + max_sequences: exclude: "--exclude-where 'region=europe'" priorities: type: "proximity" diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml index 96dc7ab95..c914a1e6f 100644 --- a/my_profiles/example_advanced_customization/builds.yaml +++ b/my_profiles/example_advanced_customization/builds.yaml @@ -44,8 +44,8 @@ subsampling: # Focal samples for country country: group_by: "division year month" - seq_per_group: 200 - max_sequences: + seq_per_group: + max_sequences: 1500 exclude: "--exclude-where 'country!={country}'" # Contextual samples from country's region region: From 6a6513e7ea0c4da3a38e85680536c2b04a3c66d3 Mon Sep 17 00:00:00 2001 From: Emma Hodcroft Date: Tue, 1 Sep 2020 16:41:30 +0200 Subject: [PATCH 10/21] give instructions in error for both max_seqs & seq_per --- workflow/snakemake_rules/main_workflow.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index e58b1c75a..6e9a523fd 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -287,7 +287,7 @@ def _get_subsampling_settings(wildcards): # an error instead of assuming the user prefers one setting over another by # default. if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"): - raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive.") + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.") # If users have supplied neither `max_sequences` nor `seq_per_group`, we # throw an error because the subsampling rule will still group by one or From 3b61c0923ee3a3aab9d71be9562ee84dcc921bee Mon Sep 17 00:00:00 2001 From: Emma Hodcroft Date: Tue, 1 Sep 2020 21:07:39 +0200 Subject: [PATCH 11/21] Update docs/customizing-analysis.md Co-authored-by: Tony Tung --- docs/customizing-analysis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md index 5b7e3c10f..c452f3c75 100644 --- a/docs/customizing-analysis.md +++ b/docs/customizing-analysis.md @@ -99,7 +99,7 @@ For `country`-level sampling above, we specify that we want a maximum of 1,500 s Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total. Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20. -This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20, all will be taken). +This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken). Since `max_sequences` and `seq_per_group` are mutually exclusive, you should supply a blank entry for whichever you don't want to use, as shown above. From 6bab63646a81938b9709230db3a9265fea0367df Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 2 Sep 2020 09:03:55 -0700 Subject: [PATCH 12/21] Validate settings for a specific subsampling set when it's requested The `_get_subsampling_settings` function returns either a complete subsampling scheme (consistently of one or more sets of subsampling rules) or a specific subsampling set. This return value depends on the presence of a `subsample` wildcard. We only want to apply validation checks to the subsampling sets when they have been specifically requested. This commit indents the original validation logic by one level such that it only gets called when the current Snakemake job has a `subsample` wildcard. --- workflow/snakemake_rules/main_workflow.smk | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 6e9a523fd..8a5d823d0 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -283,18 +283,18 @@ def _get_subsampling_settings(wildcards): if hasattr(wildcards, "subsample"): subsampling_settings = subsampling_settings[wildcards.subsample] - # If users have supplied both `max_sequences` and `seq_per_group`, we throw - # an error instead of assuming the user prefers one setting over another by - # default. - if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"): - raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.") - - # If users have supplied neither `max_sequences` nor `seq_per_group`, we - # throw an error because the subsampling rule will still group by one or - # more fields and the lack of limits on this grouping could produce - # unexpected behavior. - if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"): - raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.") + # If users have supplied both `max_sequences` and `seq_per_group`, we + # throw an error instead of assuming the user prefers one setting over + # another by default. + if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"): + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.") + + # If users have supplied neither `max_sequences` nor `seq_per_group`, we + # throw an error because the subsampling rule will still group by one or + # more fields and the lack of limits on this grouping could produce + # unexpected behavior. + if not subsampling_settings.get("max_sequences") and not subsampling_settings.get("seq_per_group"): + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' must define `max_sequences` or `seq_per_group`.") return subsampling_settings From 4a058156839dd2614aa7c69e183f832e09eebce8 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 8 Sep 2020 15:35:58 -0700 Subject: [PATCH 13/21] Warn users about conflicting subsampling schemes Prints an informative warning when users have defined a subsampling scheme that has the same name as one of the default schemes and different settings. The warning lists all conflicting schemes, explains why the conflict is a problem, provides a solution to avoid the conflicts, and alerts users that a future release of the workflow will treat these conflicts as errors. The use of the sleep function here is a workaround for the fact that Snakemake's output will quickly scroll past any warning we provide. By sleeping for a few seconds, users at least have a chance to see that there was a warning and can scroll back to the read the whole message. --- Snakefile | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Snakefile b/Snakefile index a3ecfe07b..fa7ba894e 100644 --- a/Snakefile +++ b/Snakefile @@ -1,11 +1,42 @@ +import copy from os import environ from socket import getfqdn from getpass import getuser +from snakemake.logging import logger from snakemake.utils import validate +import time + +# Store the user's configuration prior to loading defaults, so we can check for +# reused subsampling scheme names in the user's config. We need to make a deep +# copy because Snakemake will deep merge the subsampling dictionary later, +# modifying the values of a reference or shallow copy. +user_subsampling = copy.deepcopy(config.get("subsampling", {})) configfile: "defaults/parameters.yaml" validate(config, schema="workflow/schemas/config.schema.yaml") +# Check for overlapping subsampling schemes in user and default +# configurations. For now, issue a deprecation warning, so users know they +# should rename their subsampling schemes. In the future, this reuse of the same +# name will cause an error. +subsampling_config = config.get("subsampling", {}) +overlapping_schemes = [] +for scheme_name, scheme in user_subsampling.items(): + if scheme_name in subsampling_config and subsampling_config.get(scheme_name) != scheme: + overlapping_schemes.append(scheme_name) + +if len(overlapping_schemes) > 0: + logger.warning(f"WARNING: The following subsampling scheme(s) have the same name as a default scheme in this workflow but different definitions:") + logger.warning("") + for scheme in overlapping_schemes: + logger.warning(f" - {scheme}") + logger.warning("") + logger.warning(" This means Snakemake will merge your scheme with the default scheme and may produce unexpected behavior.") + logger.warning(f" To avoid errors in your workflow, rename your schemes with unique names (e.g., 'custom_{overlapping_schemes[0]}')") + logger.warning(" In future versions of this workflow, overlapping subsampling scheme names will produce an error.") + logger.warning("") + time.sleep(5) + # default build if none specified in config if "builds" not in config: config["builds"] = { From 538376e05316648c42f6bdd1c3a1b2d221bd1b0f Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 11 Sep 2020 14:03:21 -0700 Subject: [PATCH 14/21] Require latest version of augur (10.0.2) with BioPython bug fixes Since we are upgrading augur to 10.*.* for this PR anyway, we might as well use the latest version that includes bug fixes. --- workflow/envs/nextstrain.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml index 640a525cb..dd19dadd4 100644 --- a/workflow/envs/nextstrain.yaml +++ b/workflow/envs/nextstrain.yaml @@ -19,6 +19,6 @@ dependencies: - pip - pip: - awscli==1.18.45 - - nextstrain-augur==10.0.0 + - nextstrain-augur==10.0.2 - nextstrain-cli==1.16.5 - rethinkdb==2.3.0.post6 From 78e29d6d58a8b5a7c8a8602a57ae0af8aed4e158 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 14 Sep 2020 12:43:26 -0700 Subject: [PATCH 15/21] Add documentation for how config loading order occurs This documentation is for our own team when maintaining this code in the future and trying to understand why it works. There is also a good chance that Snakemake will change its implementation in a way that breaks our current logic. These comments should at least help us track down the changes. --- Snakefile | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index fa7ba894e..016e8d352 100644 --- a/Snakefile +++ b/Snakefile @@ -9,7 +9,23 @@ import time # Store the user's configuration prior to loading defaults, so we can check for # reused subsampling scheme names in the user's config. We need to make a deep # copy because Snakemake will deep merge the subsampling dictionary later, -# modifying the values of a reference or shallow copy. +# modifying the values of a reference or shallow copy. Note that this loading of +# the user's config prior to the defaults below depends on the order Snakemake +# loads its configfiles. Specifically, the order of config loading is: +# +# 1. First, configfile arguments are loaded and config is built from these [1]. +# 2. Then, config arguments are loaded and override existing config keys [2]. +# 3. Then, the Snakefile is parsed and configfile directive inside the Snakefile is processed [3]. +# When configfile is loaded from the directive in the Snakefile, the config +# dictionary is deep merged with the files [4] from the externally provided +# config files. This is the only place the deep merge happens using the +# update_config function [5]. +# +# [1] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L466-L471 +# [2] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L476-L477 +# [3] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/__init__.py#L551-L553 +# [4] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/workflow.py#L1088-L1094 +# [5] https://github.com/snakemake/snakemake/blob/a7ac40c96d6e2af47102563d0478a2220e2a2ab7/snakemake/utils.py#L455-L476 user_subsampling = copy.deepcopy(config.get("subsampling", {})) configfile: "defaults/parameters.yaml" From 01005591079c7ab7ea00daf892daea5c41f7948e Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 14 Sep 2020 12:45:36 -0700 Subject: [PATCH 16/21] Remove unnecessary empty subsampling settings from example profiles --- defaults/parameters.yaml | 21 ------------------- .../builds.yaml | 21 +------------------ 2 files changed, 1 insertion(+), 41 deletions(-) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index e89f2292c..7731d9376 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -122,20 +122,12 @@ subsampling: # Focal samples for region region: group_by: "division year month" - # augur filter's `seq_per_group` and `max_sequences` are mutually - # exclusive, but one of these arguments is required for each subsampling - # scheme. To override these defaults in your own subsampling schemes of - # the same name, provide a value for the limit you want to use and an - # empty entry for the limit you don't use. The entry below demonstrates - # this approach. seq_per_group: 48 - max_sequences: exclude: "--exclude-where 'region!={region}'" # Contextual samples for region from the rest of the world global: group_by: "country year month" seq_per_group: 4 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -146,7 +138,6 @@ subsampling: global: group_by: "country year month" seq_per_group: 20 - max_sequences: # Custom subsampling logic for regions like Europe where grouping by country # is the smallest resolution required. @@ -155,13 +146,11 @@ subsampling: region: group_by: "country year month" seq_per_group: 64 - max_sequences: exclude: "--exclude-where 'region!={region}'" # Contextual samples for region from the rest of the world global: group_by: "country year month" seq_per_group: 6 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -173,13 +162,11 @@ subsampling: country: group_by: "division year month" seq_per_group: 200 - max_sequences: exclude: "--exclude-where 'country!={country}'" # Contextual samples from country's region region: group_by: "country year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'country={country}' 'region!={region}'" priorities: type: "proximity" @@ -189,7 +176,6 @@ subsampling: global: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -201,13 +187,11 @@ subsampling: division: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'" # Contextual samples from division's country country: group_by: "division year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'" priorities: type: "proximity" @@ -216,7 +200,6 @@ subsampling: region: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country={country}'" priorities: type: "proximity" @@ -226,7 +209,6 @@ subsampling: global: group_by: "country year month" seq_per_group: 5 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -238,13 +220,11 @@ subsampling: focal: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}' 'location!={location}'" # Samples that are genetically related to the focal samples related: group_by: "country year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'location={location}'" priorities: type: "proximity" @@ -253,5 +233,4 @@ subsampling: contextual: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'location={location}'" diff --git a/my_profiles/example_advanced_customization/builds.yaml b/my_profiles/example_advanced_customization/builds.yaml index c914a1e6f..f60669423 100644 --- a/my_profiles/example_advanced_customization/builds.yaml +++ b/my_profiles/example_advanced_customization/builds.yaml @@ -34,24 +34,16 @@ builds: subsampling: # Here we use the default subsampling logic for countries - # augur filter's `seq_per_group` and `max_sequences` are mutually - # exclusive, but one of these arguments is required for each subsampling - # scheme. To override these defaults in your own subsampling schemes of - # the same name, provide a value for the limit you want to use and an - # empty entry for the limit you don't use. The entry below demonstrates - # this approach. switzerland: # Focal samples for country country: group_by: "division year month" - seq_per_group: max_sequences: 1500 exclude: "--exclude-where 'country!={country}'" # Contextual samples from country's region region: group_by: "country year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'country={country}' 'region!={region}'" priorities: type: "proximity" @@ -61,7 +53,6 @@ subsampling: global: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -74,13 +65,11 @@ subsampling: division: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'" # Contextual samples from division's country country: group_by: "division year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'" priorities: type: "proximity" @@ -89,7 +78,6 @@ subsampling: region: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region!={region}' 'country={country}'" priorities: type: "proximity" @@ -99,44 +87,38 @@ subsampling: global: group_by: "country year month" seq_per_group: 5 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" focus: "division" # This build will take from 3 cantons - we have a sample rule for each, - # rather than just one division that's focal build + # rather than just one division that's focal build lac-leman: # focal samples geneva: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'division!=geneva'" vaud: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'division!=vaud'" valais: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'division!=valais'" # Contextual samples from the country country: group_by: "division year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'country!=switzerland'" # Contextual samples from division's region region: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region!=europe'" priorities: type: "proximity" @@ -146,7 +128,6 @@ subsampling: global: group_by: "country year month" seq_per_group: 5 - max_sequences: exclude: "--exclude-where 'region=europe'" priorities: type: "proximity" From 91f95b3f7d82f6d7a53bd32ba73793e4a4910d85 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 14 Sep 2020 12:46:15 -0700 Subject: [PATCH 17/21] Use a unique subsampling scheme name for the example profile The original county-level subsampling scheme for the example profile used the same name as the default "county" scheme we provide. This commit renames that scheme to "custom-county" to avoid conflicts with the default schemes. --- my_profiles/example/builds.yaml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml index 916f6aa84..eea133e11 100644 --- a/my_profiles/example/builds.yaml +++ b/my_profiles/example/builds.yaml @@ -16,7 +16,7 @@ builds: # with a build name that will produce the following URL fragment on Nextstrain/auspice: # /ncov/north-america/usa/washington/king-county north-america_usa_washington_king-county: # name of the build; this can be anything - subsampling_scheme: county # use a custom subsampling scheme defined below + subsampling_scheme: custom-county # use a custom subsampling scheme defined below region: North America country: USA division: Washington @@ -60,18 +60,11 @@ builds: # Define custom subsampling logic for county-level builds. subsampling: - county: + custom-county: # Focal samples for location focal: group_by: "year month" - # augur filter's `seq_per_group` and `max_sequences` are mutually - # exclusive, but one of these arguments is required for each subsampling - # scheme. To override these defaults in your own subsampling schemes of - # the same name, provide a value for the limit you want to use and an - # empty entry for the limit you don't use. The entry below demonstrates - # this approach. seq_per_group: 300 - max_sequences: # Use augur filter's query argument to filter strains with pandas-style logic expressions. # This can be easier to read than the corresponding filter by exclusion. query: --query "(country == '{country}') & (division == '{division}') & (location == '{location}')" @@ -80,7 +73,6 @@ subsampling: related: group_by: "country year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'location={location}'" priorities: type: "proximity" @@ -90,7 +82,6 @@ subsampling: contextual: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'location={location}'" # Here, you can specify what type of auspice_config you want to use From 1f531cd6b11cebccadde3a6ff35fdebf1408cfa8 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 14 Sep 2020 12:48:03 -0700 Subject: [PATCH 18/21] Print a better explanation for how to resolve conflicting subsamples Instead of recommending that users enter an empty subsampling setting when a conflict occurs, we ask them to rename their conflicting subsampling scheme to avoid any future conflicts and unexpected behavior. --- workflow/snakemake_rules/main_workflow.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 8a5d823d0..829f3ece3 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -287,7 +287,7 @@ def _get_subsampling_settings(wildcards): # throw an error instead of assuming the user prefers one setting over # another by default. if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"): - raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be due to a default value - add an empty value for the argument you wish to ignore to override the default.") + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be caused by using the same subsampling scheme name as a default scheme. If so, try renaming your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}').") # If users have supplied neither `max_sequences` nor `seq_per_group`, we # throw an error because the subsampling rule will still group by one or From ded0e42171d255811322cd8c82acafcacf9b1a59 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 14 Sep 2020 12:56:09 -0700 Subject: [PATCH 19/21] Remove references to empty subsampling attributes from tutorial --- docs/customizing-analysis.md | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/docs/customizing-analysis.md b/docs/customizing-analysis.md index c452f3c75..bbace2918 100644 --- a/docs/customizing-analysis.md +++ b/docs/customizing-analysis.md @@ -71,14 +71,12 @@ switzerland: # Focal samples for country country: group_by: "division year month" - seq_per_group: max_sequences: 1500 exclude: "--exclude-where 'country!={country}'" # Contextual samples from country's region region: group_by: "country year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'country={country}' 'region!={region}'" priorities: type: "proximity" @@ -88,7 +86,6 @@ switzerland: global: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region={region}'" priorities: type: "proximity" @@ -99,9 +96,7 @@ For `country`-level sampling above, we specify that we want a maximum of 1,500 s Since we set `group_by` to "division year month", all the Swiss sequences will be divided into groups by their division, month, and year of sampling, and the code will try to equally sample from each group to reach 1,500 sequences total. Alternatively, in the `region`-level sampling, we set `seq_per_group` to 20. -This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken). - -Since `max_sequences` and `seq_per_group` are mutually exclusive, you should supply a blank entry for whichever you don't want to use, as shown above. +This means that all the sequences from Europe (excluding Switzerland) will be divided into groups by their sampling country, month, and year (as defined by `group_by`), and then 20 sequences will taken from each group (if there are fewer than 20 in any given group, all of the samples from that group will be taken). Now we'll look at a subsampling scheme which defines a multi-`canton` build. Cantons are regional divisions in Switzerland - below 'country,' but above 'location' (often city-level). @@ -116,31 +111,26 @@ lac-leman: geneva: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'division!=geneva'" vaud: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'division!=vaud'" valais: group_by: "year month" seq_per_group: 300 - max_sequences: exclude: "--exclude-where 'division!=valais'" # Contextual samples from the country country: group_by: "division year month" seq_per_group: 20 - max_sequences: exclude: "--exclude-where 'country!=switzerland'" # Contextual samples from division's region region: group_by: "country year month" seq_per_group: 10 - max_sequences: exclude: "--exclude-where 'region!=europe'" priorities: type: "proximity" @@ -150,7 +140,6 @@ lac-leman: global: group_by: "country year month" seq_per_group: 5 - max_sequences: exclude: "--exclude-where 'region=europe'" priorities: type: "proximity" From be3984150cec6d1f4069074d41982af07f8636dc Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 14 Sep 2020 13:08:11 -0700 Subject: [PATCH 20/21] Refine error message when user has a subsampling scheme conflict This message asks the user to first check whether they defined conflicting attributes before checking whether they have a conflicting subsampling scheme name. --- workflow/snakemake_rules/main_workflow.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 829f3ece3..c0692d1d0 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -287,7 +287,7 @@ def _get_subsampling_settings(wildcards): # throw an error instead of assuming the user prefers one setting over # another by default. if subsampling_settings.get("max_sequences") and subsampling_settings.get("seq_per_group"): - raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. This could be caused by using the same subsampling scheme name as a default scheme. If so, try renaming your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}').") + raise Exception(f"The subsampling scheme '{subsampling_scheme}' for build '{wildcards.build_name}' defines both `max_sequences` and `seq_per_group`, but these arguments are mutually exclusive. If you didn't define both of these settings, this conflict could be caused by using the same subsampling scheme name as a default scheme. In this case, rename your subsampling scheme, '{subsampling_scheme}', to a unique name (e.g., 'custom_{subsampling_scheme}') and run the workflow again.") # If users have supplied neither `max_sequences` nor `seq_per_group`, we # throw an error because the subsampling rule will still group by one or From 5a44a9792eb05fa5b8f7a002cc30c8b6cbb074ec Mon Sep 17 00:00:00 2001 From: Emma Hodcroft Date: Tue, 15 Sep 2020 17:20:33 +0200 Subject: [PATCH 21/21] rename nextstrain profiles not to clash with defaults --- nextstrain_profiles/nextstrain/builds.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nextstrain_profiles/nextstrain/builds.yaml b/nextstrain_profiles/nextstrain/builds.yaml index 27052d345..729872544 100644 --- a/nextstrain_profiles/nextstrain/builds.yaml +++ b/nextstrain_profiles/nextstrain/builds.yaml @@ -8,30 +8,30 @@ custom_rules: # (They override the defaults) builds: global: - subsampling_scheme: region_global + subsampling_scheme: nextstrain_region_global auspice_config: "nextstrain_profiles/nextstrain/global_auspice_config.json" africa: - subsampling_scheme: region + subsampling_scheme: nextstrain_region region: Africa auspice_config: "nextstrain_profiles/nextstrain/africa_auspice_config.json" asia: - subsampling_scheme: region + subsampling_scheme: nextstrain_region region: Asia auspice_config: "nextstrain_profiles/nextstrain/asia_auspice_config.json" europe: - subsampling_scheme: region_grouped_by_country + subsampling_scheme: nextstrain_region_grouped_by_country region: Europe auspice_config: "nextstrain_profiles/nextstrain/europe_auspice_config.json" north-america: - subsampling_scheme: region + subsampling_scheme: nextstrain_region region: North America auspice_config: "nextstrain_profiles/nextstrain/north-america_auspice_config.json" oceania: - subsampling_scheme: region + subsampling_scheme: nextstrain_region region: Oceania auspice_config: "nextstrain_profiles/nextstrain/oceania_auspice_config.json" south-america: - subsampling_scheme: region + subsampling_scheme: nextstrain_region region: South America auspice_config: "nextstrain_profiles/nextstrain/south-america_auspice_config.json" @@ -105,7 +105,7 @@ slack_channel: "#ncov-gisaid-updates" subsampling: # Custom subsampling logic for regions - region: + nextstrain_region: # Focal samples for region region: group_by: "division year month" @@ -121,14 +121,14 @@ subsampling: focus: "region" # Custom subsampling logic for global region. - region_global: + nextstrain_region_global: global: group_by: "country year month" seq_per_group: 16 # Custom subsampling for regions like Europe where grouping by country # is the smallest resolution requied - region_grouped_by_country: + nextstrain_region_grouped_by_country: # Focal samples for region region: group_by: "country year month"