Review notes (text before the first "diff --git" line is skipped by patch tools):
* The 1m/2m/6m hunks of both profiles now also delete the stale
  "Grouping by division" comment, as the all-time hunks already did, so the
  header comment no longer contradicts the new country-weighted grouping.
  Hunk line counts and new-file offsets were recalculated for that change;
  the blob ids on the "index" lines of the two builds.yaml diffs were not
  regenerated.
* Line breaks and indentation were reconstructed from the YAML and Snakemake
  nesting; verify them against the target files before applying.
* NOTE(review): asia_recent in the 1m and 2m schemes moves from "division week"
  to "country year month" while context_recent still groups by "country week";
  confirm that the coarser time bins are intended.
* NOTE(review): rule subsample lists the weights file as an input for every
  sample, although group_by_weights is an optional per-sample setting; confirm
  that requiring rule get_weights for all builds is acceptable.

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 4034e4730..56242ea2c 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -273,30 +273,17 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_1m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -305,22 +292,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -330,30 +306,17 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_2m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -362,22 +325,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -387,30 +339,17 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_6m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -419,22 +358,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country year month"
@@ -443,27 +371,16 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_all_time:
     # Focal samples for Asia
     asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 3500
+      exclude: "--exclude-where 'region!=Asia'"
     # Contextual samples from the rest of the world
     context:
       group_by: "country year month"
diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
index 40d5ff3ee..3518ff964 100644
--- a/nextstrain_profiles/nextstrain-open/builds.yaml
+++ b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -273,30 +273,17 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_1m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -305,22 +292,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -330,30 +306,17 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_2m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -362,22 +325,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -387,30 +339,17 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_6m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -419,22 +358,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country year month"
@@ -443,27 +371,16 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_all_time:
     # Focal samples for Asia
     asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 3500
+      exclude: "--exclude-where 'region!=Asia'"
     # Contextual samples from the rest of the world
     context:
       group_by: "country year month"
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 82922157e..396b0c7a0 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -190,6 +190,8 @@ def _get_specific_subsampling_setting(setting, optional=False):
                     value = f"--exclude-ambiguous-dates-by {value}"
                 elif setting == 'group_by':
                     value = f"--group-by {value}"
+                elif setting == 'group_by_weights':
+                    value = f"--group-by-weights {value}"
         elif value is not None:
             # If is 'seq_per_group' or 'max_sequences' build subsampling setting,
             # need to return the 'argument' for augur
@@ -265,6 +267,14 @@ rule index_sequences:
             --output {output.sequence_index} 2>&1 | tee {log}
         """
 
+rule get_weights:
+    output: "data/country_population_weights.tsv"
+    shell:
+        """
+        python3 scripts/get_population_sizes.py \
+            --output {output}
+        """
+
 rule subsample:
     message:
         """
@@ -285,7 +295,11 @@ rule subsample:
         metadata = _get_unified_metadata,
         include = config["files"]["include"],
         priorities = get_priorities,
-        exclude = config["files"]["exclude"]
+        exclude = config["files"]["exclude"],
+        # FIXME: confirm that a single weights file shared by every subsample
+        # call is appropriate. It seems fine so far, but may stop working if
+        # the weighting columns start to vary between samples.
+        weights = "data/country_population_weights.tsv"
     output:
         strains="results/{build_name}/sample-{subsample}.txt",
     log:
@@ -294,6 +308,7 @@ rule subsample:
         "benchmarks/subsample_{build_name}_{subsample}.txt"
     params:
         group_by = _get_specific_subsampling_setting("group_by", optional=True),
+        group_by_weights = _get_specific_subsampling_setting("group_by_weights", optional=True),
         sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True),
         subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True),
         sampling_scheme = _get_specific_subsampling_setting("sampling_scheme", optional=True),
@@ -323,6 +338,7 @@ rule subsample:
             {params.exclude_ambiguous_dates_argument} \
             {params.priority_argument} \
             {params.group_by} \
+            {params.group_by_weights} \
             {params.sequences_per_group} \
             {params.subsample_max_sequences} \
             {params.sampling_scheme} \