From 9fd01c64f9471638757ab6ce2dd6a43fe090bfeb Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Wed, 14 Jun 2023 15:55:46 -0700 Subject: [PATCH] Update global subsampling In the current "global" analyses, treating China and India each as just another country in Asia resulted in much lower per-capita sampling rates for these two countries. For example, in the current gisaid/global/6m tree we have 66 viruses from Guatemala (population 17M), 62 viruses from Costa Rica (population 5M), 18 viruses from India (population 1400M) and 21 viruses from China (population 1400M). This is up to a ~1000-fold difference in per-capita sampling intensity. This commit partially addresses this issue by splitting out China and India into their own buckets when subsampling. This results in buckets of North America (580M), South America (420M), Europe (750M), Africa (1.2B), Oceania (44M), India (1.4B), China (1.4B) and Asia minus India and China (1.8B). Additionally, this commit makes a small correction that reduces Oceania's sequence count to 20% of the count used for each other region, down from the previous 33%.
--- .../nextstrain-gisaid-21L/builds.yaml | 172 +++++++++++------ .../nextstrain-gisaid/builds.yaml | 172 +++++++++++------ .../nextstrain-open/builds.yaml | 176 ++++++++++++------ 3 files changed, 362 insertions(+), 158 deletions(-) diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml index b6c8116aa..39a872a55 100644 --- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml @@ -389,33 +389,43 @@ subsampling: exclude: "--exclude-where 'region={region}'" # Custom subsampling logic for global region over 1m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_1m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 1M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 1M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" 
exclude: "--exclude-where 'region!=South America'" oceania_early: @@ -432,12 +442,22 @@ subsampling: group_by: "country week" max_sequences: 600 min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 1M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country week" max_sequences: 600 min_date: "--min-date 1M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 1M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division week" max_sequences: 600 @@ -455,33 +475,43 @@ subsampling: exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over 2m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_2m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 2M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 2M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division 
year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=South America'" oceania_early: @@ -491,128 +521,166 @@ subsampling: exclude: "--exclude-where 'region!=Oceania'" africa_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Africa'" asia_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 + min_date: "--min-date 2M" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division week" + max_sequences: 500 min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 2M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=North America'" south_america_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=South America'" oceania_recent: group_by: "division week" - max_sequences: 200 + max_sequences: 100 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over 6m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_6m: africa_early: group_by: 
"country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 6M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 6M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: group_by: "division year month" - max_sequences: 50 + max_sequences: 25 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Oceania'" africa_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division year month" + max_sequences: 500 + min_date: "--min-date 6M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 
'region!=Europe'" + india_recent: + group_by: "division year month" + max_sequences: 500 + min_date: "--min-date 6M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_recent: group_by: "division year month" - max_sequences: 200 + max_sequences: 100 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over all-time - # 4000 total - # all regions equal except Oceania at 33% + # 4320 total (expect ~3200) + # all eight regions equal except Oceania at 20% nextstrain_global_all_time: africa: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=Africa'" asia: group_by: "country year month" - max_sequences: 750 - exclude: "--exclude-where 'region!=Asia'" + max_sequences: 600 + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china: + group_by: "division year month" + max_sequences: 600 + exclude: "--exclude-where 'country!=China'" europe: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=Europe'" + india: + group_by: "division year month" + max_sequences: 600 + exclude: "--exclude-where 'country!=India'" north_america: group_by: "division year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=North America'" south_america: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=South America'" oceania: group_by: "division year month" - max_sequences: 250 + max_sequences: 120 exclude: "--exclude-where 'region!=Oceania'" # Root to clade 21L diff 
--git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 751b0bc4a..deb8de43e 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -381,33 +381,43 @@ subsampling: exclude: "--exclude-where 'region={region}'" # Custom subsampling logic for global region over 1m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_1m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 1M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 1M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=South America'" oceania_early: @@ -424,12 +434,22 @@ subsampling: group_by: "country week" max_sequences: 600 min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 
'country=China' 'country=India'" + china_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 1M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country week" max_sequences: 600 min_date: "--min-date 1M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 1M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division week" max_sequences: 600 @@ -447,33 +467,43 @@ subsampling: exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over 2m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_2m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 2M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 2M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" 
exclude: "--exclude-where 'region!=South America'" oceania_early: @@ -483,128 +513,166 @@ subsampling: exclude: "--exclude-where 'region!=Oceania'" africa_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Africa'" asia_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 + min_date: "--min-date 2M" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division week" + max_sequences: 500 min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 2M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=North America'" south_america_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=South America'" oceania_recent: group_by: "division week" - max_sequences: 200 + max_sequences: 100 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over 6m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_6m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" - 
exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 6M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 6M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: group_by: "division year month" - max_sequences: 50 + max_sequences: 25 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Oceania'" africa_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division year month" + max_sequences: 500 + min_date: "--min-date 6M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division year month" + max_sequences: 500 + min_date: "--min-date 6M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division year month" - max_sequences: 600 + max_sequences: 500 
min_date: "--min-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_recent: group_by: "division year month" - max_sequences: 200 + max_sequences: 100 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over all-time - # 4000 total - # all regions equal except Oceania at 33% + # 4320 total (expect ~3200) + # all eight regions equal except Oceania at 20% nextstrain_global_all_time: africa: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=Africa'" asia: group_by: "country year month" - max_sequences: 750 - exclude: "--exclude-where 'region!=Asia'" + max_sequences: 600 + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china: + group_by: "division year month" + max_sequences: 600 + exclude: "--exclude-where 'country!=China'" europe: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=Europe'" + india: + group_by: "division year month" + max_sequences: 600 + exclude: "--exclude-where 'country!=India'" north_america: group_by: "division year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=North America'" south_america: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=South America'" oceania: group_by: "division year month" - max_sequences: 250 + max_sequences: 120 exclude: "--exclude-where 'region!=Oceania'" # if different traits should be reconstructed for some builds, specify here diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index 25b32ea57..2d0eabeab 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ 
b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -381,33 +381,43 @@ subsampling: exclude: "--exclude-where 'region={region}'" # Custom subsampling logic for global region over 1m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_1m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 + max_date: "--max-date 1M" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 1M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 1M" exclude: "--exclude-where 'region!=South America'" oceania_early: @@ -424,15 +434,25 @@ subsampling: group_by: "country week" max_sequences: 600 min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 1M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country 
week" - max_sequences: 1500 + max_sequences: 600 min_date: "--min-date 1M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 1M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division week" - max_sequences: 1500 + max_sequences: 600 min_date: "--min-date 1M" exclude: "--exclude-where 'region!=North America'" south_america_recent: @@ -447,33 +467,43 @@ subsampling: exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over 2m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_2m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 + max_date: "--max-date 2M" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 2M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 2M" exclude: "--exclude-where 'region!=South America'" oceania_early: 
@@ -483,128 +513,166 @@ subsampling: exclude: "--exclude-where 'region!=Oceania'" africa_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Africa'" asia_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 2M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division week" + max_sequences: 500 + min_date: "--min-date 2M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=North America'" south_america_recent: group_by: "country week" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=South America'" oceania_recent: group_by: "division week" - max_sequences: 200 + max_sequences: 100 min_date: "--min-date 2M" exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over 6m - # 4000 total - # 4:1 ratio of focal to context - # all regions equal except Oceania at 33% + # 5125 total (expect ~3400) + # 4:1 ratio of recent to early + # all eight regions equal except Oceania at 20% nextstrain_global_6m: africa_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 
'region!=Asia' 'country=China' 'country=India'" + china_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 6M" + exclude: "--exclude-where 'country!=China'" europe_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Europe'" + india_early: + group_by: "division year month" + max_sequences: 125 + max_date: "--max-date 6M" + exclude: "--exclude-where 'country!=India'" north_america_early: group_by: "division year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=North America'" south_america_early: group_by: "country year month" - max_sequences: 150 + max_sequences: 125 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_early: group_by: "division year month" - max_sequences: 50 + max_sequences: 25 max_date: "--max-date 6M" exclude: "--exclude-where 'region!=Oceania'" africa_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Africa'" asia_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia'" + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china_recent: + group_by: "division year month" + max_sequences: 500 + min_date: "--min-date 6M" + exclude: "--exclude-where 'country!=China'" europe_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Europe'" + india_recent: + group_by: "division year month" + max_sequences: 500 + min_date: "--min-date 6M" + exclude: "--exclude-where 'country!=India'" north_america_recent: group_by: "division year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=North 
America'" south_america_recent: group_by: "country year month" - max_sequences: 600 + max_sequences: 500 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=South America'" oceania_recent: group_by: "division year month" - max_sequences: 200 + max_sequences: 100 min_date: "--min-date 6M" exclude: "--exclude-where 'region!=Oceania'" # Custom subsampling logic for global region over all-time - # 4000 total - # all regions equal except Oceania at 33% + # 4320 total (expect ~3200) + # all eight regions equal except Oceania at 20% nextstrain_global_all_time: africa: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=Africa'" asia: group_by: "country year month" - max_sequences: 750 - exclude: "--exclude-where 'region!=Asia'" + max_sequences: 600 + exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" + china: + group_by: "division year month" + max_sequences: 600 + exclude: "--exclude-where 'country!=China'" europe: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=Europe'" + india: + group_by: "division year month" + max_sequences: 600 + exclude: "--exclude-where 'country!=India'" north_america: group_by: "division year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=North America'" south_america: group_by: "country year month" - max_sequences: 750 + max_sequences: 600 exclude: "--exclude-where 'region!=South America'" oceania: group_by: "division year month" - max_sequences: 250 + max_sequences: 120 exclude: "--exclude-where 'region!=Oceania'" # GenBank data includes "Wuhan-Hu-1/2019" which we use as the root for this build