Skip to content

Commit

Permalink
Update global subsampling
Browse files Browse the repository at this point in the history
In the current "global" analyses, treating China and India each as just another country in Asia was resulting in much lower per-capita sampling rates for these two countries. For example, in the current gisaid/global/6m tree we have 66 viruses from Guatemala (population 17M), 62 viruses from Costa Rica (population 5M), 18 viruses from India (population 1400M) and 21 viruses from China (population 1400M). This is a difference of up to ~1000-fold in per-capita sampling intensity.

This commit partially addresses this issue by splitting out China and India into their own buckets when subsampling. This results in buckets of North America (580M), South America (420M), Europe (750M), Africa (1.2B), Oceania (44M), India (1.4B), China (1.4B) and Asia minus India and China (1.8B).

Additionally, this commit makes a small correction, reducing Oceania's sequence count to 20% of that of the other regions, down from the previous 33%.
  • Loading branch information
trvrb committed Jun 14, 2023
1 parent a29a964 commit 9fd01c6
Show file tree
Hide file tree
Showing 3 changed files with 362 additions and 158 deletions.
172 changes: 120 additions & 52 deletions nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -389,33 +389,43 @@ subsampling:
exclude: "--exclude-where 'region={region}'"

# Custom subsampling logic for global region over 1m
# 4000 total
# 4:1 ratio of focal to context
# all regions equal except Oceania at 33%
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
nextstrain_global_1m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia'"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
Expand All @@ -432,12 +442,22 @@ subsampling:
group_by: "country week"
max_sequences: 600
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Asia'"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_recent:
group_by: "division week"
max_sequences: 500
min_date: "--min-date 1M"
exclude: "--exclude-where 'country!=China'"
europe_recent:
group_by: "country week"
max_sequences: 600
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Europe'"
india_recent:
group_by: "division week"
max_sequences: 500
min_date: "--min-date 1M"
exclude: "--exclude-where 'country!=India'"
north_america_recent:
group_by: "division week"
max_sequences: 600
Expand All @@ -455,33 +475,43 @@ subsampling:
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for global region over 2m
# 4000 total
# 4:1 ratio of focal to context
# all regions equal except Oceania at 33%
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
nextstrain_global_2m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia'"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
Expand All @@ -491,128 +521,166 @@ subsampling:
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
group_by: "country week"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Africa'"
asia_recent:
group_by: "country week"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_recent:
group_by: "division week"
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia'"
exclude: "--exclude-where 'country!=China'"
europe_recent:
group_by: "country week"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Europe'"
india_recent:
group_by: "division week"
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'country!=India'"
north_america_recent:
group_by: "division week"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=North America'"
south_america_recent:
group_by: "country week"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=South America'"
oceania_recent:
group_by: "division week"
max_sequences: 200
max_sequences: 100
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for global region over 6m
# 4000 total
# 4:1 ratio of focal to context
# all regions equal except Oceania at 33%
# 4500 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
nextstrain_global_6m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia'"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 50
max_sequences: 25
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
group_by: "country year month"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Africa'"
asia_recent:
group_by: "country year month"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Asia'"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_recent:
group_by: "division year month"
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'country!=China'"
europe_recent:
group_by: "country year month"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Europe'"
india_recent:
group_by: "division year month"
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'country!=India'"
north_america_recent:
group_by: "division year month"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=North America'"
south_america_recent:
group_by: "country year month"
max_sequences: 600
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=South America'"
oceania_recent:
group_by: "division year month"
max_sequences: 200
max_sequences: 100
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for global region over all-time
# 4000 total
# all regions equal except Oceania at 33%
# 4320 total (expect ~3200)
# all eight regions equal except Oceania at 20%
nextstrain_global_all_time:
africa:
group_by: "country year month"
max_sequences: 750
max_sequences: 600
exclude: "--exclude-where 'region!=Africa'"
asia:
group_by: "country year month"
max_sequences: 750
exclude: "--exclude-where 'region!=Asia'"
max_sequences: 600
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china:
group_by: "division year month"
max_sequences: 600
exclude: "--exclude-where 'country!=China'"
europe:
group_by: "country year month"
max_sequences: 750
max_sequences: 600
exclude: "--exclude-where 'region!=Europe'"
india:
group_by: "division year month"
max_sequences: 600
exclude: "--exclude-where 'country!=India'"
north_america:
group_by: "division year month"
max_sequences: 750
max_sequences: 600
exclude: "--exclude-where 'region!=North America'"
south_america:
group_by: "country year month"
max_sequences: 750
max_sequences: 600
exclude: "--exclude-where 'region!=South America'"
oceania:
group_by: "division year month"
max_sequences: 250
max_sequences: 120
exclude: "--exclude-where 'region!=Oceania'"

# Root to clade 21L
Expand Down

0 comments on commit 9fd01c6

Please sign in to comment.