Skip to content

Commit

Permalink
🚧 Use population-based weighted sampling for Asia builds
Browse files Browse the repository at this point in the history
This replaces the Asia/China/India split with population-based weighted
sampling (possible in Augur version X.X.X).

This requires changing the geographical grouping resolution from
division to country, but I assume it was only grouped by division in an
attempt to have varying group sizes per country, and that
population-based weighting is an acceptable replacement.
  • Loading branch information
victorlin committed May 3, 2024
1 parent ba0d7ea commit 31e1099
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 225 deletions.
144 changes: 32 additions & 112 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -273,30 +273,18 @@ subsampling:

# Custom subsampling logic for region Asia over 1m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_1m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 700
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -305,22 +293,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
Expand All @@ -330,30 +307,18 @@ subsampling:

# Custom subsampling logic for region Asia over 2m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_2m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 700
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -362,22 +327,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
Expand All @@ -387,30 +341,18 @@ subsampling:

# Custom subsampling logic for region Asia over 6m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_6m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 700
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -419,22 +361,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division year month"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country year month"
Expand All @@ -443,27 +374,16 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over all-time
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_all_time:
# Focal samples for Asia
asia:
group_by: "division year month"
max_sequences: 1500
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Focal samples for China
china:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=China'"
# Focal samples for India
india:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=India'"
group_by: "country year month"
group_by_weights: "data/country_population_weights.tsv"
max_sequences: 3500
exclude: "--exclude-where 'region!=Asia'"
# Contextual samples from the rest of the world
context:
group_by: "country year month"
Expand Down

0 comments on commit 31e1099

Please sign in to comment.