🚧 Use population-based weighted sampling for Asia builds

This replaces the separate Asia/China/India subsampling buckets with a single Asia bucket that uses population-based weighted sampling (possible in Augur version X.X.X). Doing so requires changing the geographical grouping resolution from division to country. I assume the builds were grouped by division only as a way to get varying group sizes per country, in which case population-based weighting is an acceptable replacement.
nextstrain · May 3, 2024 · 31e1099 · 31e1099
1 parent ba0d7ea
commit 31e1099
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 225 deletions.
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -273,30 +273,18 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 1m
   # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_1m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -305,22 +293,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -330,30 +307,18 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 2m
   # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_2m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -362,22 +327,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -387,30 +341,18 @@ subsampling:
 
   # Custom subsampling logic for region Asia over 6m
   # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_6m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -419,22 +361,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_recent:
       group_by: "country year month"
@@ -443,27 +374,16 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_all_time:
     # Focal samples for Asia
     asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
+      group_by: "country year month"
+      group_by_weights: "data/country_population_weights.tsv"
+      max_sequences: 3500
+      exclude: "--exclude-where 'region!=Asia'"
     # Contextual samples from the rest of the world
     context:
       group_by: "country year month"