Update global subsampling

In the current "global" analyses, treating China and India each as just another country in Asia resulted in much lower per-capita sampling rates for these two countries than for smaller countries. For example, in the current gisaid/global/6m tree we have 66 viruses from Guatemala (population 17M), 62 viruses from Costa Rica (population 5M), 18 viruses from India (population 1400M) and 21 viruses from China (population 1400M). This is up to a ~1000-fold difference in per-capita sampling intensity (Costa Rica vs. India). This commit partially addresses this issue by splitting out China and India into their own buckets when subsampling. This results in buckets of North America (580M), South America (420M), Europe (750M), Africa (1.2B), Oceania (44M), India (1.4B), China (1.4B) and Asia minus India and China (1.8B). Additionally, this commit makes a small correction that reduces Oceania's sequence count to 20% of each other region's count, down from the previous 33%.
nextstrain · Jun 14, 2023 · 9fd01c6 · 9fd01c6
1 parent a29a964
commit 9fd01c6
Show file tree

Hide file tree

Showing 3 changed files with 362 additions and 158 deletions.
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -389,33 +389,43 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -432,12 +442,22 @@ subsampling:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
       max_sequences: 600
@@ -455,33 +475,43 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -491,128 +521,166 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division week"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 4500 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 50
+      max_sequences: 25
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over all-time
-  # 4000 total
-  # all regions equal except Oceania at 33%
+  # 4320 total (expect ~3200)
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Africa'"
     asia:
       group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Asia'"
+      max_sequences: 600
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=China'"
     europe:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Europe'"
+    india:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=India'"
     north_america:
       group_by: "division year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=North America'"
     south_america:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=South America'"
     oceania:
       group_by: "division year month"
-      max_sequences: 250
+      max_sequences: 120
       exclude: "--exclude-where 'region!=Oceania'"
 
 # Root to clade 21L