From 9fd01c64f9471638757ab6ce2dd6a43fe090bfeb Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Wed, 14 Jun 2023 15:55:46 -0700
Subject: [PATCH] Update global subsampling

In the current "global" analyses, treating China and India each as just another country in Asia resulted in much lower per-capita sampling rates for these two countries than for smaller countries. For example, in the current gisaid/global/6m tree we have 66 viruses from Guatemala (population 17M), 62 viruses from Costa Rica (population 5M), 18 viruses from India (population 1400M) and 21 viruses from China (population 1400M). Comparing Costa Rica (~12 viruses per million people) with India (~0.013 viruses per million people), this is a ~1000-fold difference in per-capita sampling intensity.

This commit partially addresses this issue by splitting out China and India into their own buckets when subsampling. This results in buckets of North America (580M), South America (420M), Europe (750M), Africa (1.2B), Oceania (44M), India (1.4B), China (1.4B) and Asia minus India and China (1.8B).

Additionally, this commit makes a small correction, reducing Oceania's sequence count from the previous 33% to 20% of the count used for each of the other regions.
---
 .../nextstrain-gisaid-21L/builds.yaml         | 172 +++++++++++------
 .../nextstrain-gisaid/builds.yaml             | 172 +++++++++++------
 .../nextstrain-open/builds.yaml               | 176 ++++++++++++------
 3 files changed, 362 insertions(+), 158 deletions(-)

diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
index b6c8116aa..39a872a55 100644
--- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -389,33 +389,43 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -432,12 +442,22 @@ subsampling:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
       max_sequences: 600
@@ -455,33 +475,43 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -491,128 +521,166 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division week"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 4500 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 50
+      max_sequences: 25
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over all-time
-  # 4000 total
-  # all regions equal except Oceania at 33%
+  # 4320 total (expect ~3200)
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Africa'"
     asia:
       group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Asia'"
+      max_sequences: 600
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=China'"
     europe:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Europe'"
+    india:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=India'"
     north_america:
       group_by: "division year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=North America'"
     south_america:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=South America'"
     oceania:
       group_by: "division year month"
-      max_sequences: 250
+      max_sequences: 120
       exclude: "--exclude-where 'region!=Oceania'"
 
 # Root to clade 21L
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 751b0bc4a..deb8de43e 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -381,33 +381,43 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -424,12 +434,22 @@ subsampling:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
       max_sequences: 600
@@ -447,33 +467,43 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -483,128 +513,166 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division week"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 4500 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 50
+      max_sequences: 25
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over all-time
-  # 4000 total
-  # all regions equal except Oceania at 33%
+  # 4320 total (expect ~3200)
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Africa'"
     asia:
       group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Asia'"
+      max_sequences: 600
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=China'"
     europe:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Europe'"
+    india:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=India'"
     north_america:
       group_by: "division year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=North America'"
     south_america:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=South America'"
     oceania:
       group_by: "division year month"
-      max_sequences: 250
+      max_sequences: 120
       exclude: "--exclude-where 'region!=Oceania'"
 
 # if different traits should be reconstructed for some builds, specify here
diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
index 25b32ea57..2d0eabeab 100644
--- a/nextstrain_profiles/nextstrain-open/builds.yaml
+++ b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -381,33 +381,43 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -424,15 +434,25 @@ subsampling:
       group_by: "country week"
       max_sequences: 600
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
-      max_sequences: 1500
+      max_sequences: 600
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 1M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
-      max_sequences: 1500
+      max_sequences: 600
       min_date: "--min-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
@@ -447,33 +467,43 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 5125 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
@@ -483,128 +513,166 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division week"
+      max_sequences: 500
+      min_date: "--min-date 2M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country week"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division week"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 4000 total
-  # 4:1 ratio of focal to context
-  # all regions equal except Oceania at 33%
+  # 4500 total (expect ~3400)
+  # 4:1 ratio of recent to early
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_early:
+      group_by: "division year month"
+      max_sequences: 125
+      max_date: "--max-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 125
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 50
+      max_sequences: 25
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia'"
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=China'"
     europe_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
+    india_recent:
+      group_by: "division year month"
+      max_sequences: 500
+      min_date: "--min-date 6M"
+      exclude: "--exclude-where 'country!=India'"
     north_america_recent:
       group_by: "division year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_recent:
       group_by: "country year month"
-      max_sequences: 600
+      max_sequences: 500
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_recent:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 100
       min_date: "--min-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over all-time
-  # 4000 total
-  # all regions equal except Oceania at 33%
+  # 4320 total (expect ~3200)
+  # all eight regions equal except Oceania at 20%
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Africa'"
     asia:
       group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Asia'"
+      max_sequences: 600
+      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
+    china:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=China'"
     europe:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=Europe'"
+    india:
+      group_by: "division year month"
+      max_sequences: 600
+      exclude: "--exclude-where 'country!=India'"
     north_america:
       group_by: "division year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=North America'"
     south_america:
       group_by: "country year month"
-      max_sequences: 750
+      max_sequences: 600
       exclude: "--exclude-where 'region!=South America'"
     oceania:
       group_by: "division year month"
-      max_sequences: 250
+      max_sequences: 120
       exclude: "--exclude-where 'region!=Oceania'"
 
 # GenBank data includes "Wuhan-Hu-1/2019" which we use as the root for this build