From 52501d66ba2aaa3774245a8711cad2c323269747 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 11 Apr 2024 17:11:33 -0700 Subject: [PATCH] Use weighted sampling --- generate-subsampling-config.py | 137 +++++++----------------- subsampling/africa_1m.yaml | 9 +- subsampling/africa_2m.yaml | 9 +- subsampling/africa_6m.yaml | 9 +- subsampling/africa_all-time.yaml | 5 +- subsampling/asia_1m.yaml | 17 +-- subsampling/asia_2m.yaml | 17 +-- subsampling/asia_6m.yaml | 17 +-- subsampling/asia_all-time.yaml | 9 +- subsampling/europe_1m.yaml | 9 +- subsampling/europe_2m.yaml | 9 +- subsampling/europe_6m.yaml | 9 +- subsampling/europe_all-time.yaml | 5 +- subsampling/global_1m.yaml | 33 +++--- subsampling/global_2m.yaml | 33 +++--- subsampling/global_6m.yaml | 33 +++--- subsampling/global_all-time.yaml | 17 +-- subsampling/nextstrain_ci_sampling.yaml | 5 +- subsampling/north-america_1m.yaml | 9 +- subsampling/north-america_2m.yaml | 9 +- subsampling/north-america_6m.yaml | 9 +- subsampling/north-america_all-time.yaml | 5 +- subsampling/oceania_1m.yaml | 9 +- subsampling/oceania_2m.yaml | 9 +- subsampling/oceania_6m.yaml | 9 +- subsampling/oceania_all-time.yaml | 5 +- subsampling/reference.yaml | 3 +- subsampling/south-america_1m.yaml | 9 +- subsampling/south-america_2m.yaml | 9 +- subsampling/south-america_6m.yaml | 9 +- subsampling/south-america_all-time.yaml | 5 +- 31 files changed, 227 insertions(+), 254 deletions(-) diff --git a/generate-subsampling-config.py b/generate-subsampling-config.py index bf499e9c0..900094959 100644 --- a/generate-subsampling-config.py +++ b/generate-subsampling-config.py @@ -14,7 +14,7 @@ class Sample: group_by: Optional[List[str]] - size: Optional[int] + weight: Optional[int] min_date: Optional[str] max_date: Optional[str] excludes: Optional[List[str]] @@ -38,8 +38,8 @@ def to_dict(self): if self.group_by: options['group_by'] = self.group_by - if self.size: - options['max_sequences'] = self.size + if self.weight: + options['weight'] = self.weight if self.min_date: options['min_date'] = self.min_date @@ -57,9 +57,15 @@ def to_dict(self): class Config: - samples: List[Sample] - def __init__(self): - self.samples = [] + size: int + samples: Optional[List[Sample]] + + def __init__(self, size, samples=None): + if samples == None: + samples = [] + + self.size = size + self.samples = samples def add(self, new_sample: Sample): if any(new_sample.name == sample.name for sample in self.samples): @@ -69,13 +75,14 @@ def add(self, new_sample: Sample): def to_dict(self): return { + 'size': self.size, 'samples': { sample.name: sample.to_dict() for sample in self.samples } } def to_file(self, path): - print(f'Writing {path}. n={sum(sample.size for sample in self.samples)}') + print(f'Writing {path}.') with open(path, 'w') as f: yaml.dump(self.to_dict(), f, sort_keys=False) @@ -124,11 +131,9 @@ def write_region_time_builds(): build_name = f"{region.lower().replace(' ', '-')}_{time.lower()}" filename = Path(SUBSAMPLING_CONFIG_DIR, f"{build_name}.yaml") - config = Config() - # Global gets special treatment because it is not a region. if region == 'Global': - target_size = 5150 + config = Config(size=5150) locations = [ 'Africa', @@ -177,10 +182,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * weights[location] / sum_location_weights - ), + weight=weights[location], excludes=excludes[location], )) else: @@ -193,11 +195,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_EARLY / (WEIGHT_EARLY + WEIGHT_RECENT) - * weights[location] / sum_location_weights - ), + weight=(WEIGHT_EARLY * weights[location]), max_date=time, excludes=excludes[location], )) @@ -210,18 +208,14 @@ def write_region_time_builds(): GROUP_BY_GEOGRAPHICAL_RESOLUTION[location], GROUP_BY_RECENT_TEMPORAL_RESOLUTION[time], ], - size=int( - target_size - * WEIGHT_RECENT / (WEIGHT_EARLY + WEIGHT_RECENT) - * weights[location] / sum_location_weights - ), + weight=(WEIGHT_RECENT * weights[location]), min_date=time, excludes=excludes[location], )) # Asia gets special treatment because two countries must be weighted differently. elif region == 'Asia': - target_size = 4375 + config = Config(size=4375) locations = [ 'Asia', @@ -255,11 +249,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_FOCAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - * weights[location] / sum_location_weights - ), + weight=(WEIGHT_FOCAL * weights[location]), excludes=excludes[location], )) @@ -271,10 +261,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_CONTEXTUAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=WEIGHT_CONTEXTUAL * sum_location_weights, excludes=['region=Asia'], )) @@ -289,12 +276,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_EARLY / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_FOCAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - * weights[location] / sum_location_weights - ), + weight=(WEIGHT_EARLY * WEIGHT_FOCAL * weights[location]), max_date=time, excludes=excludes[location], )) @@ -307,11 +289,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_EARLY / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_CONTEXTUAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=(WEIGHT_EARLY * WEIGHT_CONTEXTUAL * sum_location_weights), max_date=time, excludes=['region=Asia'], )) @@ -327,12 +305,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_RECENT / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_FOCAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - * weights[location] / sum_location_weights - ), + weight=(WEIGHT_RECENT * WEIGHT_FOCAL * weights[location]), min_date=time, excludes=excludes[location], )) @@ -346,18 +319,14 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_RECENT / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_CONTEXTUAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=(WEIGHT_RECENT * WEIGHT_CONTEXTUAL * sum_location_weights), min_date=time, excludes=['region=Asia'], )) # Everything else is a "standard" region with dynamic geographical/temporal grouping. else: - target_size = 4000 + config = Config(size=4000) if time == 'all-time': # Focal sequences for region @@ -368,11 +337,8 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_FOCAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), - excludes=[f'region!={region}'] + weight=WEIGHT_FOCAL, + excludes=[f'region!={region}'], )) # Contextual sequences from the rest of the world @@ -383,11 +349,8 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_CONTEXTUAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), - excludes=[f'region={region}'] + weight=WEIGHT_CONTEXTUAL, + excludes=[f'region={region}'], )) else: # Early focal sequences for region @@ -398,11 +361,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_EARLY / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_FOCAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=(WEIGHT_EARLY * WEIGHT_FOCAL), max_date=time, excludes=[f'region!={region}'], )) @@ -415,11 +374,7 @@ def write_region_time_builds(): 'year', 'month', ], - size=int( - target_size - * WEIGHT_EARLY / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_CONTEXTUAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=(WEIGHT_EARLY * WEIGHT_CONTEXTUAL), max_date=time, excludes=[f'region={region}'], )) @@ -431,11 +386,7 @@ def write_region_time_builds(): GROUP_BY_GEOGRAPHICAL_RESOLUTION[region], GROUP_BY_RECENT_TEMPORAL_RESOLUTION[time], ], - size=int( - target_size - * WEIGHT_RECENT / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_FOCAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=(WEIGHT_RECENT * WEIGHT_FOCAL), min_date=time, excludes=[f'region!={region}'], )) @@ -447,35 +398,27 @@ def write_region_time_builds(): 'country', GROUP_BY_RECENT_TEMPORAL_RESOLUTION[time], ], - size=int( - target_size - * WEIGHT_RECENT / (WEIGHT_EARLY + WEIGHT_RECENT) - * WEIGHT_CONTEXTUAL / (WEIGHT_FOCAL + WEIGHT_CONTEXTUAL) - ), + weight=(WEIGHT_RECENT * WEIGHT_CONTEXTUAL), min_date=time, excludes=[f'region={region}'], )) - # Double check the total sample size. - total_size = sum(sample.size for sample in config.samples) - assert target_size == total_size - config.to_file(filename) def write_reference_build(): - config = Config() + config = Config(size=300) config.add(Sample( name='clades', group_by=['Nextstrain_clade'], - size=300, + weight=1, )) filename = Path(SUBSAMPLING_CONFIG_DIR, f"reference.yaml") config.to_file(filename) def write_ci_build(): - config = Config() + config = Config(size=30) config.add(Sample( name='region', group_by=[ @@ -483,9 +426,9 @@ def write_ci_build(): 'year', 'month', ], - size=20, + weight=2, disable_probabilistic_sampling=True, - excludes=['region!=Europe'] + excludes=['region!=Europe'], )) config.add(Sample( name='global', @@ -493,7 +436,7 @@ def write_ci_build(): 'year', 'month', ], - size=10, + weight=1, disable_probabilistic_sampling=True, excludes=['region=Europe'], # TODO: add Priority(type=proximity, focus=region) diff --git a/subsampling/africa_1m.yaml b/subsampling/africa_1m.yaml index 82b77d405..a1bf12e9e 100644 --- a/subsampling/africa_1m.yaml +++ b/subsampling/africa_1m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 1M exclude: - region!=Africa @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 1M exclude: - region=Africa @@ -21,7 +22,7 @@ samples: group_by: - country - week - max_sequences: 2560 + weight: 16 min_date: 1M exclude: - region!=Africa @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 1M exclude: - region=Africa diff --git a/subsampling/africa_2m.yaml b/subsampling/africa_2m.yaml index 794ea941a..ae7b79fe2 100644 --- a/subsampling/africa_2m.yaml +++ b/subsampling/africa_2m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 2M exclude: - region!=Africa @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 2M exclude: - region=Africa @@ -21,7 +22,7 @@ samples: group_by: - country - week - max_sequences: 2560 + weight: 16 min_date: 2M exclude: - region!=Africa @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 2M exclude: - region=Africa diff --git a/subsampling/africa_6m.yaml b/subsampling/africa_6m.yaml index d0c9cf907..1c0375919 100644 --- a/subsampling/africa_6m.yaml +++ b/subsampling/africa_6m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 6M exclude: - region!=Africa @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 6M exclude: - region=Africa @@ -21,7 +22,7 @@ samples: group_by: - country - month - max_sequences: 2560 + weight: 16 min_date: 6M exclude: - region!=Africa @@ -29,7 +30,7 @@ samples: group_by: - country - month - max_sequences: 640 + weight: 4 min_date: 6M exclude: - region=Africa diff --git a/subsampling/africa_all-time.yaml b/subsampling/africa_all-time.yaml index b8362000d..5aea3c3d5 100644 --- a/subsampling/africa_all-time.yaml +++ b/subsampling/africa_all-time.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal: group_by: - country - year - month - max_sequences: 3200 + weight: 4 exclude: - region!=Africa context: @@ -12,6 +13,6 @@ samples: - country - year - month - max_sequences: 800 + weight: 1 exclude: - region=Africa diff --git a/subsampling/asia_1m.yaml b/subsampling/asia_1m.yaml index 38f77ab2b..4c677cb7f 100644 --- a/subsampling/asia_1m.yaml +++ b/subsampling/asia_1m.yaml @@ -1,10 +1,11 @@ +size: 4375 samples: asia_early: group_by: - division - year - month - max_sequences: 300 + weight: 12 max_date: 1M exclude: - region!=Asia @@ -15,7 +16,7 @@ samples: - division - year - month - max_sequences: 200 + weight: 8 max_date: 1M exclude: - country!=China @@ -24,7 +25,7 @@ samples: - division - year - month - max_sequences: 200 + weight: 8 max_date: 1M exclude: - country!=India @@ -33,7 +34,7 @@ samples: - country - year - month - max_sequences: 175 + weight: 7 max_date: 1M exclude: - region=Asia @@ -42,7 +43,7 @@ samples: - division - year - month - max_sequences: 1200 + weight: 48 min_date: 1M exclude: - region!=Asia @@ -53,7 +54,7 @@ samples: - division - year - month - max_sequences: 800 + weight: 32 min_date: 1M exclude: - country!=China @@ -62,7 +63,7 @@ samples: - division - year - month - max_sequences: 800 + weight: 32 min_date: 1M exclude: - country!=India @@ -71,7 +72,7 @@ samples: - country - year - month - max_sequences: 700 + weight: 28 min_date: 1M exclude: - region=Asia diff --git a/subsampling/asia_2m.yaml b/subsampling/asia_2m.yaml index 25e1bbfb5..5ff8353fb 100644 --- a/subsampling/asia_2m.yaml +++ b/subsampling/asia_2m.yaml @@ -1,10 +1,11 @@ +size: 4375 samples: asia_early: group_by: - division - year - month - max_sequences: 300 + weight: 12 max_date: 2M exclude: - region!=Asia @@ -15,7 +16,7 @@ samples: - division - year - month - max_sequences: 200 + weight: 8 max_date: 2M exclude: - country!=China @@ -24,7 +25,7 @@ samples: - division - year - month - max_sequences: 200 + weight: 8 max_date: 2M exclude: - country!=India @@ -33,7 +34,7 @@ samples: - country - year - month - max_sequences: 175 + weight: 7 max_date: 2M exclude: - region=Asia @@ -42,7 +43,7 @@ samples: - division - year - month - max_sequences: 1200 + weight: 48 min_date: 2M exclude: - region!=Asia @@ -53,7 +54,7 @@ samples: - division - year - month - max_sequences: 800 + weight: 32 min_date: 2M exclude: - country!=China @@ -62,7 +63,7 @@ samples: - division - year - month - max_sequences: 800 + weight: 32 min_date: 2M exclude: - country!=India @@ -71,7 +72,7 @@ samples: - country - year - month - max_sequences: 700 + weight: 28 min_date: 2M exclude: - region=Asia diff --git a/subsampling/asia_6m.yaml b/subsampling/asia_6m.yaml index fe7c87116..1c618f114 100644 --- a/subsampling/asia_6m.yaml +++ b/subsampling/asia_6m.yaml @@ -1,10 +1,11 @@ +size: 4375 samples: asia_early: group_by: - division - year - month - max_sequences: 300 + weight: 12 max_date: 6M exclude: - region!=Asia @@ -15,7 +16,7 @@ samples: - division - year - month - max_sequences: 200 + weight: 8 max_date: 6M exclude: - country!=China @@ -24,7 +25,7 @@ samples: - division - year - month - max_sequences: 200 + weight: 8 max_date: 6M exclude: - country!=India @@ -33,7 +34,7 @@ samples: - country - year - month - max_sequences: 175 + weight: 7 max_date: 6M exclude: - region=Asia @@ -42,7 +43,7 @@ samples: - division - year - month - max_sequences: 1200 + weight: 48 min_date: 6M exclude: - region!=Asia @@ -53,7 +54,7 @@ samples: - division - year - month - max_sequences: 800 + weight: 32 min_date: 6M exclude: - country!=China @@ -62,7 +63,7 @@ samples: - division - year - month - max_sequences: 800 + weight: 32 min_date: 6M exclude: - country!=India @@ -71,7 +72,7 @@ samples: - country - year - month - max_sequences: 700 + weight: 28 min_date: 6M exclude: - region=Asia diff --git a/subsampling/asia_all-time.yaml b/subsampling/asia_all-time.yaml index 03902de86..1a9063129 100644 --- a/subsampling/asia_all-time.yaml +++ b/subsampling/asia_all-time.yaml @@ -1,10 +1,11 @@ +size: 4375 samples: asia: group_by: - division - year - month - max_sequences: 1500 + weight: 12 exclude: - region!=Asia - country=China @@ -14,7 +15,7 @@ samples: - division - year - month - max_sequences: 1000 + weight: 8 exclude: - country!=China india: @@ -22,7 +23,7 @@ samples: - division - year - month - max_sequences: 1000 + weight: 8 exclude: - country!=India context: @@ -30,6 +31,6 @@ samples: - country - year - month - max_sequences: 875 + weight: 7 exclude: - region=Asia diff --git a/subsampling/europe_1m.yaml b/subsampling/europe_1m.yaml index 7b6b13f22..f023fc520 100644 --- a/subsampling/europe_1m.yaml +++ b/subsampling/europe_1m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 1M exclude: - region!=Europe @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 1M exclude: - region=Europe @@ -21,7 +22,7 @@ samples: group_by: - country - week - max_sequences: 2560 + weight: 16 min_date: 1M exclude: - region!=Europe @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 1M exclude: - region=Europe diff --git a/subsampling/europe_2m.yaml b/subsampling/europe_2m.yaml index c6ae72450..e87979145 100644 --- a/subsampling/europe_2m.yaml +++ b/subsampling/europe_2m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 2M exclude: - region!=Europe @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 2M exclude: - region=Europe @@ -21,7 +22,7 @@ samples: group_by: - country - week - max_sequences: 2560 + weight: 16 min_date: 2M exclude: - region!=Europe @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 2M exclude: - region=Europe diff --git a/subsampling/europe_6m.yaml b/subsampling/europe_6m.yaml index 2cbe4910b..c96b24e01 100644 --- a/subsampling/europe_6m.yaml +++ b/subsampling/europe_6m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 6M exclude: - region!=Europe @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 6M exclude: - region=Europe @@ -21,7 +22,7 @@ samples: group_by: - country - month - max_sequences: 2560 + weight: 16 min_date: 6M exclude: - region!=Europe @@ -29,7 +30,7 @@ samples: group_by: - country - month - max_sequences: 640 + weight: 4 min_date: 6M exclude: - region=Europe diff --git a/subsampling/europe_all-time.yaml b/subsampling/europe_all-time.yaml index 1a1c0556d..f8c1eb0b0 100644 --- a/subsampling/europe_all-time.yaml +++ b/subsampling/europe_all-time.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal: group_by: - country - year - month - max_sequences: 3200 + weight: 4 exclude: - region!=Europe context: @@ -12,6 +13,6 @@ samples: - country - year - month - max_sequences: 800 + weight: 1 exclude: - region=Europe diff --git a/subsampling/global_1m.yaml b/subsampling/global_1m.yaml index 0518c8a7f..c44f90b4f 100644 --- a/subsampling/global_1m.yaml +++ b/subsampling/global_1m.yaml @@ -1,10 +1,11 @@ +size: 5150 samples: africa_early: group_by: - country - year - month - max_sequences: 150 + weight: 30 max_date: 1M exclude: - region!=Africa @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 200 + weight: 40 max_date: 1M exclude: - region!=Asia @@ -24,7 +25,7 @@ samples: - division - year - month - max_sequences: 175 + weight: 35 max_date: 1M exclude: - country!=China @@ -33,7 +34,7 @@ samples: - country - year - month - max_sequences: 125 + weight: 25 max_date: 1M exclude: - region!=Europe @@ -42,7 +43,7 @@ samples: - division - year - month - max_sequences: 175 + weight: 35 max_date: 1M exclude: - country!=India @@ -51,7 +52,7 @@ samples: - division - year - month - max_sequences: 100 + weight: 20 max_date: 1M exclude: - region!=North America @@ -60,7 +61,7 @@ samples: - country - year - month - max_sequences: 90 + weight: 18 max_date: 1M exclude: - region!=South America @@ -69,7 +70,7 @@ samples: - division - year - month - max_sequences: 15 + weight: 3 max_date: 1M exclude: - region!=Oceania @@ -77,7 +78,7 @@ samples: group_by: - country - week - max_sequences: 600 + weight: 120 min_date: 1M exclude: - region!=Africa @@ -85,7 +86,7 @@ samples: group_by: - country - week - max_sequences: 800 + weight: 160 min_date: 1M exclude: - region!=Asia @@ -95,7 +96,7 @@ samples: group_by: - division - week - max_sequences: 700 + weight: 140 min_date: 1M exclude: - country!=China @@ -103,7 +104,7 @@ samples: group_by: - country - week - max_sequences: 500 + weight: 100 min_date: 1M exclude: - region!=Europe @@ -111,7 +112,7 @@ samples: group_by: - division - week - max_sequences: 700 + weight: 140 min_date: 1M exclude: - country!=India @@ -119,7 +120,7 @@ samples: group_by: - division - week - max_sequences: 400 + weight: 80 min_date: 1M exclude: - region!=North America @@ -127,7 +128,7 @@ samples: group_by: - country - week - max_sequences: 360 + weight: 72 min_date: 1M exclude: - region!=South America @@ -135,7 +136,7 @@ samples: group_by: - division - week - max_sequences: 60 + weight: 12 min_date: 1M exclude: - region!=Oceania diff --git a/subsampling/global_2m.yaml b/subsampling/global_2m.yaml index 6691ec84b..54c247d98 100644 --- a/subsampling/global_2m.yaml +++ b/subsampling/global_2m.yaml @@ -1,10 +1,11 @@ +size: 5150 samples: africa_early: group_by: - country - year - month - max_sequences: 150 + weight: 30 max_date: 2M exclude: - region!=Africa @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 200 + weight: 40 max_date: 2M exclude: - region!=Asia @@ -24,7 +25,7 @@ samples: - division - year - month - max_sequences: 175 + weight: 35 max_date: 2M exclude: - country!=China @@ -33,7 +34,7 @@ samples: - country - year - month - max_sequences: 125 + weight: 25 max_date: 2M exclude: - region!=Europe @@ -42,7 +43,7 @@ samples: - division - year - month - max_sequences: 175 + weight: 35 max_date: 2M exclude: - country!=India @@ -51,7 +52,7 @@ samples: - division - year - month - max_sequences: 100 + weight: 20 max_date: 2M exclude: - region!=North America @@ -60,7 +61,7 @@ samples: - country - year - month - max_sequences: 90 + weight: 18 max_date: 2M exclude: - region!=South America @@ -69,7 +70,7 @@ samples: - division - year - month - max_sequences: 15 + weight: 3 max_date: 2M exclude: - region!=Oceania @@ -77,7 +78,7 @@ samples: group_by: - country - week - max_sequences: 600 + weight: 120 min_date: 2M exclude: - region!=Africa @@ -85,7 +86,7 @@ samples: group_by: - country - week - max_sequences: 800 + weight: 160 min_date: 2M exclude: - region!=Asia @@ -95,7 +96,7 @@ samples: group_by: - division - week - max_sequences: 700 + weight: 140 min_date: 2M exclude: - country!=China @@ -103,7 +104,7 @@ samples: group_by: - country - week - max_sequences: 500 + weight: 100 min_date: 2M exclude: - region!=Europe @@ -111,7 +112,7 @@ samples: group_by: - division - week - max_sequences: 700 + weight: 140 min_date: 2M exclude: - country!=India @@ -119,7 +120,7 @@ samples: group_by: - division - week - max_sequences: 400 + weight: 80 min_date: 2M exclude: - region!=North America @@ -127,7 +128,7 @@ samples: group_by: - country - week - max_sequences: 360 + weight: 72 min_date: 2M exclude: - region!=South America @@ -135,7 +136,7 @@ samples: group_by: - division - week - max_sequences: 60 + weight: 12 min_date: 2M exclude: - region!=Oceania diff --git a/subsampling/global_6m.yaml b/subsampling/global_6m.yaml index ed16452aa..6d6a712d5 100644 --- a/subsampling/global_6m.yaml +++ b/subsampling/global_6m.yaml @@ -1,10 +1,11 @@ +size: 5150 samples: africa_early: group_by: - country - year - month - max_sequences: 150 + weight: 30 max_date: 6M exclude: - region!=Africa @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 200 + weight: 40 max_date: 6M exclude: - region!=Asia @@ -24,7 +25,7 @@ samples: - division - year - month - max_sequences: 175 + weight: 35 max_date: 6M exclude: - country!=China @@ -33,7 +34,7 @@ samples: - country - year - month - max_sequences: 125 + weight: 25 max_date: 6M exclude: - region!=Europe @@ -42,7 +43,7 @@ samples: - division - year - month - max_sequences: 175 + weight: 35 max_date: 6M exclude: - country!=India @@ -51,7 +52,7 @@ samples: - division - year - month - max_sequences: 100 + weight: 20 max_date: 6M exclude: - region!=North America @@ -60,7 +61,7 @@ samples: - country - year - month - max_sequences: 90 + weight: 18 max_date: 6M exclude: - region!=South America @@ -69,7 +70,7 @@ samples: - division - year - month - max_sequences: 15 + weight: 3 max_date: 6M exclude: - region!=Oceania @@ -77,7 +78,7 @@ samples: group_by: - country - month - max_sequences: 600 + weight: 120 min_date: 6M exclude: - region!=Africa @@ -85,7 +86,7 @@ samples: group_by: - country - month - max_sequences: 800 + weight: 160 min_date: 6M exclude: - region!=Asia @@ -95,7 +96,7 @@ samples: group_by: - division - month - max_sequences: 700 + weight: 140 min_date: 6M exclude: - country!=China @@ -103,7 +104,7 @@ samples: group_by: - country - month - max_sequences: 500 + weight: 100 min_date: 6M exclude: - region!=Europe @@ -111,7 +112,7 @@ samples: group_by: - division - month - max_sequences: 700 + weight: 140 min_date: 6M exclude: - country!=India @@ -119,7 +120,7 @@ samples: group_by: - division - month - max_sequences: 400 + weight: 80 min_date: 6M exclude: - region!=North America @@ -127,7 +128,7 @@ samples: group_by: - country - month - max_sequences: 360 + weight: 72 min_date: 6M exclude: - region!=South America @@ -135,7 +136,7 @@ samples: group_by: - division - month - max_sequences: 60 + weight: 12 min_date: 6M exclude: - region!=Oceania diff --git a/subsampling/global_all-time.yaml b/subsampling/global_all-time.yaml index 6dd4cc5ae..02dcc124b 100644 --- a/subsampling/global_all-time.yaml +++ b/subsampling/global_all-time.yaml @@ -1,10 +1,11 @@ +size: 5150 samples: africa: group_by: - country - year - month - max_sequences: 750 + weight: 30 exclude: - region!=Africa asia: @@ -12,7 +13,7 @@ samples: - country - year - month - max_sequences: 1000 + weight: 40 exclude: - region!=Asia - country=China @@ -22,7 +23,7 @@ samples: - division - year - month - max_sequences: 875 + weight: 35 exclude: - country!=China europe: @@ -30,7 +31,7 @@ samples: - country - year - month - max_sequences: 625 + weight: 25 exclude: - region!=Europe india: @@ -38,7 +39,7 @@ samples: - division - year - month - max_sequences: 875 + weight: 35 exclude: - country!=India north_america: @@ -46,7 +47,7 @@ samples: - division - year - month - max_sequences: 500 + weight: 20 exclude: - region!=North America south_america: @@ -54,7 +55,7 @@ samples: - country - year - month - max_sequences: 450 + weight: 18 exclude: - region!=South America oceania: @@ -62,6 +63,6 @@ samples: - division - year - month - max_sequences: 75 + weight: 3 exclude: - region!=Oceania diff --git a/subsampling/nextstrain_ci_sampling.yaml b/subsampling/nextstrain_ci_sampling.yaml index 68796db1c..c4eff48d9 100644 --- a/subsampling/nextstrain_ci_sampling.yaml +++ b/subsampling/nextstrain_ci_sampling.yaml @@ -1,10 +1,11 @@ +size: 30 samples: region: group_by: - division - year - month - max_sequences: 20 + weight: 2 disable_probabilistic_sampling: true exclude: - region!=Europe @@ -12,7 +13,7 @@ samples: group_by: - year - month - max_sequences: 10 + weight: 1 disable_probabilistic_sampling: true exclude: - region=Europe diff --git a/subsampling/north-america_1m.yaml b/subsampling/north-america_1m.yaml index ef9467190..5d9d51633 100644 --- a/subsampling/north-america_1m.yaml +++ b/subsampling/north-america_1m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - division - year - month - max_sequences: 640 + weight: 4 max_date: 1M exclude: - region!=North America @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 1M exclude: - region=North America @@ -21,7 +22,7 @@ samples: group_by: - division - week - max_sequences: 2560 + weight: 16 min_date: 1M exclude: - region!=North America @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 1M exclude: - region=North America diff --git a/subsampling/north-america_2m.yaml b/subsampling/north-america_2m.yaml index 8e6687418..0b2254915 100644 --- a/subsampling/north-america_2m.yaml +++ b/subsampling/north-america_2m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - division - year - month - max_sequences: 640 + weight: 4 max_date: 2M exclude: - region!=North America @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 2M exclude: - region=North America @@ -21,7 +22,7 @@ samples: group_by: - division - week - max_sequences: 2560 + weight: 16 min_date: 2M exclude: - region!=North America @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 2M exclude: - region=North America diff --git a/subsampling/north-america_6m.yaml b/subsampling/north-america_6m.yaml index 12b01f18e..3bb4ed324 100644 --- a/subsampling/north-america_6m.yaml +++ b/subsampling/north-america_6m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - division - year - month - max_sequences: 640 + weight: 4 max_date: 6M exclude: - region!=North America @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 6M exclude: - region=North America @@ -21,7 +22,7 @@ samples: group_by: - division - month - max_sequences: 2560 + weight: 16 min_date: 6M exclude: - region!=North America @@ -29,7 +30,7 @@ samples: group_by: - country - month - max_sequences: 640 + weight: 4 min_date: 6M exclude: - region=North America diff --git a/subsampling/north-america_all-time.yaml b/subsampling/north-america_all-time.yaml index 596bb4151..2dafb262a 100644 --- a/subsampling/north-america_all-time.yaml +++ b/subsampling/north-america_all-time.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal: group_by: - division - year - month - max_sequences: 3200 + weight: 4 exclude: - region!=North America context: @@ -12,6 +13,6 @@ samples: - country - year - month - max_sequences: 800 + weight: 1 exclude: - region=North America diff --git a/subsampling/oceania_1m.yaml b/subsampling/oceania_1m.yaml index 708a38329..de615689c 100644 --- a/subsampling/oceania_1m.yaml +++ b/subsampling/oceania_1m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - division - year - month - max_sequences: 640 + weight: 4 max_date: 1M exclude: - region!=Oceania @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 1M exclude: - region=Oceania @@ -21,7 +22,7 @@ samples: group_by: - division - week - max_sequences: 2560 + weight: 16 min_date: 1M exclude: - region!=Oceania @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 1M exclude: - region=Oceania diff --git a/subsampling/oceania_2m.yaml b/subsampling/oceania_2m.yaml index 556d30dd2..346eb713f 100644 --- a/subsampling/oceania_2m.yaml +++ b/subsampling/oceania_2m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - division - year - month - max_sequences: 640 + weight: 4 max_date: 2M exclude: - region!=Oceania @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 2M exclude: - region=Oceania @@ -21,7 +22,7 @@ samples: group_by: - division - week - max_sequences: 2560 + weight: 16 min_date: 2M exclude: - region!=Oceania @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 2M exclude: - region=Oceania diff --git a/subsampling/oceania_6m.yaml b/subsampling/oceania_6m.yaml index b8ef10285..229b062c2 100644 --- a/subsampling/oceania_6m.yaml +++ b/subsampling/oceania_6m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - division - year - month - max_sequences: 640 + weight: 4 max_date: 6M exclude: - region!=Oceania @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 6M exclude: - region=Oceania @@ -21,7 +22,7 @@ samples: group_by: - division - month - max_sequences: 2560 + weight: 16 min_date: 6M exclude: - region!=Oceania @@ -29,7 +30,7 @@ samples: group_by: - country - month - max_sequences: 640 + weight: 4 min_date: 6M exclude: - region=Oceania diff --git a/subsampling/oceania_all-time.yaml b/subsampling/oceania_all-time.yaml index 27287e899..094f9c448 100644 --- a/subsampling/oceania_all-time.yaml +++ b/subsampling/oceania_all-time.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal: group_by: - division - year - month - max_sequences: 3200 + weight: 4 exclude: - region!=Oceania context: @@ -12,6 +13,6 @@ samples: - country - year - month - max_sequences: 800 + weight: 1 exclude: - region=Oceania diff --git a/subsampling/reference.yaml b/subsampling/reference.yaml index 6b83fa03c..26272772e 100644 --- a/subsampling/reference.yaml +++ b/subsampling/reference.yaml @@ -1,5 +1,6 @@ +size: 300 samples: clades: group_by: - Nextstrain_clade - max_sequences: 300 + weight: 1 diff --git a/subsampling/south-america_1m.yaml b/subsampling/south-america_1m.yaml index 341313c72..98c9bbb72 100644 --- a/subsampling/south-america_1m.yaml +++ b/subsampling/south-america_1m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 1M exclude: - region!=South America @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 1M exclude: - region=South America @@ -21,7 +22,7 @@ samples: group_by: - country - week - max_sequences: 2560 + weight: 16 min_date: 1M exclude: - region!=South America @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 1M exclude: - region=South America diff --git a/subsampling/south-america_2m.yaml b/subsampling/south-america_2m.yaml index fa90187ea..f9c1de7fa 100644 --- a/subsampling/south-america_2m.yaml +++ b/subsampling/south-america_2m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 2M exclude: - region!=South America @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 2M exclude: - region=South America @@ -21,7 +22,7 @@ samples: group_by: - country - week - max_sequences: 2560 + weight: 16 min_date: 2M exclude: - region!=South America @@ -29,7 +30,7 @@ samples: group_by: - country - week - max_sequences: 640 + weight: 4 min_date: 2M exclude: - region=South America diff --git a/subsampling/south-america_6m.yaml b/subsampling/south-america_6m.yaml index a6592bda1..8605bb273 100644 --- a/subsampling/south-america_6m.yaml +++ b/subsampling/south-america_6m.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal_early: group_by: - country - year - month - max_sequences: 640 + weight: 4 max_date: 6M exclude: - region!=South America @@ -13,7 +14,7 @@ samples: - country - year - month - max_sequences: 160 + weight: 1 max_date: 6M exclude: - region=South America @@ -21,7 +22,7 @@ samples: group_by: - country - month - max_sequences: 2560 + weight: 16 min_date: 6M exclude: - region!=South America @@ -29,7 +30,7 @@ samples: group_by: - country - month - max_sequences: 640 + weight: 4 min_date: 6M exclude: - region=South America diff --git a/subsampling/south-america_all-time.yaml b/subsampling/south-america_all-time.yaml index 5acb7afb1..ddad75bdb 100644 --- a/subsampling/south-america_all-time.yaml +++ b/subsampling/south-america_all-time.yaml @@ -1,10 +1,11 @@ +size: 4000 samples: focal: group_by: - country - year - month - max_sequences: 3200 + weight: 4 exclude: - region!=South America context: @@ -12,6 +13,6 @@ samples: - country - year - month - max_sequences: 800 + weight: 1 exclude: - region=South America