[Enhance] Support sync random seed for distributed sampler (#57)
* [Docs] update batch size

* add sync seed

* add sync seed

* update comments
linyq17 committed Mar 30, 2022
1 parent 1c4c270 commit f871f3c
Showing 3 changed files with 81 additions and 9 deletions.
3 changes: 2 additions & 1 deletion mmfewshot/utils/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .collate import multi_pipeline_collate_fn
from .dist_utils import check_dist_init, sync_random_seed
from .infinite_sampler import (DistributedInfiniteGroupSampler,
                               DistributedInfiniteSampler,
                               InfiniteGroupSampler, InfiniteSampler)
@@ -11,5 +12,5 @@
'multi_pipeline_collate_fn', 'local_numpy_seed',
'InfiniteEpochBasedRunner', 'InfiniteSampler', 'InfiniteGroupSampler',
'DistributedInfiniteSampler', 'DistributedInfiniteGroupSampler',
'get_root_logger'
'get_root_logger', 'check_dist_init', 'sync_random_seed'
]
45 changes: 45 additions & 0 deletions mmfewshot/utils/dist_utils.py
@@ -0,0 +1,45 @@
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import get_dist_info


def check_dist_init():
    """Return True if torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()


def sync_random_seed(seed=None, device='cuda'):
"""Propagating the seed of rank 0 to all other ranks.
Make sure different ranks share the same seed. All workers must call
this function, otherwise it will deadlock. This method is generally used in
`DistributedSampler`, because the seed should be identical across all
processes in the distributed group.
In distributed sampling, different ranks should sample non-overlapped
data in the dataset. Therefore, this function is used to make sure that
each rank shuffles the data indices in the same order based
on the same seed. Then different ranks could use different indices
to select non-overlapped data from the same data list.
Args:
seed (int, Optional): The seed. Default to None.
device (str): The device where the seed will be put on.
Default to 'cuda'.
Returns:
int: Seed to be used.
"""
if seed is None:
seed = np.random.randint(2**31)
assert isinstance(seed, int)

rank, world_size = get_dist_info()

if world_size == 1:
return seed

if rank == 0:
random_num = torch.tensor(seed, dtype=torch.int32, device=device)
else:
random_num = torch.tensor(0, dtype=torch.int32, device=device)
dist.broadcast(random_num, src=0)
return random_num.item()
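
A minimal usage sketch (not part of this commit) of how sync_random_seed is typically consumed by a distributed sampler: every rank calls it, rank 0's seed is broadcast to the rest, and all ranks therefore build the same permutation before taking disjoint strided slices of it. The _ToySampler class below is a hypothetical illustration; only sync_random_seed and get_dist_info come from the libraries touched here.

import torch
from mmcv.runner import get_dist_info

from mmfewshot.utils import sync_random_seed


class _ToySampler:
    """Hypothetical sampler showing the intended use of sync_random_seed."""

    def __init__(self, dataset, seed=None):
        self.rank, self.num_replicas = get_dist_info()
        self.dataset = dataset
        # Identical on every rank after the broadcast from rank 0.
        self.seed = sync_random_seed(seed)
        self.epoch = 0

    def __iter__(self):
        g = torch.Generator()
        # Same seed (plus epoch offset) on every rank -> same permutation.
        g.manual_seed(self.seed + self.epoch)
        indices = torch.randperm(len(self.dataset), generator=g).tolist()
        # Each rank keeps a strided, non-overlapping subset of the indices.
        return iter(indices[self.rank::self.num_replicas])
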
42 changes: 34 additions & 8 deletions mmfewshot/utils/infinite_sampler.py
@@ -8,6 +8,8 @@
from mmcv.runner import get_dist_info
from torch.utils.data.sampler import Sampler

from .dist_utils import sync_random_seed


class InfiniteSampler(Sampler):
"""Return a infinite stream of index.
@@ -28,7 +30,13 @@ def __init__(self,
seed: int = 0,
shuffle: bool = True) -> None:
self.dataset = dataset
self.seed = seed if seed is not None else 0
        # In distributed sampling, different ranks should sample
        # non-overlapping data from the dataset. Therefore, sync_random_seed
        # is used to make sure that each rank shuffles the data indices in
        # the same order based on the same seed. Then different ranks can
        # use different indices to select non-overlapping data from the
        # same data list.
self.seed = sync_random_seed(seed)
self.shuffle = shuffle
self.size = len(dataset)
self.indices = self._indices()
@@ -37,7 +45,7 @@ def __init__(self,
def _infinite_indices(self) -> Iterator:
"""Infinitely yield a sequence of indices."""
g = torch.Generator()
g.manual_seed(self.seed)
g.manual_seed(self.seed + self.epoch)
while True:
if self.shuffle:
yield from torch.randperm(self.size, generator=g).tolist()
@@ -89,7 +97,13 @@ def __init__(self,
shuffle: bool = True) -> None:
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.seed = seed if seed is not None else 0
        # In distributed sampling, different ranks should sample
        # non-overlapping data from the dataset. Therefore, sync_random_seed
        # is used to make sure that each rank shuffles the data indices in
        # the same order based on the same seed. Then different ranks can
        # use different indices to select non-overlapping data from the
        # same data list.
self.seed = sync_random_seed(seed)
self.shuffle = shuffle

assert hasattr(self.dataset, 'flag')
@@ -105,7 +119,7 @@ def __init__(self,
def _infinite_indices(self) -> Iterator:
"""Infinitely yield a sequence of indices."""
g = torch.Generator()
g.manual_seed(self.seed)
g.manual_seed(self.seed + self.epoch)
while True:
if self.shuffle:
yield from torch.randperm(self.size, generator=g).tolist()
@@ -168,7 +182,13 @@ def __init__(self,
self.rank = rank
self.num_replicas = num_replicas
self.dataset = dataset
self.seed = seed if seed is not None else 0
        # In distributed sampling, different ranks should sample
        # non-overlapping data from the dataset. Therefore, sync_random_seed
        # is used to make sure that each rank shuffles the data indices in
        # the same order based on the same seed. Then different ranks can
        # use different indices to select non-overlapping data from the
        # same data list.
self.seed = sync_random_seed(seed)
self.shuffle = shuffle
self.size = len(dataset)
self.indices = self._indices_of_rank()
@@ -177,7 +197,7 @@ def __init__(self,
def _infinite_indices(self) -> Iterator:
"""Infinitely yield a sequence of indices."""
g = torch.Generator()
g.manual_seed(self.seed)
g.manual_seed(self.seed + self.epoch)
while True:
if self.shuffle:
indices = []
@@ -244,7 +264,13 @@ def __init__(self,
self.num_replicas = num_replicas
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.seed = seed if seed is not None else 0
        # In distributed sampling, different ranks should sample
        # non-overlapping data from the dataset. Therefore, sync_random_seed
        # is used to make sure that each rank shuffles the data indices in
        # the same order based on the same seed. Then different ranks can
        # use different indices to select non-overlapping data from the
        # same data list.
self.seed = sync_random_seed(seed)
self.shuffle = shuffle

assert hasattr(self.dataset, 'flag')
Expand All @@ -260,7 +286,7 @@ def __init__(self,
def _infinite_indices(self) -> Iterator:
"""Infinitely yield a sequence of indices."""
g = torch.Generator()
g.manual_seed(self.seed)
g.manual_seed(self.seed + self.epoch)
while True:
if self.shuffle:
indices = []
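
A small self-contained sketch (hypothetical, not from the commit) of why the synced seed matters for the samplers above: when every rank shuffles with the same seed, the per-rank strided slices partition the dataset; with unsynced seeds the slices can overlap and some samples may never be drawn. The rank_indices helper is an illustration only.

import torch


def rank_indices(seed, rank, world_size, size=8):
    # Mirror the samplers' pattern: shuffle with a seeded generator,
    # then let each rank take a strided slice of the permutation.
    g = torch.Generator()
    g.manual_seed(seed)
    perm = torch.randperm(size, generator=g).tolist()
    return perm[rank::world_size]


world_size = 2
# Synced seed: the two slices are disjoint and together cover all 8 samples.
synced = [rank_indices(0, r, world_size) for r in range(world_size)]
print(synced, sorted(synced[0] + synced[1]))  # second list is [0, 1, ..., 7]

# Unsynced seeds: slices can overlap and miss samples.
unsynced = [rank_indices(r, r, world_size) for r in range(world_size)]
print(unsynced, sorted(unsynced[0] + unsynced[1]))
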
