Fix a bug in saving vocab during distributed training (allenai#4705)
* is_master bug fix

* changelog

* remove is_master

* changelog
eladsegal committed Oct 5, 2020
1 parent 3506e3f commit edcb6d3
Showing 3 changed files with 6 additions and 43 deletions.
CHANGELOG.md (6 changes: 5 additions & 1 deletion)
@@ -36,6 +36,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- When `cached_path` is called on a local archive with `extract_archive=True`, the archive is now extracted into a unique subdirectory of the cache root instead of a subdirectory of the archive's directory. The extraction directory is also unique to the modification time of the archive, so if the file changes, subsequent calls to `cached_path` will know to re-extract the archive.
- Removed the `truncation_strategy` parameter to `PretrainedTransformerTokenizer`. The way we're calling the tokenizer, the truncation strategy takes no effect anyways.

+### Removed

+- Removed `common.util.is_master` function.

### Fixed

- Class decorators now displayed in API docs.
@@ -55,7 +59,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
at around the 9 minute mark.
- Fixed a bug where parameters to a `FromParams` class that are dictionaries wouldn't get logged
when an instance is instantiated `from_params`.

+- Fixed a bug in distributed training where the vocab would be saved from every worker, when it should have been saved by only the local master process.

## [v1.1.0](https://github.com/allenai/allennlp/releases/tag/v1.1.0) - 2020-09-08

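To illustrate the `cached_path` entry in the CHANGELOG hunk above, a brief usage sketch; the archive path is hypothetical and the behaviour shown is only what that changelog line describes:

```python
from allennlp.common.file_utils import cached_path

# "/data/embeddings.tar.gz" is a hypothetical local archive. Per the changelog
# entry above, extract_archive=True extracts it into a unique subdirectory of
# the cache root, keyed to the archive's modification time, so a changed file
# is re-extracted on a later call.
extracted_dir = cached_path("/data/embeddings.tar.gz", extract_archive=True)
print(extracted_dir)  # path to the extraction directory inside the cache
```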
allennlp/commands/train.py (3 changes: 1 addition & 2 deletions)
@@ -374,7 +374,6 @@ def _train_worker(

    distributed = world_size > 1

-    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []
@@ -652,7 +651,7 @@ def from_partial_objects(
        # case, we're trivially the master. In the distributed case this is safe
        # to do without worrying about race conditions since saving and loading
        # the vocab involves acquiring a file lock.
-        if common_util.is_master():
+        if local_rank == 0:
            vocabulary_path = os.path.join(serialization_dir, "vocabulary")
            vocabulary_.save_to_files(vocabulary_path)

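The comment kept in the hunk above explains the design: only the local master writes the vocabulary, and a file lock inside the save/load path makes concurrent access safe. A minimal sketch of that pattern, not the actual AllenNLP implementation (`vocab` is a stand-in object and the lock is made explicit here with the `filelock` package):

```python
import os

from filelock import FileLock


def save_vocab_if_local_master(local_rank: int, serialization_dir: str, vocab) -> None:
    """Write the vocabulary from the local master (local_rank == 0) only."""
    if local_rank != 0:
        return  # other workers on this node skip the write entirely
    vocabulary_path = os.path.join(serialization_dir, "vocabulary")
    os.makedirs(vocabulary_path, exist_ok=True)
    # The lock guards against concurrent writers, e.g. local masters on
    # different nodes that share the same filesystem.
    with FileLock(os.path.join(vocabulary_path, ".lock")):
        vocab.save_to_files(vocabulary_path)  # stand-in for Vocabulary.save_to_files
```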
allennlp/common/util.py (40 changes: 0 additions & 40 deletions)
@@ -492,46 +492,6 @@ def flatten_filename(file_path: str) -> str:
    return file_path.replace("/", "_SLASH_")


-def is_master(
-    global_rank: int = None, world_size: int = None, num_procs_per_node: int = None
-) -> bool:
-    """
-    Checks if the process is a "master" of its node in a distributed process group. If a
-    process group is not initialized, this returns `True`.
-
-    # Parameters
-
-    global_rank : `int` ( default = `None` )
-        Global rank of the process if in a distributed process group. If not
-        given, rank is obtained using `torch.distributed.get_rank()`
-    world_size : `int` ( default = `None` )
-        Number of processes in the distributed group. If not
-        given, this is obtained using `torch.distributed.get_world_size()`
-    num_procs_per_node: `int` ( default = `None` )
-        Number of GPU processes running per node
-    """
-
-    # In non-distributed case, a "master" process doesn't make any
-    # sense. So instead of raising an error, returning True would
-    # make things less painful
-    if not is_distributed():
-        return True
-
-    if global_rank is None:
-        global_rank = dist.get_rank()
-
-    if world_size is None:
-        world_size = dist.get_world_size()
-
-    if num_procs_per_node is None and os.environ:
-        num_procs_per_node = int(os.environ.get("ALLENNLP_PROCS_PER_NODE", world_size))
-
-    # rank == 0 would do in a single-node multi-GPU setup. However,
-    # in a multi-node case, every node has a logical master and hence
-    # the mod(%) op.
-    return global_rank % (world_size / num_procs_per_node) == 0


def is_distributed() -> bool:
    """
    Checks if the distributed process group is available and has been initialized
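For context on the deleted helper: when it is called with no arguments in a single-node job, `num_procs_per_node` defaults to `world_size`, the divisor in the final return is 1, and every rank passes the check, which lines up with the "saved from every worker" symptom in the changelog. A minimal sketch comparing the old check with the `local_rank == 0` check that replaces it (plain Python, no `torch.distributed` involved):

```python
# Single node running 4 workers: world_size == num_procs_per_node.
world_size = 4
num_procs_per_node = 4

for global_rank in range(world_size):
    local_rank = global_rank % num_procs_per_node
    old_is_master = global_rank % (world_size / num_procs_per_node) == 0  # divisor is 1.0
    new_is_master = local_rank == 0
    print(global_rank, old_is_master, new_is_master)

# Output:
# 0 True True
# 1 True False   <- the old check reports every worker as a "master"
# 2 True False
# 3 True False
```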
