rename is_postnorm to is_postscore (#107)

microsoft · Feb 26, 2022 · bddc915 · bddc915
1 parent 712bf2e
commit bddc915
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -99,11 +99,14 @@ Usage of MOELayer:
                               or a list of dict-type gate descriptions, e.g. [{'type': 'top', 'k': 2}, {'type': 'top', 'k': 2}],
                               the value of k in top-gating can be also negative, like -2, which indicates one GPU will hold 1/(-k) parameters of an expert
         model_dim        : the number of channels for MOE's input tensor
-        experts          : a dict-type config for builtin expert network, or a torch.nn.Module-type custom expert network
+        experts          : a dict-type config for builtin expert network
         scan_expert_func : allow users to specify a lambda function to iterate each experts param, e.g. `scan_expert_func = lambda name, param: setattr(param, 'expert', True)`
         result_func      : allow users to specify a lambda function to format the MoE output and aux_loss, e.g. `result_func = lambda output: (output, output.l_aux)`
         group            : specify the explicit communication group of all_to_all
         seeds            : a tuple containing a triple of ints to specify manual seeds of (shared params, local params, other params after MoE's)
+        a2a_ffn_overlap_degree : the value to control a2a overlap depth, 1 by default for no overlap, 2 for overlap a2a with half gemm, ..
+        parallel_type    : the parallel method to compute MoE, valid types: 'auto', 'data', 'model'
+        pad_samples      : whether to automatically pad newly arriving input data to the maximum data size seen in history
 
 * Usage of dict-type Experts Config:
 

diff --git a/tutel/impls/communicate.py b/tutel/impls/communicate.py
@@ -150,8 +150,12 @@ def init(group: dist.ProcessGroup, num_split: int, split_dim: int) -> None:
 
 
 class AllToAll(torch.autograd.Function):
+    _use_builtins = False
+
     @staticmethod
     def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor):
+        AllToAll._use_builtins = True
+
         ctx.group = group
         world_size = get_world_size(group)
         if world_size <= 1:

diff --git a/tutel/impls/fast_dispatch.py b/tutel/impls/fast_dispatch.py
@@ -89,13 +89,13 @@ def __init__(self, num_global_experts, capacity, model_dim, dispatch_dtype):
         self.original_dtype = dispatch_dtype
         self.aligned_dim = model_dim // (2 if self.dtype == torch.float16 else 1)
 
-    def update(self, indices_, locations_, gates_, capacity=None, is_postnorm=True):
+    def update(self, indices_, locations_, gates_, capacity=None, is_postscore=True):
         self.indices_ = [x.to(torch.int32).view(-1) for x in indices_]
         self.locations_ = [x.to(torch.int32) for x in locations_]
         self.gates_ = [x.to(self.dtype) for x in gates_]
         sample_size = int(self.indices_[0].size(0))
         capacity = int(capacity) or self.capacity
-        self.is_postnorm = is_postnorm
+        self.is_postscore = is_postscore
 
         if sample_size != self.expected_sample_size or capacity != self.capacity:
             self.expected_sample_size, self.capacity = sample_size, capacity
@@ -109,13 +109,13 @@ def update(self, indices_, locations_, gates_, capacity=None, is_postnorm=True):
                 self.func_fwd, self.func_bwd_data, self.func_bwd_gate, self.ones_helper = self.kernel_pool[tuple((sample_size, capacity))]
 
     def encode(self, data):
-        if self.is_postnorm:
+        if self.is_postscore:
             return GatingEncoder.apply(self, data.to(self.dtype)).to(self.original_dtype)
         else:
             return GatingEncoder.apply(self, data.to(self.dtype), *self.gates_).to(self.original_dtype)
 
     def decode(self, data):
-        if self.is_postnorm:
+        if self.is_postscore:
             return GatingDecoder.apply(self, data.to(self.dtype), *self.gates_).to(self.original_dtype)
         else:
             return GatingDecoder.apply(self, data.to(self.dtype)).to(self.original_dtype)

diff --git a/tutel/impls/moe_layer.py b/tutel/impls/moe_layer.py
@@ -48,20 +48,22 @@ def __init__(
         num_global_experts,
         a2a_ffn_overlap_degree=1,
         capacity_factor=1.0,
-        top_k=2,
+        k=2,
         batch_prioritized_routing=False,
-        **kwargs,
+        fp32_gate=False,
+        is_postscore=True,
+        input_dropout_p=0,
     ):
         super().__init__()
-        top_k = min(top_k, num_global_experts)
-        self.top_k = top_k
+        k = min(k, num_global_experts)
+        self.top_k = k
         assert self.top_k > 0, "Top-k value %d is not valid." % self.top_k
 
         self.wg = torch.nn.Linear(model_dim, num_global_experts, bias=False)
 
-        self.fp32_gate = kwargs.get('fp32_gate', False)
+        self.fp32_gate = fp32_gate
         if self.fp32_gate:
-          self.wg = self.wg.float()
+            self.wg = self.wg.float()
 
         self.capacity_factor = float(os.environ.get('CAP_FACTOR', capacity_factor))
         self.is_ones_gate = (int(os.environ.get('ONES_GATE', 0)) == 1)
@@ -71,8 +73,7 @@ def __init__(
         if int(os.environ.get('BATCH_PRIO', 0)) != 0:
             self.batch_prioritized_routing = True
 
-        self.is_postnorm = kwargs.get('is_postnorm', True)
-        input_dropout_p = kwargs.get('input_dropout_p', 0)
+        self.is_postscore = is_postscore
         self.input_dropout = torch.nn.Dropout(p=input_dropout_p) if input_dropout_p else None
 
         self.a2a_ffn_overlap_degree = a2a_ffn_overlap_degree
@@ -134,7 +135,7 @@ def apply_on_expert_fn(self, input, ctx):
 
         if self.is_ones_gate:
             gates_s = [torch.ones_like(x) for x in gates_s]
-        self._fdr.update(indices_s, locations_s, gates_s, capacity=capacity, is_postnorm=self.is_postnorm)
+        self._fdr.update(indices_s, locations_s, gates_s, capacity=capacity, is_postscore=self.is_postscore)
 
         dispatched_input = self._fdr.encode(input)
 
@@ -223,7 +224,19 @@ class MOELayer(torch.nn.Module):
     """Tutel optimized MOELayer
     """
 
-    def __init__(self, gate_type, model_dim: int, experts = None, scan_expert_func = None, result_func = None, group: Optional[Any] = None, seeds = None, a2a_ffn_overlap_degree = 1, **kwargs):
+    def __init__(
+        self,
+        gate_type,
+        model_dim: int,
+        experts=None,
+        scan_expert_func=None,
+        result_func=None,
+        group=None,
+        seeds=None,
+        a2a_ffn_overlap_degree=1,
+        parallel_type='auto',
+        pad_samples=False,
+    ):
         super().__init__()
         assert model_dim % 2 == 0, "Model_dim (%s) must be even value, while this Model_dim mod 2 > 0." % model_dim
         group = group or dist.group.WORLD
@@ -257,7 +270,6 @@ def __init__(self, gate_type, model_dim: int, experts = None, scan_expert_func =
             self.num_global_experts = num_devices * self.num_local_experts
             sharded_count = 1
 
-        parallel_type = kwargs.get('parallel_type', 'auto')
         if sharded_count == 1 or not self.is_builtin_experts:
             self.auto_parallel, self.use_model_parallel = False, False
         elif parallel_type == 'auto':
@@ -413,11 +425,9 @@ def to(self, *args, **kwargs):
             if single_gate_type['type'] == 'top':
                 if seeds is not None and seeds[0] is not None:
                     torch.manual_seed(seeds[0] + gi)
-                if "fp32_gate" in kwargs:
-                    logging.warning(f'`fp32_gate` option in tutel.moe_layer has been deprecated, please move this option to gate_type = {{.., "fp32_gate": {kwargs["fp32_gate"]}}} instead.')
-                    single_gate_type["fp32_gate"] = kwargs["fp32_gate"]
 
-                self.gates += [TopKGate(model_dim=model_dim, top_k=single_gate_type['k'], num_global_experts=self.num_global_experts, a2a_ffn_overlap_degree=a2a_ffn_overlap_degree, **single_gate_type)]
+                single_gate_type.pop('type')
+                self.gates += [TopKGate(model_dim=model_dim, num_global_experts=self.num_global_experts, a2a_ffn_overlap_degree=a2a_ffn_overlap_degree, **single_gate_type)]
             else:
                 raise Exception("Unrecognized gate_type: %s" % single_gate_type)
 
@@ -435,7 +445,7 @@ def expert_fn(dispatched_input):
             return expert_output
 
         self.expert_fn = expert_fn
-        self.expected_sample_size = 0 if kwargs.get('scale_samples', False) else -1
+        self.expected_sample_size = 0 if pad_samples else -1
 
     def get_parameter_iterator(self, param_type):
         if param_type == 'gate':
@@ -445,7 +455,7 @@ def get_parameter_iterator(self, param_type):
         else:
             raise Exception("Specified parameter type is not recognized: %s. Valid `param_type` includes: gate, local_experts." % param_type)
 
-    def forward(self, input: Tensor, gate_index=0, **kwargs: Any):
+    def forward(self, input: Tensor, gate_index=0):
         if self.skip_moe:
             result_output = input
             result_output.l_aux = None

diff --git a/tutel/system_init.py b/tutel/system_init.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import os
+import os, sys
 import re
 import logging
 
@@ -25,12 +25,19 @@ def init_affinity_at_program_beginning():
             logging.warning('Failed to set NUMA status: %s' % ex)
 
 def init_data_model_parallel(group_count=1, backend='nccl'):
-    from tutel.impls.communicate import create_groups_from_world
-    result = create_groups_from_world(group_count=group_count, include_init=backend)
+    from tutel.impls import communicate as C
+    result = C.create_groups_from_world(group_count=group_count, include_init=backend)
+    logging.critical(f'Registering device global rank {result.global_rank}: data_rank = {result.data_rank}, model_rank = {result.model_rank}')
+
+    def on_quit():
+        sys.stdout.flush()
+        sys.stderr.flush()
+        # Builtin dist.all_to_all_single in torch is unstable in some versions.
+        # Temp work around: https://github.com/pytorch/pytorch/issues/56390
+        if C.AllToAll._use_builtins:
+            os._exit(0)
 
-    # Temp work around for: https://github.com/pytorch/pytorch/issues/56390
     import atexit
-    atexit.register(lambda *args: os._exit(0))
+    atexit.register(lambda *args: on_quit())
 
-    logging.critical(f'Registering device global rank {result.global_rank}: data_rank = {result.data_rank}, model_rank = {result.model_rank}')
     return result