Support InternLM #4137

Merged: 17 commits, Sep 18, 2023
Changes from 14 commits
1 change: 1 addition & 0 deletions deepspeed/module_inject/containers/__init__.py
@@ -11,6 +11,7 @@
from .gptneo import DS_GPTNEOContainer, HFGPTNEOLayerPolicy
from .gptneox import DS_GPTNEOXContainer, GPTNEOXLayerPolicy
from .llama import DS_LLAMAContainer, LLAMALayerPolicy
from .internlm import DS_InternLMContainer, InternLMLayerPolicy
from .megatron_gpt import DS_MegatronGPTContainer, MegatronLayerPolicy
from .megatron_gpt_moe import DS_MegatronGPTMoEContainer, MegatronMoELayerPolicy
from .opt import DS_OPTContainer, HFOPTLayerPolicy
183 changes: 183 additions & 0 deletions deepspeed/module_inject/containers/internlm.py
@@ -0,0 +1,183 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Copyright (c) wangruohui

import importlib

import torch
from torch.nn.parameter import Parameter

from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from deepspeed.utils.types import ActivationFuncType, NormType

from ..policy import (TransformerPolicy, maybe_copy, maybe_copy_geglu, maybe_copy_qkv, maybe_get_lora,
transformer_param_names)
from .base import *
from .features import HybridGatedMLPContainer, HybridSplitQKVContainer


class DS_InternLMContainer(HybridGatedMLPContainer, HybridSplitQKVContainer, BaseTransformerContainer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # All model-specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        _config = config if config is not None else self.ds_model_config

        # InternLM uses LLaMA-style rotary embeddings: rotate contiguous halves
        # of each head (rotate_half), not GPT-J-style interleaved pairs.
        _config.rotate_half = True
        _config.rotate_every_two = False
        _config.rotary_dim = self.hidden_size // self.num_attention_heads
        self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)

        return self.module

    def set_lora_params(self):
        """
        Necessary to implement for `HybridEngineContainer`
        """
        self.lora_params = [
            maybe_get_lora(p) for p in [
                self.policy.client_module.mlp.up_proj.weight, self.policy.client_module.mlp.gate_proj.weight,
                self.policy.client_module.mlp.down_proj.weight, self.policy.client_module.self_attn.q_proj.weight,
                self.policy.client_module.self_attn.k_proj.weight, self.policy.client_module.self_attn.v_proj.weight,
                self.policy.client_module.self_attn.o_proj.weight
            ]
        ]

    def get_lora_matched_pair(self):
        up_proj_lora, gate_proj_lora, down_proj_lora, q_lora, k_lora, v_lora, out_lora = self.get_lora_params()
        ret = [(up_proj_lora, self.inter_up_w), (gate_proj_lora, self.inter_gate_w), (down_proj_lora, self._4hh_w),
               (out_lora, self.dense_w), (q_lora, self.qw), (k_lora, self.kw), (v_lora, self.vw)]
        return ret

    def set_q_k_v(self):
        """
        Necessary to implement for `HybridSplitQKVContainer`
        """
        self.qw = self.policy.client_module.self_attn.q_proj.weight
        self.qb = self.policy.client_module.self_attn.q_proj.bias
        self.kw = self.policy.client_module.self_attn.k_proj.weight
        self.kb = self.policy.client_module.self_attn.k_proj.bias
        self.vw = self.policy.client_module.self_attn.v_proj.weight
        self.vb = self.policy.client_module.self_attn.v_proj.bias

    def set_mlp_gate(self):
        """
        Necessary to implement for `HybridGatedMLPContainer`
        """
        self.inter_up_w = self.policy.client_module.mlp.up_proj.weight
        self.inter_up_b = None
        self.inter_gate_w = self.policy.client_module.mlp.gate_proj.weight
        self.inter_gate_b = None

    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        # Checkpoint keys for one decoder layer; the numeric indices below
        # refer to positions in this tuple.
        param_names = (
            'self_attn.q_proj.weight', \
            'self_attn.k_proj.weight', \
            'self_attn.v_proj.weight', \
            'self_attn.o_proj.weight', \
            'mlp.up_proj.weight', \
            'mlp.gate_proj.weight', \
            'mlp.down_proj.weight', \
            'input_layernorm.weight', \
            'post_attention_layernorm.weight', \
            'self_attn.q_proj.bias', \
            'self_attn.k_proj.bias', \
            'self_attn.v_proj.bias', \
            'self_attn.o_proj.bias', \
        )

        maybe_copy_qkv(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]],
                       split_qkv=self.policy.split_qkv)
        maybe_copy_qkv(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       'attn_qkvb', [prefix + param_names[9], prefix + param_names[10], prefix + param_names[11]],
                       split_qkv=self.policy.split_qkv)
        maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[2],
                   prefix + param_names[3])
        maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[3],
                   prefix + param_names[12])
        maybe_copy_geglu(module.mlp, sd, weight_quantizer, mp_replace, 'inter_w',
                         [prefix + param_names[4], prefix + param_names[5]])
        maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, 'output_w', prefix + param_names[6])

        maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[8], prefix + param_names[7])
        maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[10], prefix + param_names[8])


class InternLMLayerPolicy(TransformerPolicy):
    _orig_layer_class = []
    _orig_layer_class_inited = False

    def __init__(self, client_module, inference=True):
        super().__init__(
            inference,
            mlp_act_func_type=ActivationFuncType.GATED_SILU,
            norm_type=NormType.RMSNorm,
        )
        self.client_module = client_module

        self._init_orig_layer_class_once()

    def _init_orig_layer_class_once(self):
        if InternLMLayerPolicy._orig_layer_class_inited:
            return

        # InternLM ships its modeling code via `trust_remote_code`, so the decoder
        # layer class lives under transformers' dynamic-module namespace rather than
        # inside the transformers package itself; probe the known checkpoint
        # sub-packages and collect every importable variant.
        for sub_pkg in ['', '.internlm-7b', '.internlm-chat-7b']:
            try:
                from transformers.utils import TRANSFORMERS_DYNAMIC_MODULE_NAME
                module = importlib.import_module(f"{TRANSFORMERS_DYNAMIC_MODULE_NAME}{sub_pkg}.modeling_internlm")
                if module.InternLMDecoderLayer not in InternLMLayerPolicy._orig_layer_class:
                    InternLMLayerPolicy._orig_layer_class.append(module.InternLMDecoderLayer)
            except ImportError:
                continue

        InternLMLayerPolicy._orig_layer_class_inited = True

    def get_hidden_heads(self):
        return self.client_module.self_attn.q_proj.weight.shape[1], \
               self.client_module.self_attn.num_heads, \
               self.client_module.input_layernorm.variance_epsilon, \
               self.client_module.mlp.gate_proj.weight.shape[0]

    def attention(self, enable_training=False):
        qw = self.client_module.self_attn.q_proj.weight
        kw = self.client_module.self_attn.k_proj.weight
        vw = self.client_module.self_attn.v_proj.weight
        qb = self.client_module.self_attn.q_proj.bias
        kb = self.client_module.self_attn.k_proj.bias
        vb = self.client_module.self_attn.v_proj.bias

        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)
        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training)

        return qkvw, \
               qkvb, \
               self.client_module.self_attn.o_proj.weight, \
               self.client_module.self_attn.o_proj.bias

    def mlp(self, enable_training=False):
        mlp1_up = self.client_module.mlp.up_proj.weight
        mlp1_gate = self.client_module.mlp.gate_proj.weight
        mlp2 = self.client_module.mlp.down_proj.weight

        mlp1 = Parameter(torch.cat((mlp1_up, mlp1_gate), dim=0), requires_grad=enable_training)

        return mlp1, None, mlp2, None

    def layernorm(self):
        return self.client_module.post_attention_layernorm.weight, \
               None, \
               self.client_module.input_layernorm.weight, \
               None
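
Note (not part of the diff): a minimal sketch of how the new container is exercised end to end, assuming a CUDA device and the internlm/internlm-chat-7b checkpoint from the Hugging Face hub:

import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "internlm/internlm-chat-7b"
tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16, trust_remote_code=True)

# Kernel injection walks replace_policies; InternLMLayerPolicy matches the
# dynamically imported InternLMDecoderLayer and builds a DS_InternLMContainer.
engine = deepspeed.init_inference(model, dtype=torch.float16, replace_with_kernel_inject=True)

inputs = tokenizer("Hello, InternLM!", return_tensors="pt").to("cuda")
out = engine.module.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
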
4 changes: 3 additions & 1 deletion deepspeed/module_inject/replace_policy.py
@@ -16,11 +16,13 @@
from .containers import LLAMALayerPolicy
from .containers import UNetPolicy
from .containers import VAEPolicy
from .containers import InternLMLayerPolicy

# transformer-based policies
replace_policies = [
    HFBertLayerPolicy, HFGPTNEOLayerPolicy, GPTNEOXLayerPolicy, HFGPTJLayerPolicy, MegatronLayerPolicy,
    HFGPT2LayerPolicy, BLOOMLayerPolicy, HFOPTLayerPolicy, HFCLIPLayerPolicy, HFDistilBertLayerPolicy,
    LLAMALayerPolicy, InternLMLayerPolicy
]

# non-transformer-based policies
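
Note (not part of the diff): auto-injection walks replace_policies and matches each candidate submodule against a policy's _orig_layer_class. InternLMLayerPolicy keeps a list of dynamically imported classes, so the matching step behaves roughly like the sketch below (simplified; find_matching_policy is an illustrative name, not an actual DeepSpeed helper):

def find_matching_policy(submodule):
    # Each policy advertises its original layer class(es); InternLM's is a list
    # because the class may be re-imported per trust_remote_code checkpoint.
    for policy_cls in replace_policies:
        orig = policy_cls._orig_layer_class
        if orig is None:
            continue
        classes = orig if isinstance(orig, list) else [orig]
        if any(isinstance(submodule, cls) for cls in classes):
            return policy_cls
    return None
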
2 changes: 2 additions & 0 deletions deepspeed/module_inject/utils.py
@@ -18,6 +18,7 @@ def policy_to_ds_container(**kwargs):
    from .containers import MegatronLayerPolicy, DS_MegatronGPTContainer
    from .containers import HFDistilBertLayerPolicy, DS_DistilBERTContainer
    from .containers import LLAMALayerPolicy, DS_LLAMAContainer
    from .containers import InternLMLayerPolicy, DS_InternLMContainer

    policy_to_container = {
        HFGPT2LayerPolicy: DS_GPT2Container,
@@ -30,6 +31,7 @@ def policy_to_ds_container(**kwargs):
        MegatronLayerPolicy: DS_MegatronGPTContainer,
        HFDistilBertLayerPolicy: DS_DistilBERTContainer,
        LLAMALayerPolicy: DS_LLAMAContainer,
        InternLMLayerPolicy: DS_InternLMContainer,
    }

    container = None
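
Note (not part of the diff): container selection is an exact class lookup, roughly container = policy_to_container[policy.__class__](**kwargs), so this one-line mapping is all the wiring the new policy needs here.
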
1 change: 1 addition & 0 deletions deepspeed/ops/transformer/inference/ds_attention.py
@@ -159,6 +159,7 @@ def forward(self,
input_mask=input_mask,
layer_past=layer_past,
alibi=alibi)

output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow)
inp_norm = qkv_out[-1]

2 changes: 2 additions & 0 deletions deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py
@@ -85,6 +85,8 @@ def forward(self,
self.config.mlp_act_func_type,
self.config.transposed_mode)
        else:
            # The RMSNorm fallback takes no separate input-bias argument, so any
            # attention output bias (InternLM's o_proj has one) is folded into
            # the activation before calling the fused kernel.
            if input_bias is not None:
                input += input_bias
            output, residual_add = self.mlp_gemm_func(
                input,
                residual,
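
Note (not part of the diff): for intuition, the gated MLP this op computes for InternLM, as plain PyTorch (a sketch only; the fused kernel also folds in the pre-MLP RMSNorm and residual handling):

import torch
import torch.nn.functional as F

def gated_silu_mlp_reference(x, w_up, w_gate, w_down):
    # LLaMA/InternLM-style gated MLP: silu(x @ Wg^T) * (x @ Wu^T), then down-projection.
    return (F.silu(x @ w_gate.T) * (x @ w_up.T)) @ w_down.T
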
2 changes: 2 additions & 0 deletions deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py
@@ -84,5 +84,7 @@ def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor,
        else:
            output, norm = self.qkv_gemm_func(input, weight, q_scale, gamma, self.config.epsilon, q_int8,
                                              self.config.transposed_mode)
            # The RMSNorm QKV path does not apply the projection bias itself;
            # InternLM's q/k/v projections carry biases, so add them to the
            # GEMM output here.
            if add_bias:
                output += bias

        return output, norm
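
Note (not part of the diff): reference semantics of the patched fallback path, assuming standard RMSNorm and an [in, out] weight layout (the real kernel fuses the norm and the GEMM):

import torch

def rms_qkv_reference(x, weight, bias, gamma, eps, add_bias=True):
    # RMSNorm (no mean subtraction), then the QKV projection; the bias is
    # applied after the GEMM, mirroring the two lines added above.
    norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * gamma
    output = norm @ weight
    if add_bias and bias is not None:
        output = output + bias
    return output, norm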