From 9733913ef7422ef78e89961c3d77f1aa1c07821b Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Wed, 7 Dec 2022 15:23:46 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90fluid=20api=20clear=E3=80=91Move=20bat?=
 =?UTF-8?q?ch=20norm1=20(#47965)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modify slice infershape
* code style
* modify slice_unittest
* temp fix
* batch_norm api move
* code_style
* codestyle
* ci_static
* add __init__
* reset other change
* revert .cc
* add import batchnorm
* conflict and revert
* fix bug
* fix third conflict one day
* fix conflict
* fix conflict bug
* fix conflict bug
* modify api
* code_style
* modify doc
* add lost doc stable
* fix conflict bug
* ci lack of gpu
---
 python/paddle/fluid/contrib/layers/nn.py | 5 +-
 .../fluid/contrib/slim/tests/test_graph.py | 2 +-
 .../tests/test_quantization_mkldnn_pass.py | 2 +-
 .../slim/tests/test_quantization_pass.py | 6 +-
 .../tests/test_quantization_scale_pass.py | 2 +-
 .../tests/test_user_defined_quantization.py | 2 +-
 .../tests/test_image_classification_fp16.py | 4 +-
 .../tests/test_multi_precision_fp16_train.py | 2 +-
 .../contrib/tests/test_quantize_transpiler.py | 4 +-
 python/paddle/fluid/layers/nn.py | 323 ------------------
 python/paddle/fluid/nets.py | 2 +-
 .../tests/book/test_image_classification.py | 4 +-
 .../fluid/tests/book/test_recognize_digits.py | 2 +-
 .../fluid/tests/unittests/dist_se_resnext.py | 2 +-
 .../unittests/ipu/test_batch_norm_op_ipu.py | 2 +-
 .../ir/inference/test_trt_activation_pass.py | 3 +-
 .../inference/test_trt_anchor_generator_op.py | 3 +-
 .../ir/inference/test_trt_elementwise_op.py | 3 +-
 .../ir/inference/test_trt_flatten_op.py | 5 +-
 .../ir/inference/test_trt_gather_nd_op.py | 5 +-
 .../ir/inference/test_trt_inspector.py | 2 +-
 .../ir/inference/test_trt_instance_norm_op.py | 6 +-
 .../unittests/ir/inference/test_trt_matmul.py | 7 +-
 .../test_trt_matmul_quant_dequant.py | 4 +-
 .../inference/test_trt_multiclass_nms3_op.py | 3 +-
 .../inference/test_trt_multiclass_nms_op.py | 3 +-
 .../inference/test_trt_nearest_interp_op.py | 3 +-
 .../test_trt_nearest_interp_v2_op.py | 3 +-
 .../unittests/ir/inference/test_trt_pad_op.py | 3 +-
 .../ir/inference/test_trt_pool3d_op.py | 6 +-
 .../ir/inference/test_trt_pool_op.py | 3 +-
 .../ir/inference/test_trt_reduce_sum_op.py | 5 +-
 .../ir/inference/test_trt_reshape_op.py | 9 +-
 .../ir/inference/test_trt_scale_op.py | 5 +-
 .../test_trt_shuffle_channel_detect_pass.py | 4 +-
 .../ir/inference/test_trt_slice_plugin.py | 7 +-
 .../ir/inference/test_trt_subgraph_pass.py | 15 +-
 .../ir/inference/test_trt_tile_op.py | 8 +-
 ..._trt_transpose_flatten_concat_fuse_pass.py | 2 +-
 .../unittests/mlu/sync_batch_norm_op_mlu.py | 2 +-
 .../unittests/mlu/test_batch_norm_op_mlu.py | 4 +-
 .../unittests/npu/sync_batch_norm_op_npu.py | 2 +-
 .../fluid/tests/unittests/seresnext_net.py | 2 +-
 .../fluid/tests/unittests/simple_nets.py | 2 +-
 .../test_async_ssa_graph_executor_mnist.py | 2 +-
 .../tests/unittests/test_batch_norm_op.py | 4 +-
 .../tests/unittests/test_fetch_unmerged.py | 2 +-
 .../tests/unittests/test_fuse_bn_act_pass.py | 4 +-
 .../unittests/test_fuse_bn_add_act_pass.py | 6 +-
 .../test_fuse_relu_depthwise_conv_pass.py | 2 +-
 .../test_image_classification_layer.py | 4 +-
 .../test_imperative_load_static_param.py | 4 +-
 .../test_imperative_static_runner_mnist.py | 2 +-
 .../tests/unittests/test_inplace_abn_op.py | 2 +-
 .../tests/unittests/test_ir_inplace_pass.py | 2 +-
.../fluid/tests/unittests/test_layers.py | 4 +- .../test_load_state_dict_from_old_format.py | 2 +- .../test_mix_precision_all_reduce_fuse.py | 2 +- .../tests/unittests/test_norm_nn_grad.py | 4 +- .../test_parallel_executor_fetch_feed.py | 4 +- .../unittests/test_parallel_executor_mnist.py | 2 +- .../tests/unittests/test_set_bool_attr.py | 4 +- .../unittests/test_sync_batch_norm_op.py | 2 +- python/paddle/static/__init__.py | 2 + python/paddle/static/nn/__init__.py | 3 +- python/paddle/static/nn/common.py | 322 +++++++++++++++++ 66 files changed, 450 insertions(+), 428 deletions(-) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 02c5a7bfe4f87..fffc3cd5a6e3f 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1963,8 +1963,11 @@ def fused_bn_add_act( Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + # required: gpu def build_program(main_program, startup_program): with fluid.program_guard(main_program, startup_program): x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') @@ -1987,7 +1990,7 @@ def build_program(main_program, startup_program): act=None, bias_attr=False, data_format='NHWC') - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( input=conv1_1, act=None, data_layout='NHWC') diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py index 482c7237bfce8..1b692bcaafb0e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py @@ -37,7 +37,7 @@ def conv_block(): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py index 23b89512454a7..a89042c0b5959 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py @@ -37,7 +37,7 @@ def conv_net(img, label): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index 7fa95fd13f494..f49d019bc1752 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -57,7 +57,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) data = fluid.layers.data( name='image', @@ -102,7 +102,7 @@ def conv_net(img, label, quant_skip_pattern): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -712,7 +712,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return 
paddle.static.nn.batch_norm(input=tmp, act=act) data1 = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') data2 = fluid.layers.data( diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index 46e3700246037..d19b62a376279 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -43,7 +43,7 @@ def conv_net(img, label): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index 25656278137a7..fc5d18227b92a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -45,7 +45,7 @@ def conv_net(img, label): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index 908622d76a154..b5df94c0cb497 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -41,7 +41,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: @@ -97,7 +97,7 @@ def conv_block(input, num_filter, groups, dropouts): drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) fc1 = fluid.layers.fc(input=drop, size=4096, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') + bn = paddle.static.nn.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) return fc2 diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index 8f4bf36e5b2b5..b3d12bf9a4ba3 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -59,7 +59,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py index cdbd65fad68a6..b2f166def0798 100644 --- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -48,7 +48,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) data = fluid.layers.data(name='image', shape=[1, 32, 32], 
dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -72,7 +72,7 @@ def conv_net(img, label): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 474bccc162e2b..9d4429ef04685 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -69,7 +69,6 @@ 'crf_decoding', 'conv2d', 'pool2d', - 'batch_norm', 'dropout', 'split', 'l2_normalize', @@ -1681,328 +1680,6 @@ def is_list_or_tuple(ele): return pool_out -def batch_norm( - input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - in_place=False, - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, -): - r""" - :api_attr: Static Graph - - **Batch Normalization Layer** - - Can be used as a normalizer function for convolution or fully_connected operations. - The required data format for this layer is one of the following: - - 1. NHWC `[batch, in_height, in_width, in_channels]` - - 2. NCHW `[batch, in_channels, in_height, in_width]` - - Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift `_ - for more details. - - :math:`input` is the input features over a mini-batch. - - .. math:: - - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - - moving\_mean = moving\_mean * momentum + mini-batch\_mean * (1. - momentum) \\\\ - moving\_var = moving\_var * momentum + mini-batch\_var * (1. - momentum) - - - moving_mean is global mean and moving_var is global variance. - - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. - They are global (or running) statistics. (It usually got from the - pre-trained model.) - The training and testing (or inference) have the same behavior: - - .. math:: - - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta - - Note: - if build_strategy.sync_batch_norm=True, the batch_norm in network will use - sync_batch_norm automatically. - `is_test = True` can only be used in test program and inference program, `is_test` CANNOT be set to True in train program, if you want to use global status from pre_train model in train program, please set `use_global_stats = True`. - - Args: - input(Tensor): The rank of input Tensor can be 2, 3, 4, 5. The data type - is float16 or float32 or float64. - act(string, Default None): Activation type, linear|relu|prelu|... - is_test (bool, Default False): A flag indicating whether it is in - test phrase or not. - momentum(float|Tensor, Default 0.9): The value used for the moving_mean and - moving_var computation. This should be a float number or a Tensor with - shape [1] and data type as float32. 
The updated formula is: - :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` - :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` - Default is 0.9. - epsilon(float, Default 1e-05): A value added to the denominator for - numerical stability. Default is 1e-5. - param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized - with Xavier. Default: None. - bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. - Default: None. - data_layout (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - in_place(bool, Default False): Make the input and output of batch norm reuse memory. - name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. - Usually name is no need to set and None by default. - moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it - is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm - will save global mean with the string. - moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance. - If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm - will save global variance with the string. - do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model - average when model average is enabled. - use_global_stats(bool, Default False): Whether to use global mean and - variance. In inference or test mode, set use_global_stats to true - or is_test to true, and the behavior is equivalent. - In train mode, when setting use_global_stats True, the global mean - and variance are also used during train period. - Returns: - A Tensor which is the result after applying batch normalization on the input, - has same shape and data type with input. - - Examples: - - .. code-block:: python - - import paddle - - paddle.enable_static() - x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32') - hidden1 = paddle.static.nn.fc(x=x, size=200) - print(hidden1.shape) - # [3, 200] - hidden2 = paddle.static.nn.batch_norm(input=hidden1) - print(hidden2.shape) - # [3, 200] - """ - assert ( - bias_attr is not False - ), "bias_attr should not be False in batch_norm." 
- helper = LayerHelper('batch_norm', **locals()) - - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'batch_norm' - ) - dtype = helper.input_dtype() - - # use fp32 for bn parameter - if dtype == core.VarDesc.VarType.FP16: - dtype = core.VarDesc.VarType.FP32 - - input_shape = input.shape - if data_layout == 'NCHW': - channel_num = input_shape[1] - else: - if data_layout == 'NHWC': - channel_num = input_shape[-1] - else: - raise ValueError("unsupported data layout:" + data_layout) - - param_shape = [channel_num] - - # create parameter - scale = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0), - ) - bias = helper.create_parameter( - attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True - ) - - mean = helper.create_parameter( - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=dtype, - ) - mean.stop_gradient = True - - variance = helper.create_parameter( - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=dtype, - ) - variance.stop_gradient = True - - # create output - # mean and mean_out share the same memory - mean_out = mean - # variance and variance_out share the same memory - variance_out = variance - - if in_dygraph_mode(): - inputs_has_MomemtumTensor = False - attrs_has_momentum = False - tmp_tensor_type = core.eager.Tensor - if isinstance(momentum, tmp_tensor_type): - inputs_has_MomemtumTensor = True - else: - attrs_has_momentum = True - - attrs_ = () - if attrs_has_momentum: - attrs_ = ( - 'momentum', - momentum, - 'epsilon', - epsilon, - 'is_test', - is_test, - 'data_layout', - data_layout, - 'use_mkldnn', - False, - 'fuse_with_relu', - False, - 'use_global_stats', - use_global_stats, - ) - else: - attrs_ = ( - 'epsilon', - epsilon, - 'is_test', - is_test, - 'data_layout', - data_layout, - 'use_mkldnn', - False, - 'fuse_with_relu', - False, - 'use_global_stats', - use_global_stats, - ) - if inputs_has_MomemtumTensor: - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, - scale, - bias, - mean, - variance, - momentum, - mean_out, - variance_out, - *attrs_, - ) - else: - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, - scale, - bias, - mean, - variance, - None, - mean_out, - variance_out, - *attrs_, - ) - - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=act, use_mkldnn=False - ) - - saved_mean = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - reserve_space = None - if not is_test: - reserve_space = helper.create_variable_for_type_inference( - dtype=helper.input_dtype(), stop_gradient=True - ) - - batch_norm_out = ( - input if in_place else helper.create_variable_for_type_inference(dtype) - ) - - inputs = { - "X": input, - "Scale": scale, - "Bias": bias, - "Mean": mean, - "Variance": variance, - "MeanOut": mean_out, - "VarianceOut": variance_out, - } - attrs = { - "epsilon": epsilon, - "is_test": is_test, - "data_layout": data_layout, - "use_mkldnn": False, - "fuse_with_relu": False, - "use_global_stats": use_global_stats, - } - if isinstance(momentum, Variable): - inputs['MomemtumTensor'] = momentum - else: - 
attrs['momentum'] = momentum - - outputs = { - "Y": batch_norm_out, - "MeanOut": mean_out, - "VarianceOut": variance_out, - "SavedMean": saved_mean, - "SavedVariance": saved_variance, - } - if reserve_space is not None: - outputs["ReserveSpace"] = reserve_space - - helper.append_op( - type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs - ) - - return helper.append_activation(batch_norm_out) - - @templatedoc() def layer_norm( input, diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 0a781e67a82fc..2e8c83be2423f 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -253,7 +253,7 @@ def __extend_list__(obj): ) if conv_with_batchnorm[i]: - tmp = layers.batch_norm(input=tmp, act=conv_act) + tmp = paddle.static.nn.batch_norm(input=tmp, act=conv_act) drop_rate = conv_batchnorm_drop_rate[i] if abs(drop_rate) > 1e-5: tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 3a401df20370d..77a59bc037037 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -40,7 +40,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: @@ -96,7 +96,7 @@ def conv_block(input, num_filter, groups, dropouts): drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) fc1 = fluid.layers.fc(input=drop, size=4096, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') + bn = paddle.static.nn.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) return fc2 diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index a89cb1617a12a..b96ff9940985b 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -51,7 +51,7 @@ def conv_net(img, label): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 05b3f3b093a65..8753d660beb16 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -182,7 +182,7 @@ def conv_bn_layer( ), bias_attr=False, ) - return fluid.layers.batch_norm(input=conv, act=act) + return paddle.static.nn.batch_norm(input=conv, act=act) def squeeze_excitation(self, input, num_channels, reduction_ratio): pool = fluid.layers.pool2d( diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 11275d0227488..ac418c2531904 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -58,7 +58,7 @@ def build_model(self): x = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False ) - x = paddle.fluid.layers.batch_norm(x, **self.attrs) + x = 
paddle.static.nn.batch_norm(x, **self.attrs) self.fetch_list = [x.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 29393ff96ca2b..abc96d262e04e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -39,7 +40,7 @@ def setUp(self): name="data", shape=[-1, 6, 32, 32], dtype="float32" ) act_out = self.append_act(data) - out = fluid.layers.batch_norm(act_out, is_test=True) + out = nn.batch_norm(act_out, is_test=True) self.feeds = { "data": np.random.random([1, 6, 32, 32]).astype("float32"), } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py index a794298130866..88743ef399740 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -67,7 +68,7 @@ def build(self): ) if self.dynamic_shape_params is not None: anchor = paddle.transpose(anchor, [2, 3, 0, 1]) - out = fluid.layers.batch_norm(anchor, is_test=True) + out = nn.batch_norm(anchor, is_test=True) self.fetch_list = [out, var] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py index ed7aa546b345d..df31be07eb8c8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -34,7 +35,7 @@ def setUp(self): name="data2", shape=[-1, 3, 64, 1], dtype="float32" ) eltwise_out = self.append_eltwise(data1, data2) - out = fluid.layers.batch_norm(eltwise_out, is_test=True) + out = nn.batch_norm(eltwise_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([1, 3, 64, 1]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py index 4ed648ed9c806..eec26fefec2d1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -30,7 +31,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) flatten_out = self.append_flatten(data) - out = fluid.layers.batch_norm(flatten_out, is_test=True) + out = nn.batch_norm(flatten_out, is_test=True) self.feeds = { "data": 
np.random.random([1, 6, 64, 64]).astype("float32"), } @@ -59,7 +60,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) flatten_out = self.append_flatten(data) - out = fluid.layers.batch_norm(flatten_out, is_test=True) + out = nn.batch_norm(flatten_out, is_test=True) self.feeds = { "data": np.random.random([2, 6, 64, 64]).astype("float32"), } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py index b96eddb87e779..161a3142d5210 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -29,7 +30,7 @@ def setUp(self): data = fluid.data(name="data", shape=[-1, 3, 4], dtype="float32") index = fluid.data(name="index", shape=[-1, 2, 2], dtype="int32") gather_nd = paddle.gather_nd(data, index) - out = fluid.layers.batch_norm(gather_nd, is_test=True) + out = nn.batch_norm(gather_nd, is_test=True) self.feeds = { "data": np.random.random([2, 3, 4]).astype("float32"), @@ -66,7 +67,7 @@ def setUp(self): ) index = fluid.data(name="index", shape=[-1, 1028, 2], dtype="int32") gather_nd = paddle.gather_nd(data, index) - out = fluid.layers.batch_norm(gather_nd, is_test=True) + out = nn.batch_norm(gather_nd, is_test=True) index_data = np.zeros((1, 1028, 2), dtype='int32') self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py index 9c8e1ee04cc38..379c3872242f0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py @@ -37,7 +37,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = paddle.static.nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data": np.ones([1, 16, 16]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py index 2901238ffe4a4..4d98c8cb3f382 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py @@ -20,9 +20,9 @@ import numpy as np from inference_pass_test import InferencePassTest -import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -44,8 +44,8 @@ def build(self): with fluid.program_guard(self.main_program, self.startup_program): shape = [-1, self.channel, self.height, self.width] data = fluid.data(name='in', shape=shape, dtype='float32') - instance_norm_out = paddle.static.nn.instance_norm(data) - out = fluid.layers.batch_norm(instance_norm_out, is_test=True) + instance_norm_out = nn.instance_norm(data) + out = nn.batch_norm(instance_norm_out, is_test=True) shape[0] = self.bs self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 
038912fbe4cb1..0d10acae95c3f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -35,7 +36,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data": np.ones([24, 24]).astype("float32"), @@ -74,7 +75,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data": np.ones([1, 6, 24, 24]).astype("float32"), @@ -136,7 +137,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data_x": np.ones([2, 6, 24]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py index b8566840d2131..4e2b3e0ae2420 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py @@ -135,7 +135,7 @@ def network(): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = paddle.static.nn.batch_norm(matmul_out, is_test=True) fc_out = fluid.layers.fc( input=matmul_out, size=10, @@ -231,7 +231,7 @@ def network(): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = paddle.static.nn.batch_norm(matmul_out, is_test=True) fc_out = fluid.layers.fc( input=matmul_out, size=10, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py index 00a980415e1a7..2f2908d5f3198 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper @@ -242,7 +243,7 @@ def build(self): [self.bs, 1, self.keep_top_k, 6], name='reshape', ) - out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + out = nn.batch_norm(multiclass_nms_out, is_test=True) boxes_data = ( np.arange(self.num_boxes * 4) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py index 68ec0c22703c0..b5f84dcc9f760 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -69,7 +70,7 @@ def build(self): [self.bs, 1, self.keep_top_k, 6], name='reshape', ) - out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + out = nn.batch_norm(multiclass_nms_out, is_test=True) boxes_data = ( np.arange(self.num_boxes * 4) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py index 505a7ccad3bc2..f335bd8f82399 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -44,7 +45,7 @@ def setUp(self): ] data = fluid.data(name='data', shape=shape, dtype='float32') resize_out = self.append_nearest_interp(data) - out = fluid.layers.batch_norm(resize_out, is_test=True) + out = nn.batch_norm(resize_out, is_test=True) if self.data_layout == 'NCHW': shape = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py index a4c7dba0e6eba..056e5b6e29212 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core import paddle.nn.functional as F +import paddle.static.nn as nn from paddle import fluid from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -44,7 +45,7 @@ def setUp(self): ] data = fluid.data(name='data', shape=shape, dtype='float32') resize_out = self.append_nearest_interp(data) - out = fluid.layers.batch_norm(resize_out, is_test=True) + out = nn.batch_norm(resize_out, is_test=True) if self.data_layout == 'NCHW': shape = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py index f0cf6ead9d380..4b7dc7c9cb689 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig @@ -32,7 +33,7 @@ def setUp(self): pad_out = paddle.nn.functional.pad( x=data, pad=[0, 0, 0, 0, 0, 1, 1, 2], value=0.0 ) - out = fluid.layers.batch_norm(pad_out, is_test=True) + out = nn.batch_norm(pad_out, is_test=True) self.feeds = { "data": np.random.random((1, 3, 128, 128)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py index 0362d96fc2a91..f8abf50dd10ff 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py @@ -80,7 +80,7 @@ def build_network(self): ceil_mode=self.ceil_mode, exclusive=self.exclusive, ) - # out = 
fluid.layers.batch_norm(pool_out, is_test=True) + # out = paddle.static.nn.batch_norm(pool_out, is_test=True) self.fetch_list = [pool_out] def check_output(self): @@ -198,7 +198,7 @@ def build_network(self): pool_out = paddle.nn.functional.adaptive_avg_pool3d( x=data, output_size=[3, 3, 3] ) - # out = fluid.layers.batch_norm(pool_out, is_test=True) + # out = paddle.static.nn.batch_norm(pool_out, is_test=True) self.fetch_list = [pool_out] def check_output(self): @@ -298,7 +298,7 @@ def build_network(self): pool_out = paddle.nn.functional.adaptive_max_pool3d( x=data, output_size=[3, 3, 3] ) - # out = fluid.layers.batch_norm(pool_out, is_test=True) + # out = paddle.static.nn.batch_norm(pool_out, is_test=True) self.fetch_list = [pool_out] def check_output(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py index b8f3ced692134..c916109803630 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -72,7 +73,7 @@ def build_network(self): ceil_mode=self.ceil_mode, exclusive=self.exclusive, ) - out = fluid.layers.batch_norm(pool_out, is_test=True) + out = nn.batch_norm(pool_out, is_test=True) self.fetch_list = [out] def check_output(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py index 79aa8cf14fa24..cd66cb1e914b8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -30,7 +31,7 @@ def setUp(self): name="data", shape=[-1, 3, 10, 192], dtype="float32" ) reduce_sum = paddle.sum(data, axis=[2, -1], keepdim=True) - out = fluid.layers.batch_norm(reduce_sum, is_test=True) + out = nn.batch_norm(reduce_sum, is_test=True) self.feeds = { "data": np.random.random([3, 3, 10, 192]).astype("float32"), @@ -63,7 +64,7 @@ def setUp(self): name="data", shape=[-1, 3, 10, 192], dtype="float32" ) reduce_sum = paddle.sum(data, keepdim=True) - out = fluid.layers.batch_norm(reduce_sum, is_test=True) + out = nn.batch_norm(reduce_sum, is_test=True) self.feeds = { "data": np.random.random([3, 3, 10, 192]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py index ffbe80387719a..8edd7cafcbe4d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -39,7 +40,7 @@ def setUp(self): name='data', shape=self.data_shape, dtype='float32' ) reshape_out = self.append_reshape(data, self.reshape) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = nn.batch_norm(reshape_out, is_test=True) self.feeds = { 
'data': np.random.random(self.data_shape).astype('float32'), } @@ -77,7 +78,7 @@ def setUp(self): name='data', shape=self.data_shape, dtype='float32' ) reshape_out = self.append_reshape(data, self.reshape) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = nn.batch_norm(reshape_out, is_test=True) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32'), } @@ -104,7 +105,7 @@ def setUp(self): name='data', shape=self.data_shape, dtype='float32' ) reshape_out = paddle.reshape(x=data, shape=self.reshape) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = nn.batch_norm(reshape_out, is_test=True) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32') } @@ -130,7 +131,7 @@ def setUp(self): data = fluid.data( name='data', shape=self.data_shape, dtype='float32' ) - bn_out = fluid.layers.batch_norm(data, is_test=True) + bn_out = nn.batch_norm(data, is_test=True) out = self.append_reshape(bn_out, self.reshape) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32'), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py index 0ffabd0178141..3bca0dbf18482 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -28,7 +29,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[-1, 512], dtype="float32") scale_out = self.append_scale(data) - out = fluid.layers.batch_norm(scale_out, is_test=True) + out = nn.batch_norm(scale_out, is_test=True) self.feeds = { "data": np.random.random([1, 512]).astype("float32"), @@ -60,7 +61,7 @@ def setUp(self): name="data", shape=[-1, 512, 512], dtype="float32" ) scale_out = self.append_scale(data) - out = fluid.layers.batch_norm(scale_out, is_test=True) + out = nn.batch_norm(scale_out, is_test=True) self.feeds = { "data": np.random.random([1, 512, 512]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py index ad0f2a66489c8..fc3b066556d6e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -31,8 +32,7 @@ def setUp(self): reshape1 = paddle.reshape(x=data, shape=[-1, 2, 3, 64, 64]) trans = paddle.transpose(x=reshape1, perm=[0, 2, 1, 3, 4]) reshape2 = paddle.reshape(x=trans, shape=[-1, 6, 64, 64]) - - out = fluid.layers.batch_norm(reshape2, is_test=True) + out = nn.batch_norm(reshape2, is_test=True) self.feeds = { "data": np.random.random([1, 6, 64, 64]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py index b8b0e6a55033a..355c0c9a00e65 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig @@ -45,7 +46,7 @@ def setUp(self): starts = self.params_starts ends = self.params_ends slice_out = paddle.slice(data, axes=axes, starts=starts, ends=ends) - out = fluid.layers.batch_norm(slice_out, is_test=True) + out = nn.batch_norm(slice_out, is_test=True) self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("float32"), @@ -115,7 +116,7 @@ def setUp(self): ends = self.params_ends slice_out = paddle.slice(data, axes=axes, starts=starts, ends=ends) cast_out = fluid.layers.cast(slice_out, 'float32') - out = fluid.layers.batch_norm(cast_out, is_test=True) + out = nn.batch_norm(cast_out, is_test=True) self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("int32"), @@ -140,7 +141,7 @@ def setUp(self): ends = self.params_ends slice_out = paddle.slice(data, axes=axes, starts=starts, ends=ends) cast_out = fluid.layers.cast(slice_out, 'float32') - out = fluid.layers.batch_norm(cast_out, is_test=True) + out = nn.batch_norm(cast_out, is_test=True) self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("int32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index a0f034462f3ba..c864cc91c3c33 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -62,7 +63,7 @@ def setUp(self): name="data2", shape=[-1, 3, 64, 64], dtype="float32" ) concat_out = fluid.layers.concat([data1, data2], axis=2) - out = fluid.layers.batch_norm(concat_out, is_test=True) + out = nn.batch_norm(concat_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([1, 3, 64, 64]).astype("float32"), @@ -89,7 +90,7 @@ def setUp(self): name="data", shape=[-1, 3, 64, 64], dtype="float32" ) split_out = fluid.layers.split(data, dim=-1, num_or_sections=2) - out = fluid.layers.batch_norm(split_out[0], is_test=True) + out = nn.batch_norm(split_out[0], is_test=True) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } @@ -115,7 +116,7 @@ def setUp(self): name="data", shape=[-1, 3, 64, 64], dtype="float32" ) split_out = fluid.layers.split(data, dim=-1, num_or_sections=2) - out = fluid.layers.batch_norm(split_out[0], is_test=True) + out = nn.batch_norm(split_out[0], is_test=True) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } @@ -143,7 +144,7 @@ def setUp(self): name="data", shape=[-1, 3, 64, 64], dtype="float32" ) split_out = fluid.layers.split(data, dim=-1, num_or_sections=2) - out = fluid.layers.batch_norm(split_out[0], is_test=True) + out = nn.batch_norm(split_out[0], is_test=True) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } @@ -216,7 +217,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) transpose_out = self.append_transpose(data) - out = fluid.layers.batch_norm(transpose_out, is_test=True) + out = nn.batch_norm(transpose_out, is_test=True) self.feeds = { "data": np.random.random([1, 6, 64, 
64]).astype("float32"), } @@ -366,7 +367,7 @@ def setUp(self): name="data2", shape=[-1, 3, 64, 64], dtype="float32" ) eltwise_out = self.append_eltwise(data1, data2) - out = fluid.layers.batch_norm(eltwise_out, is_test=True) + out = nn.batch_norm(eltwise_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([1, 3, 64, 64]).astype("float32"), @@ -419,7 +420,7 @@ def setUp(self): ) data2 = fluid.data(name="data2", shape=[64, 64], dtype="float32") eltwise_out = self.append_eltwise(data1, data2) - out = fluid.layers.batch_norm(eltwise_out, is_test=True) + out = nn.batch_norm(eltwise_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([64, 64]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py index 45fa629f6cd30..9557f8c71c904 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py @@ -30,7 +30,7 @@ def setUp(self): name="data", shape=[4, 3, 224, 256], dtype="float32" ) tile_out = paddle.tile(x=data, repeat_times=[1, 1, 1, 1]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([4, 3, 224, 256]).astype("float32"), @@ -55,7 +55,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32") tile_out = paddle.tile(x=data, repeat_times=[1, 4, 1080, 1920]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([1, 1, 1, 1]).astype("float32"), @@ -80,7 +80,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32") tile_out = paddle.tile(x=data, repeat_times=[1, 4, 1080, 1920]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([1, 1, 1, 1]).astype("float32"), @@ -105,7 +105,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32") tile_out = paddle.tile(x=data, repeat_times=[1, 4, 1080, 1920]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([1, 1, 1, 1]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py index 192274ef34106..ff464a0e1e058 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py @@ -42,7 +42,7 @@ def setUp(self): # There is no parameters for above structure. # Hence, append a batch_norm to avoid failure caused by load_combined. 
reshape_out = paddle.reshape(concat_out, [-1, 0, 1, 1]) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = paddle.static.nn.batch_norm(reshape_out, is_test=True) self.feeds = { "data1": np.random.random([8, 32, 128]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py index a528731001711..6412c4b5f5a93 100644 --- a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py @@ -86,7 +86,7 @@ def get_model( ) if self.bn_dtype == np.float16: conv = fluid.layers.cast(conv, 'float16') - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( conv, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'), diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py index 53b78e18f8861..29be16759e9c2 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -742,12 +742,12 @@ def test_errors(self): x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() ) - self.assertRaises(TypeError, fluid.layers.batch_norm, x1) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x1) # the input dtype of batch_norm must be float16 or float32 or float64 # float16 only can be set on GPU place x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") - self.assertRaises(TypeError, fluid.layers.batch_norm, x2) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x2) class TestDygraphBatchNormAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py index 2fd353af1a2dc..49d4f92bdf983 100644 --- a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py @@ -88,7 +88,7 @@ def get_model( bias_attr=False, use_cudnn=use_cudnn, ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( conv, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'), diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index 146cd58bcf438..15b47d427395c 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -80,7 +80,7 @@ def conv_bn_layer( return ( conv if remove_bn - else fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + else paddle.static.nn.batch_norm(input=conv, act=act, momentum=0.1) ) diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py index 4f90fe3cc0966..9d124ee509200 100644 --- a/python/paddle/fluid/tests/unittests/simple_nets.py +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -53,7 +53,7 @@ def batchnorm_fc_with_inputs(img, label, class_num=10): ), ) - hidden = fluid.layers.batch_norm(input=hidden) + hidden = paddle.static.nn.batch_norm(input=hidden) prediction = fluid.layers.fc(hidden, size=class_num, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py 
b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 26eb0a628ab9a..54e74ade09aef 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -46,7 +46,7 @@ def convolutional_neural_network(use_py_reader): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 34b358130219d..079628658addb 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -759,12 +759,12 @@ def test_errors(self): x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() ) - self.assertRaises(TypeError, fluid.layers.batch_norm, x1) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x1) # the input dtype of batch_norm must be float16 or float32 or float64 # float16 only can be set on GPU place x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") - self.assertRaises(TypeError, fluid.layers.batch_norm, x2) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x2) class TestDygraphBatchNormAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 978298f8f859d..028954d22ffdc 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -34,7 +34,7 @@ def conv_net(self, img, label): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index a04e845db0af4..9a7a907321089 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -41,7 +41,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): name='batch_norm_b', initializer=fluid.initializer.Constant(value=0.0), ) - hidden2 = fluid.layers.batch_norm( + hidden2 = paddle.static.nn.batch_norm( input=hidden1, param_attr=param_attr, bias_attr=bias_attr, @@ -49,7 +49,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): data_layout='NHWC', ) hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu') - hidden4 = fluid.layers.batch_norm( + hidden4 = paddle.static.nn.batch_norm( input=hidden3, act='relu', data_layout='NHWC' ) prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index 68d11d0897279..1b83dfa2b010d 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -87,7 +87,7 @@ def build_fused_program( bias_attr=False, data_format='NHWC', ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( input=conv1_1, 
param_attr=self.bn_param_attr1, bias_attr=self.bn_bias_attr1, @@ -133,7 +133,7 @@ def build_origin_program( bias_attr=False, data_format='NHWC', ) - bn1 = fluid.layers.batch_norm( + bn1 = paddle.static.nn.batch_norm( input=conv1_1, param_attr=self.bn_param_attr1, bias_attr=self.bn_bias_attr1, @@ -150,7 +150,7 @@ def build_origin_program( bias_attr=False, data_format='NHWC', ) - bn2 = fluid.layers.batch_norm( + bn2 = paddle.static.nn.batch_norm( input=conv1_1, param_attr=self.bn_param_attr2, bias_attr=self.bn_bias_attr2, diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index a136a623d0331..025e12c02c611 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -23,7 +23,7 @@ def norm(*args, **kargs): - return fluid.layers.batch_norm(*args, **kargs) + return paddle.static.nn.batch_norm(*args, **kargs) def sep_conv(input, channel, stride, filter, dilation=1, act=None): diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py index 7a61eaaa04937..2d39fb4ab70c4 100644 --- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py +++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py @@ -42,9 +42,9 @@ def test_batch_norm_layer(self): images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32' ) - hidden1 = fluid.layers.batch_norm(input=images) + hidden1 = paddle.static.nn.batch_norm(input=images) hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu') - fluid.layers.batch_norm(input=hidden2) + paddle.static.nn.batch_norm(input=hidden2) print(str(main_program)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 573c1699acd9e..05c7542792cc6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -55,8 +55,8 @@ def testLoadStaticModel(self): batchnorm_in = fluid.data( name="batchnorm_in", shape=[None, 10], dtype='float32' ) - batchnorm_out_1 = fluid.layers.batch_norm(batchnorm_in) - batchnorm_out_2 = fluid.layers.batch_norm(batchnorm_in) + batchnorm_out_1 = paddle.static.nn.batch_norm(batchnorm_in) + batchnorm_out_2 = paddle.static.nn.batch_norm(batchnorm_in) emb_in = fluid.data(name='emb_in', shape=[None, 10], dtype='int64') emb_out_1 = fluid.embedding(emb_in, [1000, 100]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index d129a9270ab5d..ee2cc13d6a8c0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -33,7 +33,7 @@ def convolutional_neural_network(img): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index fd9d7a26b1abc..299d3218cfdba 100644 --- 
a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -56,7 +56,7 @@ def build_program( stop_gradient=False, ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( data, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'), diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index b7a0ab0d45042..3c5f2edc4f53b 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -38,7 +38,7 @@ def fc_with_batchnorm(use_feed): ), ) - hidden = fluid.layers.batch_norm(input=hidden) + hidden = paddle.static.nn.batch_norm(input=hidden) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = paddle.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 043321bf566cc..99fb5fac4ae6b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2863,7 +2863,7 @@ def make_batch_norm(self): data = self._get_data( name='data', shape=[32, 128, 128], dtype="float32" ) - out = layers.batch_norm(data) + out = paddle.static.nn.batch_norm(data) return out def make_batch_norm_momentum_variable(self): @@ -2879,7 +2879,7 @@ def make_batch_norm_momentum_variable(self): dtype='float32', append_batch_size=False, ) - out = layers.batch_norm(data, momentum=momentum) + out = paddle.static.nn.batch_norm(data, momentum=momentum) return out def make_range(self): diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index 98518f52f669a..db4af74fc35bb 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -33,7 +33,7 @@ def convolutional_neural_network(img): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py index 9d42b68e144ba..e024917a30682 100644 --- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py +++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py @@ -45,7 +45,7 @@ def conv_net(use_feed): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32) conv_pool_2 = fluid.nets.simple_img_conv_pool( diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index bbcb5ef7b9b85..72efd20c6d116 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -171,7 +171,7 @@ def func(self, place): eps = 0.005 atol = 1e-4 x = paddle.create_parameter(dtype=dtype, shape=self.shape, name='x') - z = fluid.layers.batch_norm( + z = paddle.static.nn.batch_norm( 
input=x, data_layout=self.data_layout, use_global_stats=self.use_global_stats, @@ -251,7 +251,7 @@ def func(self, place): self.shape[1] if self.data_layout == 'NCHW' else self.shape[-1] ) x = paddle.create_parameter(dtype=dtype, shape=self.shape, name='x') - z = fluid.layers.batch_norm( + z = paddle.static.nn.batch_norm( input=x, data_layout=self.data_layout, use_global_stats=self.use_global_stats, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index f00595b1145e7..f8f65b63b8003 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -26,10 +26,10 @@ def Lenet(data, class_dim): conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None) - bn1 = fluid.layers.batch_norm(conv1, act='relu') + bn1 = paddle.static.nn.batch_norm(conv1, act='relu') pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None) - bn2 = fluid.layers.batch_norm(conv2, act='relu') + bn2 = paddle.static.nn.batch_norm(conv2, act='relu') pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) fc1 = fluid.layers.fc(pool2, size=50, act='relu') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 4dc0020b91fd4..e86a09e898ec1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -58,7 +58,7 @@ def fc_with_batchnorm(use_feed): ), ) - hidden = fluid.layers.batch_norm(input=hidden) + hidden = paddle.static.nn.batch_norm(input=hidden) with fluid.name_scope("fc_layer"): prediction = fluid.layers.fc(hidden, size=10, act='softmax') with fluid.name_scope("loss"): diff --git a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py index 0a43e57e903eb..da02e4621d0c2 100644 --- a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py +++ b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+
 import unittest
+import paddle
 import paddle.fluid as fluid
@@ -28,7 +30,7 @@ def test_set_bool_attr(self):
            name='batch_norm_b',
            initializer=fluid.initializer.Constant(value=0.0),
        )
-        bn = fluid.layers.batch_norm(
+        bn = paddle.static.nn.batch_norm(
            input=x, param_attr=param_attr, bias_attr=bias_attr
        )
        block = fluid.default_main_program().desc.block(0)
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index ef42ab8a52259..10755fb729b2d 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -81,7 +81,7 @@ def _build_program(
            bias_attr=False,
            use_cudnn=use_cudnn,
        )
-        bn = fluid.layers.batch_norm(
+        bn = paddle.static.nn.batch_norm(
            conv,
            param_attr=fluid.ParamAttr(name='bn_scale'),
            bias_attr=fluid.ParamAttr(name='bn_bias'),
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index 983138ce976c2..e638ca5531721 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -74,11 +74,13 @@
 from ..fluid.contrib.layers import ctr_metric_bundle  # noqa: F401
 from ..fluid.layers import exponential_decay  # noqa: F401
+from .nn.common import batch_norm  # noqa: F401
 from paddle.static.nn.metric import auc  # noqa: F401
 from paddle.static.nn.metric import accuracy  # noqa: F401

 __all__ = [  # noqa
     'append_backward',
+    'batch_norm',
     'gradients',
     'Executor',
     'global_scope',
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index 9635811f6a818..cae4b52fe4c59 100755
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 from .common import fc  # noqa: F401
+from .common import batch_norm  # noqa: F401
 from .common import instance_norm  # noqa: F401
 from .common import data_norm  # noqa: F401
 from .common import group_norm  # noqa: F401
@@ -22,9 +23,7 @@
 from .common import conv3d_transpose  # noqa: F401
 from .common import bilinear_tensor_product  # noqa: F401
 from .common import py_func  # noqa: F401
-
 from ...tensor.creation import create_parameter  # noqa: F401
-from ...fluid.layers import batch_norm  # noqa: F401
 from ...fluid.layers import case  # noqa: F401
 from ...fluid.layers import cond  # noqa: F401
 from ...fluid.layers import conv2d  # noqa: F401
diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py
index 420a00ddbdc51..f74e6aa605a36 100755
--- a/python/paddle/static/nn/common.py
+++ b/python/paddle/static/nn/common.py
@@ -2160,6 +2160,328 @@ def bilinear_tensor_product(
     return helper.append_activation(out)

+
+def batch_norm(
+    input,
+    act=None,
+    is_test=False,
+    momentum=0.9,
+    epsilon=1e-05,
+    param_attr=None,
+    bias_attr=None,
+    data_layout='NCHW',
+    in_place=False,
+    name=None,
+    moving_mean_name=None,
+    moving_variance_name=None,
+    do_model_average_for_mean_and_var=True,
+    use_global_stats=False,
+):
+    r"""
+
+    **Batch Normalization Layer**
+
+    Can be used as a normalizer function for convolution or fully_connected operations.
+    The required data format for this layer is one of the following:
+
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_
+    for more details.
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+
+        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i
+            \qquad &// \text{mini-batch mean} \\
+        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m} (x_i - \mu_{\beta})^2
+            \qquad &// \text{mini-batch variance} \\
+        \hat{x_i} &\gets \frac{x_i - \mu_{\beta}}{\sqrt{\sigma_{\beta}^{2} + \epsilon}}
+            \qquad &// \text{normalize} \\
+        y_i &\gets \gamma \hat{x_i} + \beta
+            \qquad &// \text{scale and shift}
+
+        moving\_mean = moving\_mean * momentum + mini\text{-}batch\_mean * (1. - momentum) \\
+        moving\_var = moving\_var * momentum + mini\text{-}batch\_var * (1. - momentum)
+
+    moving_mean is the global (running) mean and moving_var is the global (running) variance.
+
+    When use_global_stats = True, :math:`\mu_{\beta}`
+    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
+    They are global (or running) statistics, usually obtained from a
+    pre-trained model.
+    Training and testing (or inference) then have the same behavior:
+
+    ..  math::
+
+        \hat{x_i} &\gets \frac{x_i - \mu_{\beta}}{\sqrt{\sigma_{\beta}^{2} + \epsilon}} \\
+        y_i &\gets \gamma \hat{x_i} + \beta
+
+    Note:
+        If build_strategy.sync_batch_norm=True, the batch_norm ops in the network will use
+        sync_batch_norm automatically.
+        `is_test = True` can only be used in test and inference programs; `is_test` CANNOT be
+        set to True in a train program. If you want to use the global statistics of a
+        pre-trained model in a train program, set `use_global_stats = True` instead.
+
+    Args:
+        input(Tensor): The rank of input Tensor can be 2, 3, 4, 5. The data type
+            is float16 or float32 or float64.
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        is_test (bool, Default False): A flag indicating whether it is in
+            test phase or not.
+        momentum(float|Tensor, Default 0.9): The value used for the moving_mean and
+            moving_var computation. This should be a float number or a Tensor with
+            shape [1] and data type as float32. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        epsilon(float, Default 1e-05): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        param_attr(ParamAttr|None): The parameter attribute for the Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create a ParamAttr as param_attr; the name of the scale parameter can be
+            set in ParamAttr. If the Initializer of the param_attr is not set, the
+            parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create a ParamAttr as bias_attr; the name of the bias can be set in ParamAttr.
+            If the Initializer of the bias_attr is not set, the bias is initialized to zero.
+            Default: None.
+        data_layout (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
+            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`.
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
+        name(str|None): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually name does not need to be set and is None by default.
+        moving_mean_name(str, Default None): The name of moving_mean which stores the global mean. If it
+            is set to None, batch_norm will save the global mean with a random name; otherwise, batch_norm
+            will save the global mean with the given string.
+        moving_variance_name(str, Default None): The name of moving_variance which stores the global variance.
+            If it is set to None, batch_norm will save the global variance with a random name; otherwise,
+            batch_norm will save the global variance with the given string.
+        do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model
+            average when model average is enabled.
+        use_global_stats(bool, Default False): Whether to use global mean and
+            variance. In inference or test mode, setting either use_global_stats
+            or is_test to True has the same effect.
+            In train mode, when setting use_global_stats True, the global mean
+            and variance are also used during training.
+
+    Returns:
+        A Tensor which is the result of applying batch normalization on the input.
+        It has the same shape and data type as the input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            paddle.enable_static()
+            x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32')
+            hidden1 = paddle.static.nn.fc(x=x, size=200)
+            print(hidden1.shape)
+            # [3, 200]
+            hidden2 = paddle.static.nn.batch_norm(input=hidden1)
+            print(hidden2.shape)
+            # [3, 200]
+    """
+    assert (
+        bias_attr is not False
+    ), "bias_attr should not be False in batch_norm."
+    helper = LayerHelper('batch_norm', **locals())
+
+    check_variable_and_dtype(
+        input, 'input', ['float16', 'float32', 'float64'], 'batch_norm'
+    )
+    dtype = helper.input_dtype()
+
+    # use fp32 for bn parameter
+    if dtype == core.VarDesc.VarType.FP16:
+        dtype = core.VarDesc.VarType.FP32
+
+    input_shape = input.shape
+    if data_layout == 'NCHW':
+        channel_num = input_shape[1]
+    else:
+        if data_layout == 'NHWC':
+            channel_num = input_shape[-1]
+        else:
+            raise ValueError("unsupported data layout:" + data_layout)
+
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        default_initializer=paddle.fluid.initializer.Constant(1.0),
+    )
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True
+    )
+
+    mean = helper.create_parameter(
+        attr=paddle.ParamAttr(
+            name=moving_mean_name,
+            initializer=paddle.fluid.initializer.Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var,
+        ),
+        shape=param_shape,
+        dtype=dtype,
+    )
+    mean.stop_gradient = True
+
+    variance = helper.create_parameter(
+        attr=paddle.ParamAttr(
+            name=moving_variance_name,
+            initializer=paddle.fluid.initializer.Constant(1.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var,
+        ),
+        shape=param_shape,
+        dtype=dtype,
+    )
+    variance.stop_gradient = True
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance_out share the same memory
+    variance_out = variance
+
+    if _non_static_mode():
+        # dygraph / eager path: call the legacy C++ batch_norm op directly
+        inputs_has_MomemtumTensor = False
+        attrs_has_momentum = False
+        tmp_tensor_type = core.eager.Tensor
+        if isinstance(momentum, tmp_tensor_type):
+            inputs_has_MomemtumTensor = True
+        else:
+            attrs_has_momentum = True
+
+        attrs_ = ()
+        if attrs_has_momentum:
+            attrs_ = (
+                'momentum',
+                momentum,
+                'epsilon',
+                epsilon,
+                'is_test',
+                is_test,
+                'data_layout',
+                data_layout,
+                'use_mkldnn',
+                False,
+                'fuse_with_relu',
False, + 'use_global_stats', + use_global_stats, + ) + else: + attrs_ = ( + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + ) + if inputs_has_MomemtumTensor: + batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.batch_norm( + input, + scale, + bias, + mean, + variance, + momentum, + mean_out, + variance_out, + *attrs_, + ) + else: + batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.batch_norm( + input, + scale, + bias, + mean, + variance, + None, + mean_out, + variance_out, + *attrs_, + ) + + return paddle.fluid.dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=act, use_mkldnn=False + ) + + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + reserve_space = None + if not is_test: + reserve_space = helper.create_variable_for_type_inference( + dtype=helper.input_dtype(), stop_gradient=True + ) + + batch_norm_out = ( + input if in_place else helper.create_variable_for_type_inference(dtype) + ) + + inputs = { + "X": input, + "Scale": scale, + "Bias": bias, + "Mean": mean, + "Variance": variance, + "MeanOut": mean_out, + "VarianceOut": variance_out, + } + attrs = { + "epsilon": epsilon, + "is_test": is_test, + "data_layout": data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + if isinstance(momentum, paddle.static.Variable): + inputs['MomemtumTensor'] = momentum + else: + attrs['momentum'] = momentum + + outputs = { + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + } + if reserve_space is not None: + outputs["ReserveSpace"] = reserve_space + + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) + + return helper.append_activation(batch_norm_out) + + @static_only def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r"""
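For readers who want to try the relocated API end to end, the following is a minimal sketch (not part of the patch) of a static-graph program that uses the new entry point paddle.static.nn.batch_norm; the conv2d settings and tensor shapes are illustrative assumptions rather than anything prescribed by this change.

.. code-block:: python

    import numpy as np
    import paddle

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[-1, 3, 32, 32], dtype='float32')
        # hypothetical conv settings, only to give batch_norm a 4-D NCHW input
        conv = paddle.static.nn.conv2d(input=x, num_filters=8, filter_size=3)
        # batch_norm is now reachable as paddle.static.nn.batch_norm
        # (and re-exported as paddle.static.batch_norm)
        bn = paddle.static.nn.batch_norm(input=conv, act='relu', data_layout='NCHW')

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    (out,) = exe.run(
        main_prog,
        feed={'x': np.random.rand(4, 3, 32, 32).astype('float32')},
        fetch_list=[bn],
    )
    print(out.shape)  # (4, 8, 30, 30)

At inference time the same layer would typically be built with is_test=True, or with use_global_stats=True when reusing pre-trained statistics, which is the behavior described in the docstring above.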