onnx · askhade · Apr 9, 2021 · Apr 6, 2021 · Apr 7, 2021 · Apr 7, 2021
diff --git a/docs/Changelog.md b/docs/Changelog.md
@@ -18648,6 +18648,11 @@ This version of the operator has been available since version 14 of the default
 
   current_mean = ReduceMean(X, axis=all_except_channel_index)
   current_var =  ReduceVar(X, axis=all_except_channel_index)
+
+  Note that ReduceVar refers to the population variance, which equals
+  sum((x_i - x_avg)^2) / N
+  where N is the population size (the divisor is N, not the N - 1 used for the sample variance).
+
   ```
 
   When training_mode=False:
@@ -18697,7 +18702,7 @@ This version of the operator has been available since version 14 of the default
 <dt><tt>running_mean</tt> (optional, non-differentiable) : T</dt>
 <dd>The running mean after the BatchNormalization operator.</dd>
 <dt><tt>running_var</tt> (optional, non-differentiable) : T</dt>
-<dd>The running variance after the BatchNormalization operator.</dd>
+<dd>The running variance after the BatchNormalization operator. This op computes the population variance (dividing by N), not the sample variance (dividing by N-1).</dd>
 </dl>
 
 #### Type Constraints

diff --git a/docs/Operators.md b/docs/Operators.md
@@ -1878,6 +1878,11 @@ expect(node, inputs=[x], outputs=[y], name='test_averagepool_3d_default')
 
   current_mean = ReduceMean(X, axis=all_except_channel_index)
   current_var =  ReduceVar(X, axis=all_except_channel_index)
+
+  Note that ReduceVar refers to the population variance, which equals
+  sum((x_i - x_avg)^2) / N
+  where N is the population size (the divisor is N, not the N - 1 used for the sample variance).
+
   ```
 
   When training_mode=False:
@@ -1929,7 +1934,7 @@ Other versions of this operator: <a href="Changelog.md#BatchNormalization-1">1</
 <dt><tt>running_mean</tt> (optional, non-differentiable) : T</dt>
 <dd>The running mean after the BatchNormalization operator.</dd>
 <dt><tt>running_var</tt> (optional, non-differentiable) : T</dt>
-<dd>The running variance after the BatchNormalization operator.</dd>
+<dd>The running variance after the BatchNormalization operator. This op computes the population variance (dividing by N), not the sample variance (dividing by N-1).</dd>
 </dl>
 
 #### Type Constraints
@@ -1946,12 +1951,12 @@ Other versions of this operator: <a href="Changelog.md#BatchNormalization-1">1</
 <summary>batchnormalization</summary>
 
 ```python
-# input size: (1, 2, 1, 3)
-x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32)
-s = np.array([1.0, 1.5]).astype(np.float32)
-bias = np.array([0, 1]).astype(np.float32)
-mean = np.array([0, 3]).astype(np.float32)
-var = np.array([1, 1.5]).astype(np.float32)
+# input size: (2, 3, 4, 5)
+x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+s = np.random.randn(3).astype(np.float32)
+bias = np.random.randn(3).astype(np.float32)
+mean = np.random.randn(3).astype(np.float32)
+var = np.random.rand(3).astype(np.float32)
 y = _batchnorm_test_mode(x, s, bias, mean, var).astype(np.float32)
 
 node = onnx.helper.make_node(
@@ -1960,7 +1965,7 @@ node = onnx.helper.make_node(
     outputs=['y'],
 )
 
-# output size: (1, 2, 1, 3)
+# output size: (2, 3, 4, 5)
 expect(node, inputs=[x, s, bias, mean, var], outputs=[y],
        name='test_batchnorm_example')
 
@@ -1992,12 +1997,12 @@ expect(node, inputs=[x, s, bias, mean, var], outputs=[y],
 <summary>train</summary>
 
 ```python
-# input size: (1, 2, 1, 3)
-x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32)
-s = np.array([1.0, 1.5]).astype(np.float32)
-bias = np.array([0, 1]).astype(np.float32)
-mean = np.array([0, 3]).astype(np.float32)
-var = np.array([1, 1.5]).astype(np.float32)
+# input size: (2, 3, 4, 5)
+x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+s = np.random.randn(3).astype(np.float32)
+bias = np.random.randn(3).astype(np.float32)
+mean = np.random.randn(3).astype(np.float32)
+var = np.random.rand(3).astype(np.float32)
 # using np.bool(1) while generating test data with "'bool' object has no attribute 'dtype'"
 # working around by using np.byte(1).astype(bool)
 training_mode = 1
@@ -2010,7 +2015,7 @@ node = onnx.helper.make_node(
     training_mode=training_mode
 )
 
-# output size: (1, 2, 1, 3)
+# output size: (2, 3, 4, 5)
 expect(node, inputs=[x, s, bias, mean, var],
        outputs=[y, output_mean, output_var],
        name='test_batchnorm_example_training_mode')

diff --git a/docs/TestCoverage.md b/docs/TestCoverage.md
@@ -1323,12 +1323,12 @@ There are 2 test cases, listed as following:
 <summary>batchnormalization</summary>
 
 ```python
-# input size: (1, 2, 1, 3)
-x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32)
-s = np.array([1.0, 1.5]).astype(np.float32)
-bias = np.array([0, 1]).astype(np.float32)
-mean = np.array([0, 3]).astype(np.float32)
-var = np.array([1, 1.5]).astype(np.float32)
+# input size: (2, 3, 4, 5)
+x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+s = np.random.randn(3).astype(np.float32)
+bias = np.random.randn(3).astype(np.float32)
+mean = np.random.randn(3).astype(np.float32)
+var = np.random.rand(3).astype(np.float32)
 y = _batchnorm_test_mode(x, s, bias, mean, var).astype(np.float32)
 
 node = onnx.helper.make_node(
@@ -1337,7 +1337,7 @@ node = onnx.helper.make_node(
     outputs=['y'],
 )
 
-# output size: (1, 2, 1, 3)
+# output size: (2, 3, 4, 5)
 expect(node, inputs=[x, s, bias, mean, var], outputs=[y],
        name='test_batchnorm_example')
 
@@ -1367,12 +1367,12 @@ expect(node, inputs=[x, s, bias, mean, var], outputs=[y],
 <summary>train</summary>
 
 ```python
-# input size: (1, 2, 1, 3)
-x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32)
-s = np.array([1.0, 1.5]).astype(np.float32)
-bias = np.array([0, 1]).astype(np.float32)
-mean = np.array([0, 3]).astype(np.float32)
-var = np.array([1, 1.5]).astype(np.float32)
+# input size: (2, 3, 4, 5)
+x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+s = np.random.randn(3).astype(np.float32)
+bias = np.random.randn(3).astype(np.float32)
+mean = np.random.randn(3).astype(np.float32)
+var = np.random.rand(3).astype(np.float32)
 # using np.bool(1) while generating test data with "'bool' object has no attribute 'dtype'"
 # working around by using np.byte(1).astype(bool)
 training_mode = 1
@@ -1385,7 +1385,7 @@ node = onnx.helper.make_node(
     training_mode=training_mode
 )
 
-# output size: (1, 2, 1, 3)
+# output size: (2, 3, 4, 5)
 expect(node, inputs=[x, s, bias, mean, var],
        outputs=[y, output_mean, output_var],
        name='test_batchnorm_example_training_mode')

diff --git a/onnx/backend/test/case/node/batchnorm.py b/onnx/backend/test/case/node/batchnorm.py
@@ -35,12 +35,12 @@ def _batchnorm_training_mode(x, s, bias, mean, var, momentum=0.9, epsilon=1e-5):
 class BatchNormalization(Base):
     @staticmethod
     def export():  # type: () -> None
-        # input size: (1, 2, 1, 3)
-        x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32)
-        s = np.array([1.0, 1.5]).astype(np.float32)
-        bias = np.array([0, 1]).astype(np.float32)
-        mean = np.array([0, 3]).astype(np.float32)
-        var = np.array([1, 1.5]).astype(np.float32)
+        # input size: (2, 3, 4, 5)
+        x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+        s = np.random.randn(3).astype(np.float32)
+        bias = np.random.randn(3).astype(np.float32)
+        mean = np.random.randn(3).astype(np.float32)
+        var = np.random.rand(3).astype(np.float32)
         y = _batchnorm_test_mode(x, s, bias, mean, var).astype(np.float32)
 
         node = onnx.helper.make_node(
@@ -49,7 +49,7 @@ def export():  # type: () -> None
             outputs=['y'],
         )
 
-        # output size: (1, 2, 1, 3)
+        # output size: (2, 3, 4, 5)
         expect(node, inputs=[x, s, bias, mean, var], outputs=[y],
                name='test_batchnorm_example')
 
@@ -75,12 +75,12 @@ def export():  # type: () -> None
 
     @staticmethod
     def export_train():  # type: () -> None
-        # input size: (1, 2, 1, 3)
-        x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32)
-        s = np.array([1.0, 1.5]).astype(np.float32)
-        bias = np.array([0, 1]).astype(np.float32)
-        mean = np.array([0, 3]).astype(np.float32)
-        var = np.array([1, 1.5]).astype(np.float32)
+        # input size: (2, 3, 4, 5)
+        x = np.random.randn(2, 3, 4, 5).astype(np.float32)
+        s = np.random.randn(3).astype(np.float32)
+        bias = np.random.randn(3).astype(np.float32)
+        mean = np.random.randn(3).astype(np.float32)
+        var = np.random.rand(3).astype(np.float32)
         # using np.bool(1) while generating test data with "'bool' object has no attribute 'dtype'"
         # working around by using np.byte(1).astype(bool)
         training_mode = 1
@@ -93,7 +93,7 @@ def export_train():  # type: () -> None
             training_mode=training_mode
         )
 
-        # output size: (1, 2, 1, 3)
+        # output size: (2, 3, 4, 5)
         expect(node, inputs=[x, s, bias, mean, var],
                outputs=[y, output_mean, output_var],
                name='test_batchnorm_example_training_mode')

diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_1.pb
@@ -1 +1 @@
-BsJٺ�>*������>
+BsJ��6?�a:���V?
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_2.pb
@@ -1 +1 @@
-BbiasJǩ?3�1��9�
+BbiasJ=��?tt���K�
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_3.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_3.pb
@@ -1 +1 @@
-BmeanJr�޾���?�,?
+BmeanJ�D��f�<����
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_4.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/input_4.pb
@@ -1 +1 @@
-BvarJǻy?�[?n�?<
+BvarJc�6=l�L?W��=
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon/test_data_set_0/output_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_1.pb
@@ -1 +1 @@
-BsJٺ�>*������>
+BsJ��6?�a:���V?
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_2.pb
@@ -1 +1 @@
-BbiasJǩ?3�1��9�
+BbiasJ=��?tt���K�
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_3.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_3.pb
@@ -1 +1 @@
-BmeanJr�޾���?�,?
+BmeanJ�D��f�<����
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_4.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/input_4.pb
@@ -1 +1 @@
-BvarJǻy?�[?n�?<
+BvarJc�6=l�L?W��=
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/output_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/output_1.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/output_1.pb
@@ -1 +1 @@
-Boutput_meanJ��¾u�?N2?
+Boutput_meanJX����	2�����
diff --git a/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/output_2.pb b/onnx/backend/test/data/node/test_batchnorm_epsilon_training_mode/test_data_set_0/output_2.pb
@@ -1,2 +1,2 @@
 B
-output_varJ�`v?�/c?P�	>
+output_varJ�>�X?��!>
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/model.onnx b/onnx/backend/test/data/node/test_batchnorm_example/model.onnx
@@ -7,29 +7,29 @@
 vary"BatchNormalizationtest_batchnorm_exampleZ
 x
 
-
 
-
-Z
+
+
+Z
 s
 
 
-Z
+Z
 bias
 
 
-Z
+Z
 mean
 
 
-Z
+Z
 var
 
 
-b
+b
 y
 
-
 
-
-B
+
+
+B
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_1.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_2.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_3.pb b/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_3.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_4.pb b/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/input_4.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_batchnorm_example/test_data_set_0/output_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/model.onnx b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/model.onnx
@@ -9,38 +9,38 @@ output_var"BatchNormalization*
 training_mode�$test_batchnorm_example_training_modeZ
 x
 
-
 
-
-Z
+
+
+Z
 s
 
 
-Z
+Z
 bias
 
 
-Z
+Z
 mean
 
 
-Z
+Z
 var
 
 
-b
+b
 y
 
-
 
-
-b
+
+
+b
 output_mean
 
 
-b
+b
 
 output_var
 
 
-B
+B

diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_1.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_2.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_3.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_3.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_4.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/input_4.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/output_0.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/output_1.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/output_1.pb
diff --git a/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/output_2.pb b/onnx/backend/test/data/node/test_batchnorm_example_training_mode/test_data_set_0/output_2.pb
@@ -1,2 +1,2 @@
-B
-output_varJwww?UU�?
+B
+output_varJ�`v?�/c?P�	>
diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc
@@ -1613,6 +1613,11 @@ Y = (X - current_mean) / sqrt(current_var + epsilon) * scale + B
 
 current_mean = ReduceMean(X, axis=all_except_channel_index)
 current_var =  ReduceVar(X, axis=all_except_channel_index)
+
+Notice that ReduceVar refers to the population variance, and it equals to
+sum(sqrd(x_i - x_avg)) / N
+where N is the population size (this formula does not use sample size N - 1).
+
 ```
 
 When training_mode=False:
@@ -1718,7 +1723,8 @@ ONNX_OPERATOR_SET_SCHEMA(
         .Output(
             2,
             "running_var",
-            "The running variance after the BatchNormalization operator.",
+            "The running variance after the BatchNormalization operator. This op uses the population size (N) for "
+            "calculating variance, and not the sample size N-1.",
             "T",
             OpSchema::Optional,
             true,