diff --git a/docs/Changelog.md b/docs/Changelog.md
index d78144e68dd..dc4007adb67 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -13532,7 +13532,7 @@ This version of the operator has been available since version 11 of the default
axis : int (default is 0)
Which axis to split on. A negative value means counting dimensions from the back. Accepted range is [-rank, rank-1] where r = rank(input).
split : list of ints
-length of each output
+length of each output. Values should be >= 0.
#### Inputs
@@ -13588,7 +13588,7 @@ This version of the operator has been available since version 11 of the default
input : T
The tensor to split
split (optional) : I
-Length of each output. It can be either a scalar(tensor of empty shape), or a 1-D tensor. All values must be positive.
+Length of each output. It can be either a scalar(tensor of empty shape), or a 1-D tensor. All values must be >= 0.
#### Outputs
@@ -13965,6 +13965,63 @@ This version of the operator has been available since version 12 of the default
Constrain input and output types to all numeric tensors.
+### **Einsum-12**
+
+ An einsum of the form ```term1, term2 -> output-term``` produces an output tensor using the following equation
+
+ ```output[output-term] = reduce-sum( input1[term1] * input2[term2] )```
+
+ where the reduce-sum performs a summation over all the indices occurring in the input terms (term1, term2)
+ that do not occur in the output-term.
+
+ The Einsum operator evaluates algebraic tensor operations on a sequence of tensors, using the Einstein summation
+ convention. The equation string contains a comma-separated sequence of lower case letters. Each term corresponds to
+ an operand tensor, and the characters within the terms correspond to operands dimensions.
+
+ This sequence may be followed by "->" to separate the left and right hand side of the equation.
+ If the equation contains "->" followed by the right-hand side, the explicit (not classical) form of the Einstein
+ summation is performed, and the right-hand side indices indicate output tensor dimensions. In other cases,
+ output indices are (implicitly) set to the alphabetically sorted sequence of indices appearing exactly once in the
+ equation.
+
+ When a dimension character is repeated in the left-hand side, it represents summation along the dimension.
+
+ The equation may contain ellipsis ("...") to enable broadcasting. Ellipsis must indicate a fixed number of dimensions.
+ The right-hand side may contain exactly one ellipsis. In implicit mode, the ellipsis dimensions are set to the
+ beginning of the output. The equation string may contain space (U+0020) character.
+
+#### Version
+
+This version of the operator has been available since version 12 of the default ONNX operator set.
+
+#### Attributes
+
+
+- equation : string (required)
+- Einsum expression string.
+
+
+#### Inputs (1 - ∞)
+
+
+- Inputs (variadic) : T
+- Operands
+
+
+#### Outputs
+
+
+- Output : T
+- Output tensor
+
+
+#### Type Constraints
+
+
+- T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double)
+- Constrain input and output types to all numerical tensor types.
+
+
### **Gradient-12**
Gradient operator computes the partial derivatives of a specific tensor to
diff --git a/docs/Operators.md b/docs/Operators.md
index 353d4e907de..9565f0ed10b 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -37,6 +37,7 @@
* Det
* Div
* Dropout
+ * Einsum
* Elu
* Equal
* Erf
@@ -4398,6 +4399,173 @@ expect(node, inputs=[X], outputs=[Y, Y_Scale, Y_ZeroPoint],
+### **Einsum**
+
+ An einsum of the form ```term1, term2 -> output-term``` produces an output tensor using the following equation
+
+ ```output[output-term] = reduce-sum( input1[term1] * input2[term2] )```
+
+ where the reduce-sum performs a summation over all the indices occurring in the input terms (term1, term2)
+ that do not occur in the output-term.
+
+ The Einsum operator evaluates algebraic tensor operations on a sequence of tensors, using the Einstein summation
+ convention. The equation string contains a comma-separated sequence of lower case letters. Each term corresponds to
+ an operand tensor, and the characters within the terms correspond to operands dimensions.
+
+ This sequence may be followed by "->" to separate the left and right hand side of the equation.
+ If the equation contains "->" followed by the right-hand side, the explicit (not classical) form of the Einstein
+ summation is performed, and the right-hand side indices indicate output tensor dimensions. In other cases,
+ output indices are (implicitly) set to the alphabetically sorted sequence of indices appearing exactly once in the
+ equation.
+
+ When a dimension character is repeated in the left-hand side, it represents summation along the dimension.
+
+ The equation may contain ellipsis ("...") to enable broadcasting. Ellipsis must indicate a fixed number of dimensions.
+ The right-hand side may contain exactly one ellipsis. In implicit mode, the ellipsis dimensions are set to the
+ beginning of the output. The equation string may contain space (U+0020) character.
+
+#### Version
+
+This version of the operator has been available since version 12 of the default ONNX operator set.
+
+#### Attributes
+
+
+- equation : string (required)
+- Einsum expression string.
+
+
+#### Inputs (1 - ∞)
+
+
+- Inputs (variadic) : T
+- Operands
+
+
+#### Outputs
+
+
+- Output : T
+- Output tensor
+
+
+#### Type Constraints
+
+
+- T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double)
+- Constrain input and output types to all numerical tensor types.
+
+
+
+#### Examples
+
+
+einsum_batch_diagonal
+
+```python
+Eqn = '...ii ->...i'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+)
+
+X = np.random.randn(3, 5, 5)
+Z = einsum_reference_implementation(Eqn, (X,))
+
+expect(node, inputs=[X], outputs=[Z], name='test_einsum_batch_diagonal')
+```
+
+
+
+
+
+einsum_batch_matmul
+
+```python
+Eqn = 'bij, bjk -> bik'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x', 'y'],
+ outputs=['z'],
+ equation=Eqn
+)
+
+X = np.random.randn(5, 2, 3)
+Y = np.random.randn(5, 3, 4)
+Z = einsum_reference_implementation(Eqn, (X, Y))
+
+expect(node, inputs=[X, Y], outputs=[Z], name='test_einsum_batch_matmul')
+```
+
+
+
+
+
+einsum_inner_prod
+
+```python
+Eqn = 'i,i'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x', 'y'],
+ outputs=['z'],
+ equation=Eqn
+)
+
+X = np.random.randn(5)
+Y = np.random.randn(5)
+Z = einsum_reference_implementation(Eqn, (X, Y))
+
+expect(node, inputs=[X, Y], outputs=[Z], name='test_einsum_inner_prod')
+```
+
+
+
+
+
+einsum_sum
+
+```python
+Eqn = 'ij->i'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+)
+
+X = np.random.randn(3, 4)
+Z = einsum_reference_implementation(Eqn, (X,))
+
+expect(node, inputs=[X], outputs=[Z], name='test_einsum_sum')
+```
+
+
+
+
+
+einsum_transpose
+
+```python
+Eqn = 'ij->ji'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+)
+
+X = np.random.randn(3, 4)
+Y = einsum_reference_implementation(Eqn, (X,))
+
+expect(node, inputs=[X], outputs=[Y], name='test_einsum_transpose')
+```
+
+
+
+
### **Elu**
Elu takes one input data (Tensor) and produces one output data
@@ -17034,7 +17202,7 @@ Other versions of this operator: Split-1, axis : int (default is 0)
Which axis to split on. A negative value means counting dimensions from the back. Accepted range is [-rank, rank-1] where r = rank(input).
split : list of ints
-length of each output
+length of each output. Values should be >= 0.
#### Inputs
@@ -17158,6 +17326,27 @@ expect(node, inputs=[input], outputs=[y for y in expected_outputs], name='test_s
+
+zero_size_splits
+
+```python
+input = np.array([]).astype(np.float32)
+
+# Split empty tensor to tensors of size zero
+node = onnx.helper.make_node(
+ 'Split',
+ inputs=['input'],
+ outputs=['output_1', 'output_2', 'output_3'],
+ split=[0, 0, 0]
+)
+
+expected_outputs = [np.array([]).astype(np.float32), np.array([]).astype(np.float32), np.array([]).astype(np.float32)]
+expect(node, inputs=[input], outputs=[y for y in expected_outputs], name='test_split_zero_size_splits')
+```
+
+
+
+
### **SplitToSequence**
Split a tensor into a sequence of tensors, along the specified
@@ -17190,7 +17379,7 @@ This version of the operator has been available since version 11 of the default
input : T
The tensor to split
split (optional) : I
-Length of each output. It can be either a scalar(tensor of empty shape), or a 1-D tensor. All values must be positive.
+Length of each output. It can be either a scalar(tensor of empty shape), or a 1-D tensor. All values must be >= 0.
#### Outputs
diff --git a/docs/TestCoverage.md b/docs/TestCoverage.md
index 91fc03e96e2..5261813b9b8 100644
--- a/docs/TestCoverage.md
+++ b/docs/TestCoverage.md
@@ -5,7 +5,7 @@
* [Overall Test Coverage](#overall-test-coverage)
# Node Test Coverage
## Summary
-Node tests have covered 136/151 (90.07%, 5 generators excluded) common operators.
+Node tests have covered 137/152 (90.13%, 5 generators excluded) common operators.
Node tests have covered 1/1 (100.00%, 0 generators excluded) experimental operators.
@@ -2610,6 +2610,107 @@ expect(node, inputs=[X], outputs=[Y, Y_Scale, Y_ZeroPoint],
+### Einsum
+There are 5 test cases, listed as following:
+
+einsum_batch_diagonal
+
+```python
+Eqn = '...ii ->...i'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+)
+
+X = np.random.randn(3, 5, 5)
+Z = einsum_reference_implementation(Eqn, (X,))
+
+expect(node, inputs=[X], outputs=[Z], name='test_einsum_batch_diagonal')
+```
+
+
+
+einsum_batch_matmul
+
+```python
+Eqn = 'bij, bjk -> bik'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x', 'y'],
+ outputs=['z'],
+ equation=Eqn
+)
+
+X = np.random.randn(5, 2, 3)
+Y = np.random.randn(5, 3, 4)
+Z = einsum_reference_implementation(Eqn, (X, Y))
+
+expect(node, inputs=[X, Y], outputs=[Z], name='test_einsum_batch_matmul')
+```
+
+
+
+einsum_inner_prod
+
+```python
+Eqn = 'i,i'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x', 'y'],
+ outputs=['z'],
+ equation=Eqn
+)
+
+X = np.random.randn(5)
+Y = np.random.randn(5)
+Z = einsum_reference_implementation(Eqn, (X, Y))
+
+expect(node, inputs=[X, Y], outputs=[Z], name='test_einsum_inner_prod')
+```
+
+
+
+einsum_sum
+
+```python
+Eqn = 'ij->i'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+)
+
+X = np.random.randn(3, 4)
+Z = einsum_reference_implementation(Eqn, (X,))
+
+expect(node, inputs=[X], outputs=[Z], name='test_einsum_sum')
+```
+
+
+
+einsum_transpose
+
+```python
+Eqn = 'ij->ji'
+node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+)
+
+X = np.random.randn(3, 4)
+Y = einsum_reference_implementation(Eqn, (X,))
+
+expect(node, inputs=[X], outputs=[Y], name='test_einsum_transpose')
+```
+
+
+
+
### Elu
There are 2 test cases, listed as following:
@@ -9571,7 +9672,7 @@ expect(node, inputs=[x], outputs=[y],
### Split
-There are 3 test cases, listed as following:
+There are 4 test cases, listed as following:
1d
@@ -9662,6 +9763,25 @@ expected_outputs = [np.array([1., 2.]).astype(np.float32), np.array([3., 4., 5.,
expect(node, inputs=[input], outputs=[y for y in expected_outputs], name='test_split_variable_parts_default_axis')
```
+
+
+zero_size_splits
+
+```python
+input = np.array([]).astype(np.float32)
+
+# Split empty tensor to tensors of size zero
+node = onnx.helper.make_node(
+ 'Split',
+ inputs=['input'],
+ outputs=['output_1', 'output_2', 'output_3'],
+ split=[0, 0, 0]
+)
+
+expected_outputs = [np.array([]).astype(np.float32), np.array([]).astype(np.float32), np.array([]).astype(np.float32)]
+expect(node, inputs=[input], outputs=[y for y in expected_outputs], name='test_split_zero_size_splits')
+```
+
diff --git a/onnx/backend/test/case/model/sequence.py b/onnx/backend/test/case/model/sequence.py
index 1f20a026c5a..c34c3c440a0 100644
--- a/onnx/backend/test/case/model/sequence.py
+++ b/onnx/backend/test/case/model/sequence.py
@@ -238,3 +238,34 @@ def make_graph(
[pos_at])
model = onnx.helper.make_model(graph, producer_name='backend-test')
expect(model, inputs=[x], outputs=[out], name="test_sequence_model7")
+
+ #8th testcase - split zero length
+ seq_split_node = onnx.helper.make_node('SplitToSequence', ['X'], ['seq_1'])
+ seq_len_node = onnx.helper.make_node('SequenceLength', ['seq_1'], ['len'])
+
+ tensor_shape = [] # type: ignore
+ len_shape = [] # type: ignore
+
+ x = np.array([]).astype(np.float32)
+ out_len = np.int64(0)
+
+ graph = onnx.helper.make_graph(
+ nodes=[seq_split_node, seq_len_node],
+ name='Sequence',
+ inputs=[
+ onnx.helper.make_tensor_value_info(
+ 'X',
+ onnx.TensorProto.FLOAT,
+ tensor_shape), # type: ignore
+ onnx.helper.make_tensor_value_info(
+ 'Split',
+ onnx.TensorProto.INT64,
+ len_shape)], # type: ignore
+ outputs=[
+ onnx.helper.make_tensor_value_info(
+ 'len',
+ onnx.TensorProto.INT64,
+ len_shape)]) # type: ignore
+
+ model = onnx.helper.make_model(graph, producer_name='backend-test')
+ expect(model, inputs=[x], outputs=[out_len], name="test_sequence_model8")
diff --git a/onnx/backend/test/case/node/einsum.py b/onnx/backend/test/case/node/einsum.py
new file mode 100644
index 00000000000..b8637ce01a6
--- /dev/null
+++ b/onnx/backend/test/case/node/einsum.py
@@ -0,0 +1,97 @@
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np # type: ignore
+
+import onnx
+from ..base import Base
+from . import expect
+from typing import Tuple, Text
+
+
+def einsum_reference_implementation(Eqn, Operands): # type: (Text, Tuple[np.ndarray, ...]) -> np.ndarray
+ Z = np.einsum(Eqn, *Operands)
+ return Z
+
+
+class Einsum(Base):
+
+ @staticmethod
+ def export_einsum_transpose(): # type: () -> None
+ Eqn = 'ij->ji'
+ node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+ )
+
+ X = np.random.randn(3, 4)
+ Y = einsum_reference_implementation(Eqn, (X,))
+
+ expect(node, inputs=[X], outputs=[Y], name='test_einsum_transpose')
+
+ @staticmethod
+ def export_einsum_sum(): # type: () -> None
+ Eqn = 'ij->i'
+ node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+ )
+
+ X = np.random.randn(3, 4)
+ Z = einsum_reference_implementation(Eqn, (X,))
+
+ expect(node, inputs=[X], outputs=[Z], name='test_einsum_sum')
+
+ @staticmethod
+ def export_einsum_batch_diagonal(): # type: () -> None
+ Eqn = '...ii ->...i'
+ node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x'],
+ outputs=['y'],
+ equation=Eqn
+ )
+
+ X = np.random.randn(3, 5, 5)
+ Z = einsum_reference_implementation(Eqn, (X,))
+
+ expect(node, inputs=[X], outputs=[Z], name='test_einsum_batch_diagonal')
+
+ @staticmethod
+ def export_einsum_inner_prod(): # type: () -> None
+ Eqn = 'i,i'
+ node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x', 'y'],
+ outputs=['z'],
+ equation=Eqn
+ )
+
+ X = np.random.randn(5)
+ Y = np.random.randn(5)
+ Z = einsum_reference_implementation(Eqn, (X, Y))
+
+ expect(node, inputs=[X, Y], outputs=[Z], name='test_einsum_inner_prod')
+
+ @staticmethod
+ def export_einsum_batch_matmul(): # type: () -> None
+ Eqn = 'bij, bjk -> bik'
+ node = onnx.helper.make_node(
+ 'Einsum',
+ inputs=['x', 'y'],
+ outputs=['z'],
+ equation=Eqn
+ )
+
+ X = np.random.randn(5, 2, 3)
+ Y = np.random.randn(5, 3, 4)
+ Z = einsum_reference_implementation(Eqn, (X, Y))
+
+ expect(node, inputs=[X, Y], outputs=[Z], name='test_einsum_batch_matmul')
diff --git a/onnx/backend/test/case/node/split.py b/onnx/backend/test/case/node/split.py
index fac19ee9f79..028b834764c 100644
--- a/onnx/backend/test/case/node/split.py
+++ b/onnx/backend/test/case/node/split.py
@@ -90,3 +90,18 @@ def export_default_values(): # type: () -> None
expected_outputs = [np.array([1., 2.]).astype(np.float32), np.array([3., 4., 5., 6.]).astype(np.float32)]
expect(node, inputs=[input], outputs=[y for y in expected_outputs], name='test_split_variable_parts_default_axis')
+
+ @staticmethod
+ def export_zero_size_splits(): # type: () -> None
+ input = np.array([]).astype(np.float32)
+
+ # Split empty tensor to tensors of size zero
+ node = onnx.helper.make_node(
+ 'Split',
+ inputs=['input'],
+ outputs=['output_1', 'output_2', 'output_3'],
+ split=[0, 0, 0]
+ )
+
+ expected_outputs = [np.array([]).astype(np.float32), np.array([]).astype(np.float32), np.array([]).astype(np.float32)]
+ expect(node, inputs=[input], outputs=[y for y in expected_outputs], name='test_split_zero_size_splits')
diff --git a/onnx/backend/test/data/node/test_einsum_batch_diagonal/model.onnx b/onnx/backend/test/data/node/test_einsum_batch_diagonal/model.onnx
new file mode 100644
index 00000000000..843c949d206
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_batch_diagonal/model.onnx
@@ -0,0 +1,13 @@
+backend-test:w
++
+xy"Einsum*
+equation"...ii ->...itest_einsum_batch_diagonalZ
+x
+
+
+
+b
+y
+
+
+B
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_batch_diagonal/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_einsum_batch_diagonal/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..5e8787cd07a
Binary files /dev/null and b/onnx/backend/test/data/node/test_einsum_batch_diagonal/test_data_set_0/input_0.pb differ
diff --git a/onnx/backend/test/data/node/test_einsum_batch_diagonal/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_einsum_batch_diagonal/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..e4e63971526
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_batch_diagonal/test_data_set_0/output_0.pb
@@ -0,0 +1 @@
+ByJx9?2g?kZ?M ?ſu(@
:
E3ꐑ3?$` biktest_einsum_batch_matmulZ
+x
+
+
+
+Z
+y
+
+
+
+b
+z
+
+
+
+B
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..5b93576069c
Binary files /dev/null and b/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/input_0.pb differ
diff --git a/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/input_1.pb
new file mode 100644
index 00000000000..e5944834113
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/input_1.pb
@@ -0,0 +1,3 @@
+ByJ6Q?3ꐑ3?ʣhx0WE)Q1DֿN6>?EGE?$``hkؿ͢XӿSdQy]~d`LB`6?6Oe֨h ܿn%sQht3?;˿"_}쿚±??X+TۜBi?E?90[?]Lz4&7˄忦\CgZKvڞO7[?JcǶٿfJ)w9?W
+>@?)T?e?tR6:?*nL'?zn⿈Fzӿ>?(?}(t|ך?gi?^sBtrH?D~U?|-G?
+ǿ7!K&?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..8d382df70a0
Binary files /dev/null and b/onnx/backend/test/data/node/test_einsum_batch_matmul/test_data_set_0/output_0.pb differ
diff --git a/onnx/backend/test/data/node/test_einsum_inner_prod/model.onnx b/onnx/backend/test/data/node/test_einsum_inner_prod/model.onnx
new file mode 100644
index 00000000000..71095465c6c
Binary files /dev/null and b/onnx/backend/test/data/node/test_einsum_inner_prod/model.onnx differ
diff --git a/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..60ec8162b62
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ(9?S,?"RQ?N1iY@=|?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/input_1.pb
new file mode 100644
index 00000000000..9869f3dd629
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/input_1.pb
@@ -0,0 +1 @@
+ByJ(BE2g?6I_ÿYlf)g>G?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..a9dda2a61dc
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_inner_prod/test_data_set_0/output_0.pb
@@ -0,0 +1 @@
+BzJPR<
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_sum/model.onnx b/onnx/backend/test/data/node/test_einsum_sum/model.onnx
new file mode 100644
index 00000000000..fd3cd270bf6
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_sum/model.onnx
@@ -0,0 +1,12 @@
+backend-test:]
+$
+xy"Einsum*
+equation"ij->itest_einsum_sumZ
+x
+
+
+b
+y
+
+
+B
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_sum/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_einsum_sum/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..86fd72d2a8c
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_sum/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ`9?S,?"RQ?N1iY@=|?BE2g?6I_ÿYlf)g>G?
p?KD?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_sum/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_einsum_sum/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..a69ca927987
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_sum/test_data_set_0/output_0.pb
@@ -0,0 +1 @@
+ByJ[.܋
@R0?H!@ڻ}?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_transpose/model.onnx b/onnx/backend/test/data/node/test_einsum_transpose/model.onnx
new file mode 100644
index 00000000000..051b7ec0fa7
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_transpose/model.onnx
@@ -0,0 +1,12 @@
+backend-test:h
+%
+xy"Einsum*
+equation"ij->jitest_einsum_transposeZ
+x
+
+
+b
+y
+
+
+B
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_transpose/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_einsum_transpose/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..86fd72d2a8c
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_transpose/test_data_set_0/input_0.pb
@@ -0,0 +1 @@
+BxJ`9?S,?"RQ?N1iY@=|?BE2g?6I_ÿYlf)g>G?
p?KD?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_einsum_transpose/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_einsum_transpose/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..22d7af00ba9
--- /dev/null
+++ b/onnx/backend/test/data/node/test_einsum_transpose/test_data_set_0/output_0.pb
@@ -0,0 +1 @@
+ByJ`9?=|?YlS,?BEf)g>G?"RQ?2g?
p?N1iY@6I_ÿKD?
\ No newline at end of file
diff --git a/onnx/backend/test/data/node/test_split_zero_size_splits/model.onnx b/onnx/backend/test/data/node/test_split_zero_size_splits/model.onnx
new file mode 100644
index 00000000000..163f43d9409
Binary files /dev/null and b/onnx/backend/test/data/node/test_split_zero_size_splits/model.onnx differ
diff --git a/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..16d4ac7a398
Binary files /dev/null and b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/input_0.pb differ
diff --git a/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..14aa56f56b6
Binary files /dev/null and b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_0.pb differ
diff --git a/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_1.pb b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_1.pb
new file mode 100644
index 00000000000..2f700936b76
Binary files /dev/null and b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_1.pb differ
diff --git a/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_2.pb b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_2.pb
new file mode 100644
index 00000000000..f45e84e61dc
Binary files /dev/null and b/onnx/backend/test/data/node/test_split_zero_size_splits/test_data_set_0/output_2.pb differ
diff --git a/onnx/backend/test/data/simple/test_sequence_model8/model.onnx b/onnx/backend/test/data/simple/test_sequence_model8/model.onnx
new file mode 100644
index 00000000000..b6bf227d498
Binary files /dev/null and b/onnx/backend/test/data/simple/test_sequence_model8/model.onnx differ
diff --git a/onnx/backend/test/data/simple/test_sequence_model8/test_data_set_0/input_0.pb b/onnx/backend/test/data/simple/test_sequence_model8/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..65193ec95dc
Binary files /dev/null and b/onnx/backend/test/data/simple/test_sequence_model8/test_data_set_0/input_0.pb differ
diff --git a/onnx/backend/test/data/simple/test_sequence_model8/test_data_set_0/output_0.pb b/onnx/backend/test/data/simple/test_sequence_model8/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..8a36bae46ae
Binary files /dev/null and b/onnx/backend/test/data/simple/test_sequence_model8/test_data_set_0/output_0.pb differ
diff --git a/onnx/defs/math/defs.cc b/onnx/defs/math/defs.cc
index d5213be0582..d0908f602c8 100644
--- a/onnx/defs/math/defs.cc
+++ b/onnx/defs/math/defs.cc
@@ -2,6 +2,7 @@
// Licensed under the MIT license.
#include
+#include
#include "onnx/defs/schema.h"
#include "onnx/defs/tensor_proto_util.h"
@@ -1665,4 +1666,144 @@ ONNX_OPERATOR_SET_SCHEMA(
}
}));
+void einsumRankInference(
+ ONNX_NAMESPACE::InferenceContext& ctx, std::string equation) {
+
+ const size_t numInputs = ctx.getNumInputs();
+ if (numInputs < 1 || !hasNInputShapes(ctx, static_cast(numInputs))) {
+ return;
+ }
+
+ auto* output_shape = getOutputShape(ctx, 0);
+ std::string left_equation;
+
+ equation.erase(std::remove(equation.begin(), equation.end(), ' '), equation.end()); // Remove space char
+ auto mid_index = equation.find("->");
+ if (mid_index != std::string::npos) {
+ // Separate right and left hand sides of the equation
+ left_equation = equation.substr(0, mid_index);
+ } else {
+ // No right hand side
+ left_equation = equation;
+ }
+
+ std::string term;
+ size_t num_operands = 0;
+ size_t num_ellipsis = 0;
+ size_t num_ellipsis_indices = 0;
+
+ // Parse the left-hand side
+ std::stringstream str(left_equation);
+ while(std::getline(str, term, ',')) {
+ auto ellipsis_index = term.find("...");
+ if (ellipsis_index != std::string::npos) {
+ if (numInputs <= num_operands) {
+ fail_shape_inference("Number of input tensors does not match the operands in the equation.");
+ }
+ // If there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions
+ size_t rank = ctx.getInputType(num_operands)->tensor_type().shape().dim_size();
+ if (num_ellipsis == 0) {
+ num_ellipsis_indices = rank - term.size() + 3;
+ } else { // ellipsis has been seen before. Check that the dimensions are compatible
+ if (num_ellipsis_indices != rank - term.size() + 3) {
+ fail_shape_inference("Ellipsis represents incompatible dimensions.");
+ }
+ }
+ num_ellipsis++;
+ }
+ num_operands++;
+ }
+
+ if (numInputs != num_operands) {
+ fail_shape_inference("Number of input tensors does not match the operands in the equation.");
+ }
+
+ const size_t number_of_letters = 26;
+ size_t num_letter_occurrences[number_of_letters] = {0};
+ // Parse the provided right-hand side
+ if (mid_index != std::string::npos) {
+ std::string right_equation = equation.substr(mid_index + 2);
+ auto right_ellipsis_index = right_equation.find("...");
+ if (right_ellipsis_index != std::string::npos) { // Right-hand side contains ellipsis
+ for (size_t i = 0; i < num_ellipsis; ++i) {
+ output_shape->add_dim();
+ }
+ }
+ for (char c: right_equation) { // Add a dimension per each character in right hand equation
+ if (c != '.') {
+ output_shape->add_dim();
+ }
+ }
+ } else { // Infer the dimension for right-hand side
+ // If there's an ellipsis, add its corresponding dimensions
+ for (size_t i = 0; i < num_ellipsis_indices; i++) {
+ output_shape->add_dim();
+ }
+ for (size_t i = 0; i < left_equation.size(); i++) { // Count chars that appear exactly once on left hand side
+ if ((left_equation.at(i) != ',') && (left_equation.at(i) != '.')) {
+ num_letter_occurrences[left_equation.at(i) - 'a']++;
+ }
+ }
+ for (size_t index = 0; index < number_of_letters; index++) {
+ if (num_letter_occurrences[index] == 1) {
+ output_shape->add_dim();
+ }
+ }
+ }
+}
+
+static const char* Einsum_ver12_doc = R"DOC(
+An einsum of the form ```term1, term2 -> output-term``` produces an output tensor using the following equation
+
+```output[output-term] = reduce-sum( input1[term1] * input2[term2] )```
+
+where the reduce-sum performs a summation over all the indices occurring in the input terms (term1, term2)
+that do not occur in the output-term.
+
+The Einsum operator evaluates algebraic tensor operations on a sequence of tensors, using the Einstein summation
+convention. The equation string contains a comma-separated sequence of lower case letters. Each term corresponds to
+an operand tensor, and the characters within the terms correspond to operands dimensions.
+
+This sequence may be followed by "->" to separate the left and right hand side of the equation.
+If the equation contains "->" followed by the right-hand side, the explicit (not classical) form of the Einstein
+summation is performed, and the right-hand side indices indicate output tensor dimensions. In other cases,
+output indices are (implicitly) set to the alphabetically sorted sequence of indices appearing exactly once in the
+equation.
+
+When a dimension character is repeated in the left-hand side, it represents summation along the dimension.
+
+The equation may contain ellipsis ("...") to enable broadcasting. Ellipsis must indicate a fixed number of dimensions.
+The right-hand side may contain exactly one ellipsis. In implicit mode, the ellipsis dimensions are set to the
+beginning of the output. The equation string may contain space (U+0020) character.
+)DOC";
+
+ONNX_OPERATOR_SET_SCHEMA(
+ Einsum,
+ 12,
+ OpSchema()
+ .SetDoc(Einsum_ver12_doc)
+ .Attr(
+ "equation",
+ "Einsum expression string.",
+ AttributeProto::STRING)
+ .Input(0,
+ "Inputs",
+ "Operands",
+ "T",
+ OpSchema::Variadic)
+ .Output(0, "Output", "Output tensor", "T")
+ .TypeConstraint(
+ "T",
+ OpSchema::all_numeric_types(),
+ "Constrain input and output types to all numerical tensor types.")
+ .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
+ // Type inference
+ propagateElemTypeFromInputToOutput(ctx, 0, 0);
+ std::string equation = getAttribute(ctx, "equation", "");
+ if (equation.compare("") == 0) {
+ return;
+ }
+ einsumRankInference(ctx, equation);
+ }));
+
} // namespace ONNX_NAMESPACE
diff --git a/onnx/defs/operator_sets.h b/onnx/defs/operator_sets.h
index fce820ed7ce..97fab52439d 100644
--- a/onnx/defs/operator_sets.h
+++ b/onnx/defs/operator_sets.h
@@ -713,6 +713,7 @@ class OpSet_Onnx_ver11 {
// Forward declarations for ai.onnx version 12
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ArgMax);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ArgMin);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, Einsum);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, MaxPool);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ReduceMax);
class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ReduceMin);
@@ -724,6 +725,7 @@ class OpSet_Onnx_ver12 {
 static void ForEachSchema(std::function<void(OpSchema&&)> fn) {
 fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ArgMax)>());
 fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ArgMin)>());
+ fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, Einsum)>());
 fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, MaxPool)>());
 fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ReduceMax)>());
 fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 12, ReduceMin)>());
diff --git a/onnx/defs/sequence/defs.cc b/onnx/defs/sequence/defs.cc
index ed12f763ccc..571729cfb7b 100644
--- a/onnx/defs/sequence/defs.cc
+++ b/onnx/defs/sequence/defs.cc
@@ -346,8 +346,7 @@ ONNX_OPERATOR_SET_SCHEMA(
1,
"split",
"Length of each output. "
- "It can be either a scalar(tensor of empty shape), or a 1-D tensor. "
- "All values must be positive. ",
+ "It can be either a scalar(tensor of empty shape), or a 1-D tensor. All values must be >= 0. ",
"I",
OpSchema::Optional)
.Output(
diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc
index 880125cdf90..fa21f7fc9f4 100644
--- a/onnx/defs/tensor/defs.cc
+++ b/onnx/defs/tensor/defs.cc
@@ -399,7 +399,7 @@ ONNX_OPERATOR_SET_SCHEMA(
"where r = rank(input).",
AttributeProto::INT,
 static_cast<int64_t>(0))
- .Attr("split", "length of each output", AttributeProto::INTS, OPTIONAL)
+ .Attr("split", "length of each output. Values should be >= 0.", AttributeProto::INTS, OPTIONAL)
.SetDoc(Split_ver11_doc)
.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
 for (int i = 0; i < static_cast<int>(ctx.getNumOutputs()); ++i) {
diff --git a/onnx/onnx-ml.proto b/onnx/onnx-ml.proto
index 9067da40f9c..2647f5fc896 100644
--- a/onnx/onnx-ml.proto
+++ b/onnx/onnx-ml.proto
@@ -238,11 +238,15 @@ message NodeProto {
// nodes cannot produce values with names identical to inference
// initializers.
message TrainingInfoProto {
- // This field is a graph that describes a way to compute the initial tensors
+ // This field describes a graph to compute the initial tensors
// upon starting the training process. Initialization graph has no input
// and can have multiple outputs. Assigning those outputs to the associated
// initializers follows the relation defined by update_binding as if
- // TrainingInfoProto.initialization is another training algorithm.
+ // TrainingInfoProto.initialization is another training algorithm. Usually,
+ // trainable tensors in neural networks are randomly initialized.
+ // To achieve that, for each tensor, the user can put a random operator such
+ // as RandomNormal or RandomUniform in TrainingInfoProto.initialization.node
+ // and assign its random output to the specific tensor using "update_binding".
//
// By default, this field is an empty graph and its evaluation does not
// produce any output.
diff --git a/onnx/onnx-ml.proto3 b/onnx/onnx-ml.proto3
index c9d3d55af03..886fe358a46 100644
--- a/onnx/onnx-ml.proto3
+++ b/onnx/onnx-ml.proto3
@@ -238,11 +238,15 @@ message NodeProto {
// nodes cannot produce values with names identical to inference
// initializers.
message TrainingInfoProto {
- // This field is a graph that describes a way to compute the initial tensors
+ // This field describes a graph to compute the initial tensors
// upon starting the training process. Initialization graph has no input
// and can have multiple outputs. Assigning those outputs to the associated
// initializers follows the relation defined by update_binding as if
- // TrainingInfoProto.initialization is another training algorithm.
+ // TrainingInfoProto.initialization is another training algorithm. Usually,
+ // trainable tensors in neural networks are randomly initialized.
+ // To achieve that, for each tensor, the user can put a random operator such
+ // as RandomNormal or RandomUniform in TrainingInfoProto.initialization.node
+ // and assign its random output to the specific tensor using "update_binding".
//
// By default, this field is an empty graph and its evaluation does not
// produce any output.
diff --git a/onnx/onnx.in.proto b/onnx/onnx.in.proto
index 0fbaffc7c08..0d823f0deed 100644
--- a/onnx/onnx.in.proto
+++ b/onnx/onnx.in.proto
@@ -235,11 +235,15 @@ message NodeProto {
// nodes cannot produce values with names identical to inference
// initializers.
message TrainingInfoProto {
- // This field is a graph that describes a way to compute the initial tensors
+ // This field describes a graph to compute the initial tensors
// upon starting the training process. Initialization graph has no input
// and can have multiple outputs. Assigning those outputs to the associated
// initializers follows the relation defined by update_binding as if
- // TrainingInfoProto.initialization is another training algorithm.
+ // TrainingInfoProto.initialization is another training algorithm. Usually,
+ // trainable tensors in neural networks are randomly initialized.
+ // To achieve that, for each tensor, the user can put a random operator such
+ // as RandomNormal or RandomUniform in TrainingInfoProto.initialization.node
+ // and assign its random output to the specific tensor using "update_binding".
//
// By default, this field is an empty graph and its evaluation does not
// produce any output.
diff --git a/onnx/onnx.proto b/onnx/onnx.proto
index 14890e265e0..a4068b1b10d 100644
--- a/onnx/onnx.proto
+++ b/onnx/onnx.proto
@@ -236,11 +236,15 @@ message NodeProto {
// nodes cannot produce values with names identical to inference
// initializers.
message TrainingInfoProto {
- // This field is a graph that describes a way to compute the initial tensors
+ // This field describes a graph to compute the initial tensors
// upon starting the training process. Initialization graph has no input
// and can have multiple outputs. Assigning those outputs to the associated
// initializers follows the relation defined by update_binding as if
- // TrainingInfoProto.initialization is another training algorithm.
+ // TrainingInfoProto.initialization is another training algorithm. Usually,
+ // trainable tensors in neural networks are randomly initialized.
+ // To achieve that, for each tensor, the user can put a random operator such
+ // as RandomNormal or RandomUniform in TrainingInfoProto.initialization.node
+ // and assign its random output to the specific tensor using "update_binding".
//
// By default, this field is an empty graph and its evaluation does not
// produce any output.
diff --git a/onnx/onnx.proto3 b/onnx/onnx.proto3
index d5e1a5e6d63..17766591952 100644
--- a/onnx/onnx.proto3
+++ b/onnx/onnx.proto3
@@ -236,11 +236,15 @@ message NodeProto {
// nodes cannot produce values with names identical to inference
// initializers.
message TrainingInfoProto {
- // This field is a graph that describes a way to compute the initial tensors
+ // This field describes a graph to compute the initial tensors
// upon starting the training process. Initialization graph has no input
// and can have multiple outputs. Assigning those outputs to the associated
// initializers follows the relation defined by update_binding as if
- // TrainingInfoProto.initialization is another training algorithm.
+ // TrainingInfoProto.initialization is another training algorithm. Usually,
+ // trainable tensors in neural networks are randomly initialized.
+ // To achieve that, for each tensor, the user can put a random operator such
+ // as RandomNormal or RandomUniform in TrainingInfoProto.initialization.node
+ // and assign its random output to the specific tensor using "update_binding".
//
// By default, this field is an empty graph and its evaluation does not
// produce any output.
diff --git a/onnx/test/shape_inference_test.py b/onnx/test/shape_inference_test.py
index 012f3f750df..8e4ff8c35ca 100644
--- a/onnx/test/shape_inference_test.py
+++ b/onnx/test/shape_inference_test.py
@@ -2746,6 +2746,52 @@ def test_gatherelements_indices_missing_shape(self): # type: () -> None
[])
self._assert_inferred(graph, [make_tensor_value_info('y', TensorProto.FLOAT, None)]) # type: ignore
+ def test_einsum_transpose(self): # type: () -> None
+ graph = self._make_graph(
+ [('x', TensorProto.FLOAT, (3, 4))],
+ [make_node('Einsum', ['x'], ['y'], equation='ij->ji')],
+ [],)
+ self._assert_inferred(graph, [make_tensor_value_info('y', TensorProto.FLOAT, (None, None))]) # type: ignore
+
+ def test_einsum_sum_along_dim(self): # type: () -> None
+ graph = self._make_graph(
+ [('x', TensorProto.FLOAT, (3, 4))],
+ [make_node('Einsum', ['x'], ['y'], equation='i j->i ')],
+ [],)
+ self._assert_inferred(graph, [make_tensor_value_info('y', TensorProto.FLOAT, (None, ))]) # type: ignore
+
+ def test_einsum_ellipsis(self): # type: () -> None
+ graph = self._make_graph(
+ [('x', TensorProto.FLOAT, (3, 4))],
+ [make_node('Einsum', ['x'], ['y'], equation='... ii ->... i')],
+ [],)
+ self._assert_inferred(graph, [make_tensor_value_info('y', TensorProto.FLOAT, (None, None))]) # type: ignore
+
+ def test_einsum_batch_matmul(self): # type: () -> None
+ graph = self._make_graph(
+ [('x', TensorProto.FLOAT, (5, 2, 3)),
+ ('y', TensorProto.FLOAT, (5, 3, 4))],
+ [make_node('Einsum', ['x', 'y'], ['z'], equation='bij , b jk-> bik')],
+ [],)
+ self._assert_inferred(graph, [make_tensor_value_info('z', TensorProto.FLOAT, (None, None, None))]) # type: ignore
+
+ def test_einsum_left_hand_eqn(self): # type: () -> None
+ graph = self._make_graph(
+ [('x', TensorProto.FLOAT, (2, 3)),
+ ('y', TensorProto.FLOAT, (3, 4))],
+ [make_node('Einsum', ['x', 'y'], ['z'], equation='ij,kl')],
+ [],)
+ self._assert_inferred(graph, [make_tensor_value_info('z', TensorProto.FLOAT, (None, None, None, None))]) # type: ignore
+
+ def test_einsum_incorrect_num_inputs(self): # type: () -> None
+ graph = self._make_graph(
+ [("x", TensorProto.FLOAT, (2, 3)),
+ ("y", TensorProto.FLOAT, (2, 3)),
+ ("z", TensorProto.FLOAT, (2, 3))],
+ [make_node('Einsum', ['x', 'y'], ['z'], equation='i,...j, k, l-> i')],
+ [])
+ self.assertRaises(checker.ValidationError, self._inferred, graph)
+
if __name__ == '__main__':
unittest.main()