diff --git a/docs/Changelog.md b/docs/Changelog.md index 8f253ed0edc..8107fbfd86c 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -4629,7 +4629,7 @@ This version of the operator has been available since version 2 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -10750,7 +10750,7 @@ This version of the operator has been available since version 11 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(float16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -14497,7 +14497,7 @@ This version of the operator has been available since version 12 of the default ``` output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` @@ -15564,7 +15564,7 @@ This version of the operator has been available since version 13 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -18593,7 +18593,7 @@ This version of the operator has been available since version 13 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -22627,7 +22627,7 @@ This version of the operator has been available since version 18 of the default ``` output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` @@ -24673,9 +24673,9 @@ This version of the operator has been available since version 21 of the default #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain input types. Casting from complex is not supported.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain output types. Casting to complex is not supported.
@@ -25878,6 +25878,2357 @@ This version of the operator has been available since version 21 of the default
Constrain input and output types to all tensor types up to IRv10.
+## Version 22 of the default ONNX operator set +### **Acos-22** + + Calculates the arccosine (inverse of cosine) of the given input tensor, element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The arccosine of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Acosh-22** + + Calculates the hyperbolic arccosine of the given input tensor element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The hyperbolic arccosine values of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Asin-22** + + Calculates the arcsine (inverse of sine) of the given input tensor, element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The arcsine of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Asinh-22** + + Calculates the hyperbolic arcsine of the given input tensor element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The hyperbolic arcsine values of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Atan-22** + + Calculates the arctangent (inverse of tangent) of the given input tensor, element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The arctangent of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Atanh-22** + + Calculates the hyperbolic arctangent of the given input tensor element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The hyperbolic arctangent values of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **AveragePool-22** + + AveragePool consumes an input tensor X and applies average pooling across + the tensor according to kernel sizes, stride sizes, and pad lengths. + average pooling consisting of computing the average on all values of a + subset of the input tensor according to the kernel size and downsampling the + data into the output tensor Y for further processing. The output spatial shape is calculated differently + depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. + With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + ``` + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + ``` + or + ``` + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + ``` + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. + + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + ``` + VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): + ``` + VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: + ``` + pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] + ``` + The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero). + + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
auto_pad : string (default is NOTSET)
+
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+
ceil_mode : int (default is 0)
+
Whether to use ceil or floor (default) to compute the output shape.
+
count_include_pad : int (default is 0)
+
Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.
+
dilations : list of ints
+
Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.
+
kernel_shape : list of ints (required)
+
The size of the kernel along each axis.
+
pads : list of ints
+
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+
strides : list of ints
+
Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor from average or max pooling across the input tensor. Dimensions will vary based on various kernel, stride, and pad sizes. Floor value of the dimension is used
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Bernoulli-22** + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. The input tensor should be a tensor + containing probabilities p (a value in the range [0,1]) to be used for drawing the binary random number, + where an output of 1 is produced with probability p and an output of 0 is produced with probability (1-p). + + This operator is non-deterministic and may not produce the same values in different + implementations (even if a seed is specified). + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int
+
The data type for the elements of the output tensor. if not specified, we will use the data type of the input tensor.
+
seed : float
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
+ +#### Inputs + +
+
input : T1
+
All values in input have to be in the range:[0, 1].
+
+ +#### Outputs + +
+
output : T2
+
The returned output tensor only has values 0 or 1, same shape as input tensor.
+
+ +#### Type Constraints + +
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input types to float tensors.
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(bool)
+
Constrain output types to all numeric tensors and bool tensors.
+
+ +### **Conv-22** + + The convolution operator consumes an input tensor and a filter, and + computes the output. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
auto_pad : string (default is NOTSET)
+
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+
dilations : list of ints
+
dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.
+
group : int (default is 1)
+
number of groups input channels and output channels are divided into.
+
kernel_shape : list of ints
+
The shape of the convolution kernel. If not present, should be inferred from input W.
+
pads : list of ints
+
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+
strides : list of ints
+
Stride along each spatial axis. If not present, the stride defaults is 1 along each spatial axis.
+
+ +#### Inputs (2 - 3) + +
+
X (differentiable) : T
+
Input data tensor from previous layer; has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that this is for the 2D image. Otherwise the size is (N x C x D1 x D2 ... x Dn). Optionally, if dimension denotation is in effect, the operation expects input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+
W (differentiable) : T
+
The weight tensor that will be used in the convolutions; has size (M x C/group x kH x kW), where C is the number of channels, and kH and kW are the height and width of the kernel, and M is the number of feature maps. For more than 2 dimensions, the kernel shape will be (M x C/group x k1 x k2 x ... x kn), where (k1 x k2 x ... kn) is the dimension of the kernel. Optionally, if dimension denotation is in effect, the operation expects the weight tensor to arrive with the dimension denotation of [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. Assuming zero based indices for the shape array, X.shape[1] == (W.shape[1] * group) == C and W.shape[0] mod G == 0. Or in other words FILTER_IN_CHANNEL multiplied by the number of groups should be equal to DATA_CHANNEL and the number of feature maps M should be a multiple of the number of groups G.
+
B (optional, differentiable) : T
+
Optional 1D bias to be added to the convolution, has size of M.
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor that contains the result of the convolution. The output dimensions are functions of the kernel size, stride size, and pad lengths.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **ConvTranspose-22** + + The convolution transpose operator consumes an input tensor and a filter, + and computes the output. + + If the pads parameter is provided the shape of the output is calculated via the following equation: + + output_shape[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - pads[start_i] - pads[end_i] + + output_shape can also be explicitly specified in which case pads values are auto generated using these equations: + + total_padding[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i] + If (auto_pads == SAME_UPPER): pads[start_i] = total_padding[i]/2; pads[end_i] = total_padding[i] - (total_padding[i]/2) + Else: pads[start_i] = total_padding[i] - (total_padding[i]/2); pads[end_i] = (total_padding[i]/2). + + + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
auto_pad : string (default is NOTSET)
+
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = input_shape[i] * strides[i]` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+
dilations : list of ints
+
dilation value along each spatial axis of the filter. If not present, the dilation defaults to 1 along each spatial axis.
+
group : int (default is 1)
+
number of groups input channels and output channels are divided into.
+
kernel_shape : list of ints
+
The shape of the convolution kernel. If not present, should be inferred from input W.
+
output_padding : list of ints
+
Additional elements added to the side with higher coordinate indices in the output. Each padding value in "output_padding" must be less than the corresponding stride/dilation dimension. By default, this attribute is a zero vector. Note that this attribute doesn't directly affect the computed output values. It only controls the selection of the computed values, so changing this attribute only adds or removes output elements. If "output_shape" is explicitly provided, "output_padding" does not contribute additional size to "output_shape" but participates in the computation of the needed padding amount. This is also called adjs or adjustment in some frameworks.
+
output_shape : list of ints
+
The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified pads values are ignored. See doc for details for equations to generate pads. Note that the output_shape attribute value should not include dimensions for batch size and channels, which are automatically inferred.
+
pads : list of ints
+
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+
strides : list of ints
+
Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+
+ +#### Inputs (2 - 3) + +
+
X (differentiable) : T
+
Input data tensor from previous layer; has size (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and width. Note that this is for the 2D image. Otherwise the size is (N x C x D1 x D2 ... x Dn)
+
W (differentiable) : T
+
The weight tensor that will be used in the convolutions; has size (C x M/group x kH x kW), where C is the number of channels, and kH and kW are the height and width of the kernel, and M is the number of feature maps. For more than 2 dimensions, the weight shape will be (C x M/group x k1 x k2 x ... x kn), where (k1 x k2 x ... x kn) is the dimension of the kernel. The number of channels in the output should be equal to W.shape[1] * group (assuming zero based indices of the shape array)
+
B (optional, differentiable) : T
+
Optional 1D bias to be added to the convolution, has size of M.
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor that contains the result of the convolution. The output dimensions are functions of the kernel size, stride size, pad lengths and group count. The number of channels in the output should be equal to W.shape[1] * group (assuming zero based indices of the shape array)
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Cos-22** + + Calculates the cosine of the given input tensor, element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The cosine of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Cosh-22** + + Calculates the hyperbolic cosine of the given input tensor element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The hyperbolic cosine values of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **DeformConv-22** + + Performs deformable convolution as described in https://arxiv.org/abs/1703.06211 and https://arxiv.org/abs/1811.11168. + This operator specification supports the general N-D case. Note that most common use cases have 2D or 3D data. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dilations : list of ints
+
Dilation value along each spatial axis of the kernel. Default is 1 along each axis.
+
group : int (default is 1)
+
Number of groups the input and output channels, C and oC, are divided into. C and oC must both be divisible by group. Default is 1.
+
kernel_shape : list of ints
+
Shape of the convolution kernel. If not present, it is inferred from the shape of input W.
+
offset_group : int (default is 1)
+
Number of groups of offset. C must be divisible by offset_group. Default is 1.
+
pads : list of ints
+
Padding for the beginning and end along each spatial axis. The values represent the number of pixels added to the beginning and end of the corresponding axis and can take any nonnegative value. The format should be as follows: [x1_begin, x2_begin, ..., x1_end, x2_end, ...], where xi_begin is the number of pixels added at the beginning of axis `i` and xi_end is the number of pixels added at the end of axis `i`. Default is 0 along each axis.
+
strides : list of ints
+
Stride along each spatial axis. Default is 1 along each axis.
+
+ +#### Inputs (3 - 5) + +
+
X : T
+
Input data tensor. For 2D image data, it has shape (N, C, H, W) where N is the batch size, C is the number of input channels, and H and W are the height and width. In general, the shape is (N, C, D1, D2, ... , Dn) for n-dimensional data, where D1 to Dn are the spatial dimension sizes. Most common use cases have n = 2 or 3.
+
W : T
+
Weight tensor that will be used in the convolutions. It has shape (oC, C/group, kH, kW), where oC is the number of output channels and kH and kW are the kernel height and width. For more than 2 dimensions, it has shape (oC, C/group, k1, k2, ... , kn).
+
offset : T
+
Offset tensor denoting the offset for the sampling locations in the convolution kernel. It has shape (N, offset_group * kH * kW * 2, oH, oW) for 2D data or (N, offset_group * k1 * k2 * ... * kn * n, o1, o2, ... , on) for nD data. Use linear interpolationfor fractional offset values. Sampling locations outside of the padded input tensor gives zero.
+
B (optional) : T
+
Optional 1D bias of length oC to be added to the convolution. Default is a tensor of zeros.
+
mask (optional) : T
+
The mask tensor to be applied to each position in the convolution kernel. It has shape (N, offset_group * kH * kW, oH, oW) for 2D data or (N, offset_group * k1 * k2 * ... * kn * n, o1, o2, ... , on) for nD data. Default is a tensor of ones.
+
+ +#### Outputs + +
+
Y : T
+
Output data tensor that contains the result of convolution. It has shape (N, oC, oH, oW) for 2D data or (N, oC, o1, o2, ..., on) for nD data
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Det-22** + + Det calculates determinant of a square matrix or batches of square matrices. + Det takes one input tensor of shape `[*, M, M]`, where `*` is zero or more batch dimensions, + and the inner-most 2 dimensions form square matrices. + The output is a tensor of shape `[*]`, containing the determinants of all input submatrices. + e.g., When the input is 2-D, the output is a scalar(shape is empty: `[]`). + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to floating-point tensors.
+
+ +### **Dropout-22** + + Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs, + output (floating-point tensor) and mask (optional `Tensor`). If `training_mode` is true then the output Y will be a random dropout; + Note that this Dropout scales the masked input data by the following equation, so to convert the trained model into inference mode, + the user can simply not pass `training_mode` input or set it to false. + ``` + output = scale * data * mask, + ``` + where + ``` + scale = 1. / (1. - ratio). + ``` + This operator has **optional** inputs/outputs. See [the doc](IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
seed : int
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
+ +#### Inputs (1 - 3) + +
+
data (differentiable) : T
+
The input data as Tensor.
+
ratio (optional, non-differentiable) : T1
+
The ratio of random dropout, with value in [0, 1). If this input was not set, or if it was set to 0, the output would be a simple copy of the input. If it's non-zero, output will be a random dropout of the scaled input, which is typically the case during training. It is an optional value, if not specified it will default to 0.5.
+
training_mode (optional, non-differentiable) : T2
+
If set to true then it indicates dropout is being used for training. It is an optional value hence unless specified explicitly, it is false. If it is false, ratio is ignored and the operation mimics inference mode where nothing will be dropped from the input data and if mask is requested as output it will contain all ones.
+
+ +#### Outputs (1 - 2) + +
+
output (differentiable) : T
+
The output.
+
mask (optional, non-differentiable) : T2
+
The output mask.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
Constrain input and output types to float tensors.
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
Constrain input 'ratio' types to float tensors.
+
T2 : tensor(bool)
+
Constrain output 'mask' types to boolean tensors.
+
+ +### **Elu-22** + + Elu takes one input data (Tensor) and produces one output data + (Tensor) where the function `f(x) = alpha * (exp(x) - 1.) for x < + 0`, `f(x) = x for x >= 0`., is applied to the tensor elementwise. + + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
alpha : float (default is 1.0)
+
Coefficient of ELU.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
1D input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
1D output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **EyeLike-22** + + Generate a 2D tensor (matrix) with ones on the diagonal and zeros everywhere else. Only 2D + tensors are supported, i.e. input T1 must be of rank 2. The shape of the output tensor is the + same as the input tensor. The data type can be specified by the 'dtype' argument. If + 'dtype' is not specified, then the type of input tensor is used. By default, the main diagonal + is populated with ones, but attribute 'k' can be used to populate upper or lower diagonals. + The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the + TensorProto message and be valid as an output type. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int
+
(Optional) The data type for the elements of the output tensor. If not specified, the data type of the input tensor T1 is used. If input tensor T1 is also not specified, then type defaults to 'float'.</dd>
+
k : int (default is 0)
+
(Optional) Index of the diagonal to be populated with ones. Default is 0. If T2 is the output, this op sets T2[i, i+k] = 1. k = 0 populates the main diagonal, k > 0 populates an upper diagonal, and k < 0 populates a lower diagonal.
+
+ +#### Inputs + +
+
input : T1
+
2D input tensor to copy shape, and optionally, type information from.
+
+ +#### Outputs + +
+
output : T2
+
Output tensor, same shape as input tensor T1.
+
+ +#### Type Constraints + +
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(bool)
+
Constrain input types. Strings and complex are not supported.
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(bool)
+
Constrain output types. Strings and complex are not supported.
+
+ +### **GRU-22** + + Computes an one-layer GRU. This operator is usually supported via some custom + implementation such as CuDNN. + + Notations: + + * `X` - input tensor + * `z` - update gate + * `r` - reset gate + * `h` - hidden gate + * `t` - time step (t-1 means previous time step) + * `W[zrh]` - W parameter weight matrix for update, reset, and hidden gates + * `R[zrh]` - R recurrence weight matrix for update, reset, and hidden gates + * `Wb[zrh]` - W bias vectors for update, reset, and hidden gates + * `Rb[zrh]` - R bias vectors for update, reset, and hidden gates + * `WB[zrh]` - W parameter weight matrix for backward update, reset, and hidden gates + * `RB[zrh]` - R recurrence weight matrix for backward update, reset, and hidden gates + * `WBb[zrh]` - W bias vectors for backward update, reset, and hidden gates + * `RBb[zrh]` - R bias vectors for backward update, reset, and hidden gates + * `H` - Hidden state + * `num_directions` - 2 if direction == bidirectional else 1 + + Activation functions: + + * Relu(x) - max(0, x) + * Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) + * Sigmoid(x) - 1/(1 + e^{-x}) + + NOTE: + Below are optional + + * Affine(x) - alpha * x + beta + * LeakyRelu(x) - x if x >= 0 else alpha * x + * ThresholdedRelu(x) - x if x >= alpha else 0 + * ScaledTanh(x) - alpha * Tanh(beta * x) + * HardSigmoid(x) - min(max(alpha * x + beta, 0), 1) + * Elu(x) - x if x >= 0 else alpha * (e^x - 1) + * Softsign(x) - x/(1 + |x|) + * Softplus(x) - log(1 + e^x) + + Equations (Default: f=Sigmoid, g=Tanh): + + * zt = f(Xt*(Wz^T) + Ht-1*(Rz^T) + Wbz + Rbz) + * rt = f(Xt*(Wr^T) + Ht-1*(Rr^T) + Wbr + Rbr) + * ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh) # default, when linear_before_reset = 0 + * ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh) # when linear_before_reset != 0 + * Ht = (1 - zt) (.) ht + zt (.) Ht-1 + This operator has **optional** inputs/outputs. See [the doc](IR.md) for more details about the representation of optional arguments. 
An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
activation_alpha : list of floats
+
Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators. For example with LeakyRelu, the default alpha is 0.01.</dd>
+
activation_beta : list of floats
+
Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.
+
activations : list of strings
+
A list of 2 (or 4 if bidirectional) activation functions for update, reset, and hidden gates. The activation functions must be one of the activation functions specified above. Optional: See the equations for default if not specified.
+
clip : float
+
Cell clip threshold. Clipping bounds the elements of a tensor in the range of [-threshold, +threshold] and is applied to the input of activations. No clip if not specified.
+
direction : string (default is forward)
+
Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward (default), reverse, or bidirectional.
+
hidden_size : int
+
Number of neurons in the hidden layer
+
layout : int (default is 0)
+
The shape format of inputs X, initial_h and outputs Y, Y_h. If 0, the following shapes are expected: X.shape = [seq_length, batch_size, input_size], Y.shape = [seq_length, num_directions, batch_size, hidden_size], initial_h.shape = Y_h.shape = [num_directions, batch_size, hidden_size]. If 1, the following shapes are expected: X.shape = [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length, num_directions, hidden_size], initial_h.shape = Y_h.shape = [batch_size, num_directions, hidden_size].
+
linear_before_reset : int (default is 0)
+
When computing the output of the hidden gate, apply the linear transformation before multiplying by the output of the reset gate.
+
+ +#### Inputs (3 - 6) + +
+
X (differentiable) : T
+
The input sequences packed (and potentially padded) into one 3-D tensor with the shape of `[seq_length, batch_size, input_size]`.
+
W (differentiable) : T
+
The weight tensor for the gates. Concatenation of `W[zrh]` and `WB[zrh]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 3*hidden_size, input_size]`.
+
R (differentiable) : T
+
The recurrence weight tensor. Concatenation of `R[zrh]` and `RB[zrh]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 3*hidden_size, hidden_size]`.
+
B (optional, differentiable) : T
+
The bias tensor for the gates. Concatenation of `[Wb[zrh], Rb[zrh]]` and `[WBb[zrh], RBb[zrh]]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 6*hidden_size]`. Optional: If not specified - assumed to be 0
+
sequence_lens (optional, non-differentiable) : T1
+
Optional tensor specifying lengths of the sequences in a batch. If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
+
initial_h (optional, non-differentiable) : T
+
Optional initial value of the hidden. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+
+ +#### Outputs (0 - 2) + +
+
Y (optional, differentiable) : T
+
A tensor that concats all the intermediate output values of the hidden. It has shape `[seq_length, num_directions, batch_size, hidden_size]`.
+
Y_h (optional, differentiable) : T
+
The last output value of the hidden. It has shape `[num_directions, batch_size, hidden_size]`.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
T1 : tensor(int32)
+
Constrain seq_lens to integer tensor.
+
+ +### **GlobalAveragePool-22** + + GlobalAveragePool consumes an input tensor X and applies average pooling across + the values in the same channel. This is equivalent to AveragePool with kernel size + equal to the spatial dimension of input tensor. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input. The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **GlobalLpPool-22** + + GlobalLpPool consumes an input tensor X and applies lp pool pooling across + the values in the same channel. This is equivalent to LpPool with kernel size + equal to the spatial dimension of input tensor. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
p : int (default is 2)
+
p value of the Lp norm used to pool over the input data.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input. The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)</dt>
+
Constrain input and output types to float tensors.
+
+ +### **GlobalMaxPool-22** + + GlobalMaxPool consumes an input tensor X and applies max pooling across + the values in the same channel. This is equivalent to MaxPool with kernel size + equal to the spatial dimension of input tensor. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input. The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **GridSample-22** + + Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`. + For spatial input `X` with shape (N, C, H, W), the `grid` will have shape (N, H_out, W_out, 2), + the output `Y` will have shape (N, C, H_out, W_out). For volumetric input `X` with shape (N, C, D, H, W), + the `grid` will have shape (N, D_out, H_out, W_out, 3), the output `Y` will have shape (N, C, D_out, H_out, W_out). + More generally, for an input `X` of rank r+2 with shape (N, C, d1, d2, ..., dr), + the `grid` will have shape (N, D1_out, D2_out, ..., Dr_out, r), the output `Y` will have shape (N, C, D1_out, D2_out, ..., Dr_out). + + The tensor `X` contains values at centers of square pixels (voxels, etc) locations such as (n, c, d1_in, d2_in, ..., dr_in). + The (n, d1_out, d2_out, ..., dr_out, :) values from the tensor `grid` are the normalized positions for interpolating the values + at the (n, c, d1_out, d2_out, ..., dr_out) locations from the output tensor `Y` using a specified interpolation method (the mode) + and a padding mode (for `grid` positions falling outside the 2-dimensional image). + + For example, the values in `grid[n, h_out, w_out, :]` are size-2 vectors specifying normalized positions in the 2-dimensional space of `X`. + They are used to interpolate output values of `Y[n, c, h_out, w_out]`. + + The GridSample operator is often used in doing grid generator and sampler in the + [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). + See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html). + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
align_corners : int (default is 0)
+
If align_corners=1, the extrema (-1 and 1) are considered as referring to the center points of the input's corner pixels (voxels, etc.). If align_corners=0, they are instead considered as referring to the corner points of the input's corner pixels (voxels, etc.), making the sampling more resolution agnostic.
+
mode : string (default is linear)
+
Three interpolation modes: linear (default), nearest and cubic. The "linear" mode includes linear and N-linear interpolation modes depending on the number of spatial dimensions of the input tensor (i.e. linear for 1 spatial dimension, bilinear for 2 spatial dimensions, etc.). The "cubic" mode also includes N-cubic interpolation modes following the same rules. The "nearest" mode rounds to the nearest even index when the sampling point falls halfway between two indices.
+
padding_mode : string (default is zeros)
+
Support padding modes for outside grid values: `zeros`(default), `border`, `reflection`. zeros: use 0 for out-of-bound grid locations, border: use border values for out-of-bound grid locations, reflection: use values at locations reflected by the border for out-of-bound grid locations. If index 0 represents the margin pixel, the reflected value at index -1 will be the same as the value at index 1. For location far away from the border, it will keep being reflected until becoming in bound. If pixel location x = -3.5 reflects by border -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' = 0.5.
+
+ +#### Inputs + +
+
X (differentiable) : T1
+
Input tensor of rank r+2 that has shape (N, C, D1, D2, ..., Dr), where N is the batch size, C is the number of channels, D1, D2, ..., Dr are the spatial dimensions.
+
grid (non-differentiable) : T2
+
Input offset of shape (N, D1_out, D2_out, ..., Dr_out, r), where D1_out, D2_out, ..., Dr_out are the spatial dimensions of the grid and output, and r is the number of spatial dimensions. Grid specifies the sampling locations normalized by the input spatial dimensions. Therefore, it should have most values in the range of [-1, 1]. If the grid has values outside the range of [-1, 1], the corresponding outputs will be handled as defined by padding_mode. Following computer vision convention, the coordinates in the length-r location vector are listed from the innermost tensor dimension to the outermost, the opposite of regular tensor indexing.
+
+ +#### Outputs + +
+
Y (differentiable) : T1
+
Output tensor of rank r+2 that has shape (N, C, D1_out, D2_out, ..., Dr_out) of the sampled values. For integer input types, intermediate values are computed as floating point and cast to integer at the end.
+
+ +#### Type Constraints + +
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
Constrain input `X` and output `Y` types to all tensor types.
+
T2 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain grid types to float tensors.
+
+ +### **HardSigmoid-22** + + HardSigmoid takes one input data (Tensor) and produces one output data + (Tensor) where the HardSigmoid function, y = max(0, min(1, alpha * x + beta)), + is applied to the tensor elementwise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
alpha : float (default is 0.2)
+
Value of alpha.
+
beta : float (default is 0.5)
+
Value of beta.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **HardSwish-22** + + HardSwish takes one input data (Tensor) and produces one output data (Tensor) where + the HardSwish function, y = x * max(0, min(1, alpha * x + beta)) = x * HardSigmoid(x), + where alpha = 1/6 and beta = 0.5, is applied to the tensor elementwise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **InstanceNormalization-22** + + Carries out instance normalization as described in the paper + https://arxiv.org/abs/1607.08022. + + y = scale * (x - mean) / sqrt(variance + epsilon) + B, + where mean and variance are computed per instance per channel. + + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
epsilon : float (default is 1e-05)
+
The epsilon value to use to avoid division by zero.
+
+ +#### Inputs + +
+
input (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
scale (differentiable) : T
+
The input 1-dimensional scale tensor of size C.
+
B (differentiable) : T
+
The input 1-dimensional bias tensor of size C.
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The output tensor of the same shape as input.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **LSTM-22** + + Computes an one-layer LSTM. This operator is usually supported via some + custom implementation such as CuDNN. + + Notations: + + * `X` - input tensor + * `i` - input gate + * `o` - output gate + * `f` - forget gate + * `c` - cell gate + * `t` - time step (t-1 means previous time step) + * `W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates + * `R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates + * `Wb[iofc]` - W bias vectors for input, output, forget, and cell gates + * `Rb[iofc]` - R bias vectors for input, output, forget, and cell gates + * `P[iof]` - P peephole weight vector for input, output, and forget gates + * `WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates + * `RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates + * `WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates + * `RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates + * `PB[iof]` - P peephole weight vector for backward input, output, and forget gates + * `H` - Hidden state + * `num_directions` - 2 if direction == bidirectional else 1 + + Activation functions: + + * Relu(x) - max(0, x) + * Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) + * Sigmoid(x) - 1/(1 + e^{-x}) + + NOTE: Below are optional + + * Affine(x) - alpha*x + beta + * LeakyRelu(x) - x if x >= 0 else alpha * x + * ThresholdedRelu(x) - x if x >= alpha else 0 + * ScaledTanh(x) - alpha*Tanh(beta*x) + * HardSigmoid(x) - min(max(alpha*x + beta, 0), 1) + * Elu(x) - x if x >= 0 else alpha*(e^x - 1) + * Softsign(x) - x/(1 + |x|) + * Softplus(x) - log(1 + e^x) + + Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): + + * it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) + * ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) + * ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) + * Ct = ft (.) Ct-1 + it (.) 
ct + * ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) + * Ht = ot (.) h(Ct) + This operator has **optional** inputs/outputs. See [the doc](IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
activation_alpha : list of floats
+
Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators. For example with LeakyRelu, the default alpha is 0.01.</dd>
+
activation_beta : list of floats
+
Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.
+
activations : list of strings
+
A list of 3 (or 6 if bidirectional) activation functions for input, output, forget, cell, and hidden. The activation functions must be one of the activation functions specified above. Optional: See the equations for default if not specified.
+
clip : float
+
Cell clip threshold. Clipping bounds the elements of a tensor in the range of [-threshold, +threshold] and is applied to the input of activations. No clip if not specified.
+
direction : string (default is forward)
+
Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward (default), reverse, or bidirectional.
+
hidden_size : int
+
Number of neurons in the hidden layer
+
input_forget : int (default is 0)
+
Couple the input and forget gates if 1.
+
layout : int (default is 0)
+
The shape format of inputs X, initial_h, initial_c and outputs Y, Y_h, Y_c. If 0, the following shapes are expected: X.shape = [seq_length, batch_size, input_size], Y.shape = [seq_length, num_directions, batch_size, hidden_size], initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [num_directions, batch_size, hidden_size]. If 1, the following shapes are expected: X.shape = [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length, num_directions, hidden_size], initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [batch_size, num_directions, hidden_size].
+
+ +#### Inputs (3 - 8) + +
+
X (differentiable) : T
+
The input sequences packed (and potentially padded) into one 3-D tensor with the shape of `[seq_length, batch_size, input_size]`.
+
W (differentiable) : T
+
The weight tensor for the gates. Concatenation of `W[iofc]` and `WB[iofc]` (if bidirectional) along dimension 0. The tensor has shape `[num_directions, 4*hidden_size, input_size]`.
+
R (differentiable) : T
+
The recurrence weight tensor. Concatenation of `R[iofc]` and `RB[iofc]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 4*hidden_size, hidden_size]`.
+
B (optional, differentiable) : T
+
The bias tensor for input gate. Concatenation of `[Wb[iofc], Rb[iofc]]`, and `[WBb[iofc], RBb[iofc]]` (if bidirectional) along dimension 0. This tensor has shape `[num_directions, 8*hidden_size]`. Optional: If not specified - assumed to be 0.
+
sequence_lens (optional, non-differentiable) : T1
+
Optional tensor specifying lengths of the sequences in a batch. If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
+
initial_h (optional, non-differentiable) : T
+
Optional initial value of the hidden. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+
initial_c (optional, non-differentiable) : T
+
Optional initial value of the cell. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+
P (optional, differentiable) : T
+
The weight tensor for peepholes. Concatenation of `P[iof]` and `PB[iof]` (if bidirectional) along dimension 0. It has shape `[num_directions, 3*hidden_size]`. Optional: If not specified - assumed to be 0.</dd>
+
+ +#### Outputs (0 - 3) + +
+
Y (optional, differentiable) : T
+
A tensor that concats all the intermediate output values of the hidden. It has shape `[seq_length, num_directions, batch_size, hidden_size]`.
+
Y_h (optional, differentiable) : T
+
The last output value of the hidden. It has shape `[num_directions, batch_size, hidden_size]`.
+
Y_c (optional, differentiable) : T
+
The last output value of the cell. It has shape `[num_directions, batch_size, hidden_size]`.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
T1 : tensor(int32)
+
Constrain seq_lens to integer tensor.
+
+ +### **LpNormalization-22** + + Given a matrix, apply Lp-normalization along the provided axis. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
axis : int (default is -1)
+
The axis on which to apply normalization, -1 mean last axis.
+
p : int (default is 2)
+
The order of the normalization, only 1 or 2 are supported.
+
+ +#### Inputs + +
+
input (differentiable) : T
+
Input matrix
+
+ +#### Outputs + +
+
output (differentiable) : T
+
Matrix after normalization
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **LpPool-22** + + LpPool consumes an input tensor X and applies Lp pooling across + the tensor according to kernel sizes, stride sizes, and pad lengths. + Lp pooling consisting of computing the Lp norm on all values of a subset + of the input tensor according to the kernel size and downsampling the + data into the output tensor Y for further processing. The output spatial shape will be following: + ``` + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1) + ``` + or + ``` + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1) + ``` + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. + + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: + ``` + VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: + ``` + pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i] + ``` + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +<dl>
+
auto_pad : string (default is NOTSET)
+
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+
ceil_mode : int (default is 0)
+
Whether to use ceil or floor (default) to compute the output shape.
+
dilations : list of ints
+
dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.
+
kernel_shape : list of ints (required)
+
The size of the kernel along each axis.
+
p : int (default is 2)
+
p value of the Lp norm used to pool over the input data.
+
pads : list of ints
+
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+
strides : list of ints
+
Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output data tensor from Lp pooling across the input tensor. Dimensions will vary based on various kernel, stride, and pad sizes.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **MaxPool-22** + + MaxPool consumes an input tensor X and applies max pooling across + the tensor according to kernel sizes, stride sizes, and pad lengths. + max pooling consisting of computing the max on all values of a + subset of the input tensor according to the kernel size and downsampling the + data into the output tensor Y for further processing. The output spatial shape is calculated differently + depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. + With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + ``` + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + ``` + or + ``` + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + ``` + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. + + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + ``` + VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): + ``` + VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: + ``` + pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] + ``` + The output of each pooling window is maximum number of elements exclude pad. + + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
auto_pad : string (default is NOTSET)
+
auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where default value is NOTSET, which means explicit padding is used. SAME_UPPER or SAME_LOWER mean pad the input so that `output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. The padding is split between the two sides equally or almost equally (depending on whether it is even or odd). In case the padding is an odd number, the extra padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.
+
ceil_mode : int (default is 0)
+
Whether to use ceil or floor (default) to compute the output shape.
+
dilations : list of ints
+
Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.
+
kernel_shape : list of ints (required)
+
The size of the kernel along each axis.
+
pads : list of ints
+
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+
storage_order : int (default is 0)
+
The storage order of the tensor. 0 is row major, and 1 is column major. This attribute is used only to convert an n-tuple index value into a single integer value for producing the second output.
+
strides : list of ints
+
Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+
+ +#### Outputs (1 - 2) + +
+
Y (differentiable) : T
+
Output data tensor from average or max pooling across the input tensor. Dimensions will vary based on various kernel, stride, and pad sizes. Floor value of the dimension is used
+
Indices (optional, non-differentiable) : I
+
Indices tensor from max pooling across the input tensor. The dimensions of indices are the same as output tensor. The values in indices of are the indices of the selected values during pooling. The indices are computed as flatten 1-D tensor, and the indices do not consider padding. So the values in indices are in [0, N x C x D1 x ... x Dn).
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(uint8)
+
Constrain input and output types to float and 8 bit tensors.
+
I : tensor(int64)
+
Constrain index tensor to int64
+
+ +### **MaxRoiPool-22** + + ROI max pool consumes an input tensor X and region of interests (RoIs) to + apply max pooling across each RoI, to produce output 4-D tensor of shape + (num_rois, channels, pooled_shape[0], pooled_shape[1]). + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
pooled_shape : list of ints (required)
+
ROI pool output shape (height, width).
+
spatial_scale : float (default is 1.0)
+
Multiplicative spatial scale factor to translate ROI coordinates from their input scale to the scale used when pooling.
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input data tensor from the previous operator; dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data.
+
rois (non-differentiable) : T
+
RoIs (Regions of Interest) to pool over. Should be a 2-D tensor of shape (num_rois, 5) given as [[batch_id, x1, y1, x2, y2], ...].
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
RoI pooled output 4-D tensor of shape (num_rois, channels, pooled_shape[0], pooled_shape[1]).
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **MaxUnpool-22** + + MaxUnpool essentially computes the partial inverse of the MaxPool op. + The input information to this op is typically the output information from a MaxPool op. The first + input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output) + from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding + to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op. + The third (optional) input is a tensor that specifies the output size of the unpooling operation. + + MaxUnpool is intended to do 'partial' inverse of the MaxPool op. 'Partial' because all the non-maximal + values from the original input to MaxPool are set to zero in the output of the MaxUnpool op. Pooling + the result of an unpooling operation should give back the original input to the unpooling op. + + MaxUnpool can produce the same output size for several input sizes, which makes unpooling op ambiguous. + The third input argument, output_size, is meant to disambiguate the op and produce output tensor of + known/predictable size. + + In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads, + which define the exact unpooling op. The attributes typically have the same values as the corresponding + pooling op that the unpooling op is trying to invert. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
kernel_shape : list of ints (required)
+
The size of the kernel along each axis.
+
pads : list of ints
+
Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the number of pixels added to the beginning and end part of the corresponding axis. `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, the number of pixels added at the end of axis `i`. This attribute cannot be used simultaneously with auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.
+
strides : list of ints
+
Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.
+
+ +#### Inputs (2 - 3) + +
+
X (differentiable) : T1
+
Input data tensor that has to be unpooled. This tensor is typically the first output of the MaxPool op.Dimensions for image case are (N x C x H x W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data. For non-image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size. Optionally, if dimension denotation is in effect, the operation expects the input data tensor to arrive with the dimension denotation of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].
+
I (non-differentiable) : T2
+
Input data tensor containing the indices corresponding to elements in the first input tensor X.This tensor is typically the second output of the MaxPool op.Dimensions must be the same as input tensor X. The indices are linear, i.e. computed considering the tensor as flattened 1-D tensor, assuming row-major storage. Also, the linear indices should not consider padding. So the values in indices are in the range [0, N x C x D1 x ... x Dn).
+
output_shape (optional, non-differentiable) : T2
+
The shape of the output can be explicitly set which will cause pads values to be auto generated. If 'output_shape' is specified, 'pads' values are ignored.
+
+ +#### Outputs + +
+
output (differentiable) : T1
+
Output data tensor that contains the result of the unpooling.
+
+ +#### Type Constraints + +
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
T2 : tensor(int64)
+
Constrain index tensor to int64
+
+ +### **Mish-22** + + Mish: A Self Regularized Non-Monotonic Neural Activation Function. + + Perform the linear unit element-wise on the input tensor X using formula: + + ``` + mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x})) + ``` + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input X and output types to float tensors.
+
+ +### **Multinomial-22** + + Generate a tensor of samples from a multinomial distribution according to the probabilities + of each of the possible outcomes. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int (default is 6)
+
(Optional) The data type for the elements of the output tensor, if not specified, we will use int32.
+
sample_size : int (default is 1)
+
Number of times to sample.
+
seed : float
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
+ +#### Inputs + +
+
input : T1
+
Input tensor with shape [batch_size, class_size], where class_size is the number of all possible outcomes. Each value along the axis zero represents the unnormalized log-probability of each corresponding outcome in a batch.
+
+ +#### Outputs + +
+
output : T2
+
Output tensor with shape [batch_size, sample_size], where sample_size is the number of times to sample. Each value along the axis zero represents the outcome of the corresponding sample in a batch.
+
+ +#### Type Constraints + +
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input types to float tensors.
+
T2 : tensor(int32), tensor(int64)
+
Constrain output types to integral tensors.
+
+ +### **NegativeLogLikelihoodLoss-22** + + A NegativeLogLikelihoodLoss operator computes (weighted) negative log likelihood loss. + Its "input" tensor has the shape of (N, C, d1, d2, ..., dk) where k >= 0. + The "input" tensor contains log-probabilities for input[n, :, d_1, d_2,..., d_k] being in a class of [0, C). + The operator's "target" input tensor has the shape of (N, d1, d2, ..., dk). It encodes class labels (one of C classes) + or it may contain a special value (indicated by an attribute ignore_index) for N x d1 x d2 x ... x dk samples. + The loss value for input[n, :, d_1, d_2,...d_k] being classified as class c = target[n][d_1][d_2]...[d_k] is computed as: + + ``` + loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k]. + ``` + + When an optional "weight" is provided, the sample loss is calculated as: + + ``` + loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c]. + ``` + + loss is zero for the case when target-value equals ignore_index. + + ``` + loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index + ``` + + If "reduction" attribute is set to "none", the operator's output will be the above loss with shape (N, d1, d2, ..., dk). + If "reduction" attribute is set to "mean" (the default attribute value), the output loss is (weight) averaged: + + ``` + mean(loss), if "weight" is not provided, + ``` + + or if weight is provided, + + ``` + sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples. + ``` + + If "reduction" attribute is set to "sum", the output is a scalar: `sum(loss)`. + + See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss. 
+ + Example 1: + + ``` + // negative log likelihood loss, "none" reduction + N, C, d1 = 2, 3, 2 + input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] + target = [[2, 1], [0, 2]] + + loss = np.zeros((N, d1)) + for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] + + // print(loss) + // [[-3. -2.] + // [-0. -2.]] + ``` + + Example 2: + + ``` + // weighted negative log likelihood loss, sum reduction + N, C, d1 = 2, 3, 2 + input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] + target = [[2, 1], [0, 2]] + weight = [0.2, 0.3, 0.1] + loss = np.zeros((N, d1)) + for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] * weight[c] + + loss = np.sum(loss) + // print(loss) + // -1.1 + ``` + + Example 3: + + ``` + // weighted negative log likelihood loss, mean reduction + N, C, d1 = 2, 3, 2 + input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] + target = [[2, 1], [0, 2]] + weight = [0.2, 0.3, 0.1] + loss = np.zeros((N, d1)) + weight_total = 0 + for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] * weight[c] + weight_total = weight_total + weight[c] + + loss = np.sum(loss) / weight_total + // print(loss) + // -1.57 + ``` + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
ignore_index : int
+
Specifies a target value that is ignored and does not contribute to the input gradient. It's an optional value.
+
reduction : string (default is mean)
+
Type of reduction to apply to loss: none, sum, mean (default). 'none': the output is the loss for each sample. 'sum': the output will be summed. 'mean': the sum of the output will be divided by the sum of applied weights.
+
+ +#### Inputs (2 - 3) + +
+
input (differentiable) : T
+
Input tensor of shape (N, C) or (N, C, d1, d2, ..., dk).
+
target (non-differentiable) : Tind
+
Target tensor of shape (N) or (N, d1, d2, ..., dk). Target element value shall be in range of [0, C). If ignore_index is specified, it may have a value outside [0, C) and the target values should either be in the range [0, C) or have the value ignore_index.
+
weight (optional, non-differentiable) : T
+
Optional rescaling weight tensor. If given, it has to be a tensor of size C. Otherwise, it is treated as if having all ones.
+
+ +#### Outputs + +
+
loss (differentiable) : T
+
The negative log likelihood loss
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input, weight, and output types to floating-point tensors.
+
Tind : tensor(int32), tensor(int64)
+
Constrain target to integer types
+
+ +### **RNN-22** + + Computes an one-layer simple RNN. This operator is usually supported + via some custom implementation such as CuDNN. + + Notations: + + * `X` - input tensor + * `i` - input gate + * `t` - time step (t-1 means previous time step) + * `Wi` - W parameter weight matrix for input gate + * `Ri` - R recurrence weight matrix for input gate + * `Wbi` - W parameter bias vector for input gate + * `Rbi` - R parameter bias vector for input gate + * `WBi` - W parameter weight matrix for backward input gate + * `RBi` - R recurrence weight matrix for backward input gate + * `WBbi` - WR bias vectors for backward input gate + * `RBbi` - RR bias vectors for backward input gate + * `H` - Hidden state + * `num_directions` - 2 if direction == bidirectional else 1 + + Activation functions: + + * Relu(x) - max(0, x) + * Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) + * Sigmoid(x) - 1/(1 + e^{-x}) + + NOTE: Below are optional + + * Affine(x) - alpha*x + beta + * LeakyRelu(x) - x if x >= 0 else alpha * x + * ThresholdedRelu(x) - x if x >= alpha else 0 + * ScaledTanh(x) - alpha*Tanh(beta*x) + * HardSigmoid(x) - min(max(alpha*x + beta, 0), 1) + * Elu(x) - x if x >= 0 else alpha*(e^x - 1) + * Softsign(x) - x/(1 + |x|) + * Softplus(x) - log(1 + e^x) + + Equations (Default: f=Tanh): + + * Ht = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Wbi + Rbi) + This operator has **optional** inputs/outputs. See [the doc](IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
activation_alpha : list of floats
+
Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.For example with LeakyRelu, the default alpha is 0.01.
+
activation_beta : list of floats
+
Optional scaling values used by some activation functions. The values are consumed in the order of activation functions, for example (f, g, h) in LSTM. Default values are the same as of corresponding ONNX operators.
+
activations : list of strings (default is ['Tanh', 'Tanh'])
+
One (or two if bidirectional) activation function for input gate. The activation function must be one of the activation functions specified above. Optional: Default `Tanh` if not specified.
+
clip : float
+
Cell clip threshold. Clipping bounds the elements of a tensor in the range of [-threshold, +threshold] and is applied to the input of activations. No clip if not specified.
+
direction : string (default is forward)
+
Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward (default), reverse, or bidirectional.
+
hidden_size : int
+
Number of neurons in the hidden layer
+
layout : int (default is 0)
+
The shape format of inputs X, initial_h and outputs Y, Y_h. If 0, the following shapes are expected: X.shape = [seq_length, batch_size, input_size], Y.shape = [seq_length, num_directions, batch_size, hidden_size], initial_h.shape = Y_h.shape = [num_directions, batch_size, hidden_size]. If 1, the following shapes are expected: X.shape = [batch_size, seq_length, input_size], Y.shape = [batch_size, seq_length, num_directions, hidden_size], initial_h.shape = Y_h.shape = [batch_size, num_directions, hidden_size].
+
+ +#### Inputs (3 - 6) + +
+
X (differentiable) : T
+
The input sequences packed (and potentially padded) into one 3-D tensor with the shape of `[seq_length, batch_size, input_size]`.
+
W (differentiable) : T
+
The weight tensor for input gate. Concatenation of `Wi` and `WBi` (if bidirectional). The tensor has shape `[num_directions, hidden_size, input_size]`.
+
R (differentiable) : T
+
The recurrence weight tensor. Concatenation of `Ri` and `RBi` (if bidirectional). The tensor has shape `[num_directions, hidden_size, hidden_size]`.
+
B (optional, differentiable) : T
+
The bias tensor for input gate. Concatenation of `[Wbi, Rbi]` and `[WBbi, RBbi]` (if bidirectional). The tensor has shape `[num_directions, 2*hidden_size]`. Optional: If not specified - assumed to be 0.
+
sequence_lens (optional, non-differentiable) : T1
+
Optional tensor specifying lengths of the sequences in a batch. If not specified - assumed all sequences in the batch to have length `seq_length`. It has shape `[batch_size]`.
+
initial_h (optional, non-differentiable) : T
+
Optional initial value of the hidden. If not specified - assumed to be 0. It has shape `[num_directions, batch_size, hidden_size]`.
+
+ +#### Outputs (0 - 2) + +
+
Y (optional, differentiable) : T
+
A tensor that concats all the intermediate output values of the hidden. It has shape `[seq_length, num_directions, batch_size, hidden_size]`.
+
Y_h (optional, differentiable) : T
+
The last output value of the hidden. It has shape `[num_directions, batch_size, hidden_size]`.
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
T1 : tensor(int32)
+
Constrain seq_lens to integer tensor.
+
+ +### **RandomNormal-22** + + Generate a tensor with random values drawn from a normal distribution. The shape + of the tensor is specified by the `shape` argument and the parameter of the normal distribution + specified by `mean` and `scale`. + + The data type is specified by the 'dtype' argument. The 'dtype' argument must + be one of the data types specified in the 'DataType' enum field in the + TensorProto message. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int (default is 1)
+
The data type for the elements of the output tensor. Default is TensorProto::FLOAT.
+
mean : float (default is 0.0)
+
The mean of the normal distribution.
+
scale : float (default is 1.0)
+
The standard deviation of the normal distribution.
+
seed : float
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
shape : list of ints (required)
+
The shape of the output tensor.
+
+ +#### Inputs + + +#### Outputs + +
+
output : T
+
Output tensor of random values drawn from normal distribution
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain output types to float tensors.
+
+ +### **RandomNormalLike-22** + + Generate a tensor with random values drawn from a normal distribution. + The shape of the output tensor is copied from the shape of the input tensor, + and the parameters of the normal distribution are specified by `mean` and `scale`. + + The data type is specified by the 'dtype' argument, or copied from the input tensor if not provided. + The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the + TensorProto message, and be valid as an output type. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int
+
(Optional) The data type for the elements of the output tensor, if not specified, we will use the data type of the input tensor.
+
mean : float (default is 0.0)
+
The mean of the normal distribution.
+
scale : float (default is 1.0)
+
The standard deviation of the normal distribution.
+
seed : float
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
+ +#### Inputs + +
+
input : T1
+
Input tensor to copy shape and optionally type information from.
+
+ +#### Outputs + +
+
output : T2
+
Output tensor of random values drawn from normal distribution
+
+ +#### Type Constraints + +
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.
+
T2 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain output types to float tensors.
+
+ +### **RandomUniform-22** + + Generate a tensor with random values drawn from a uniform distribution. The shape + of the tensor is specified by the `shape` argument and the range by `low` and `high`. + + The data type is specified by the 'dtype' argument. The 'dtype' argument must + be one of the data types specified in the 'DataType' enum field in the + TensorProto message. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int (default is 1)
+
The data type for the elements of the output tensor. If not specified, default is TensorProto::FLOAT.
+
high : float (default is 1.0)
+
Upper boundary of the output values.
+
low : float (default is 0.0)
+
Lower boundary of the output values.
+
seed : float
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
shape : list of ints (required)
+
The shape of the output tensor.
+
+ +#### Inputs + + +#### Outputs + +
+
output : T
+
Output tensor of random values drawn from uniform distribution
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain output types to float tensors.
+
+ +### **RandomUniformLike-22** + + Generate a tensor with random values drawn from a uniform distribution. + The shape of the output tensor is copied from the shape of the input tensor, + and the parameters of the uniform distribution are specified by `low` and `high`. + + The data type is specified by the 'dtype' argument, or copied from the input tensor if not provided. + The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the + TensorProto message and be valid as an output type. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
dtype : int
+
(Optional) The data type for the elements of the output tensor, if not specified, we will use the data type of the input tensor.
+
high : float (default is 1.0)
+
Upper boundary of the output values.
+
low : float (default is 0.0)
+
Lower boundary of the output values.
+
seed : float
+
(Optional) Seed to the random generator, if not specified we will auto generate one.
+
+ +#### Inputs + +
+
input : T1
+
Input tensor to copy shape and optionally type information from.
+
+ +#### Outputs + +
+
output : T2
+
Output tensor of random values drawn from uniform distribution
+
+ +#### Type Constraints + +
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.
+
T2 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain output types to float tensors.
+
+ +### **RoiAlign-22** + + Region of Interest (RoI) align operation described in the + [Mask R-CNN paper](https://arxiv.org/abs/1703.06870). + RoiAlign consumes an input tensor X and region of interests (rois) + to apply pooling across each RoI; it produces a 4-D tensor of shape + (num_rois, C, output_height, output_width). + + RoiAlign is proposed to avoid the misalignment by removing + quantizations while converting from original image into feature + map and from feature map into RoI feature; in each ROI bin, + the value of the sampled locations are computed directly + through bilinear interpolation. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
coordinate_transformation_mode : string (default is half_pixel)
+
Allowed values are 'half_pixel' and 'output_half_pixel'. Use the value 'half_pixel' to pixel shift the input coordinates by -0.5 (the recommended behavior). Use the value 'output_half_pixel' to omit the pixel shift for the input (use this for a backward-compatible behavior).
+
mode : string (default is avg)
+
The pooling method. Two modes are supported: 'avg' and 'max'. Default is 'avg'.
+
output_height : int (default is 1)
+
default 1; Pooled output Y's height.
+
output_width : int (default is 1)
+
default 1; Pooled output Y's width.
+
sampling_ratio : int (default is 0)
+
Number of sampling points in the interpolation grid used to compute the output value of each pooled output bin. If > 0, then exactly sampling_ratio x sampling_ratio grid points are used. If == 0, then an adaptive number of grid points are used (computed as ceil(roi_width / output_width), and likewise for height). Default is 0.
+
spatial_scale : float (default is 1.0)
+
Multiplicative spatial scale factor to translate ROI coordinates from their input spatial scale to the scale used when pooling, i.e., spatial scale of the input feature map X relative to the input image. E.g.; default is 1.0f.
+
+ +#### Inputs + +
+
X : T1
+
Input data tensor from the previous operator; 4-D feature map of shape (N, C, H, W), where N is the batch size, C is the number of channels, and H and W are the height and the width of the data.
+
rois : T1
+
RoIs (Regions of Interest) to pool over; rois is 2-D input of shape (num_rois, 4) given as [[x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of the input image. Each coordinate set has a 1:1 correspondence with the 'batch_indices' input.
+
batch_indices : T2
+
1-D tensor of shape (num_rois,) with each element denoting the index of the corresponding image in the batch.
+
+ +#### Outputs + +
+
Y : T1
+
RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element Y[r-1] is a pooled feature map corresponding to the r-th RoI X[r-1].
+
+ +#### Type Constraints + +
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain types to float tensors.
+
T2 : tensor(int64)
+
Constrain types to int tensors.
+
+ +### **Round-22** + + Round takes one input Tensor and rounds the values, element-wise, meaning + it finds the nearest integer for each value. + In case of halves, the rule is to round them to the nearest even integer. + If input x is integral, +0, -0, NaN, or infinite, x itself is returned. + The output tensor has the same shape and type as the input. + + Examples: + ``` + round([0.9]) = [1.0] + round([2.5]) = [2.0] + round([2.3]) = [2.0] + round([1.5]) = [2.0] + round([-4.5]) = [-4.0] + ``` + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (non-differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (non-differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Selu-22** + + Selu takes one input data (Tensor) and produces one output data + (Tensor) where the scaled exponential linear unit function, + `y = gamma * (alpha * e^x - alpha) for x <= 0`, `y = gamma * x for x > 0`, + is applied to the tensor elementwise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
alpha : float (default is 1.67326)
+
Coefficient of SELU default to 1.67326319217681884765625 (i.e., float32 approximation of 1.6732632423543772848170429916717).
+
gamma : float (default is 1.0507)
+
Coefficient of SELU default to 1.05070102214813232421875 (i.e., float32 approximation of 1.0507009873554804934193349852946).
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Sin-22** + + Calculates the sine of the given input tensor, element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The sine of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Sinh-22** + + Calculates the hyperbolic sine of the given input tensor element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The hyperbolic sine values of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Softplus-22** + + Softplus takes one input data (Tensor) and produces one output data + (Tensor) where the softplus function, y = ln(exp(x) + 1), is applied to + the tensor elementwise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
X (differentiable) : T
+
1D input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
1D output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Softsign-22** + + Calculates the softsign (x/(1+|x|)) of the given input tensor element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The softsign (x/(1+|x|)) values of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **Tan-22** + + Calculates the tangent of the given input tensor, element-wise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : T
+
The tangent of the input tensor computed element-wise
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ +### **ThresholdedRelu-22** + + ThresholdedRelu takes one input data (Tensor) and produces one output data + (Tensor) where the rectified linear function, y = x for x > alpha, y = 0 otherwise, + is applied to the tensor elementwise. + +#### Version + +This version of the operator has been available since version 22 of the default ONNX operator set. + +#### Attributes + +
+
alpha : float (default is 1.0)
+
Threshold value
+
+ +#### Inputs + +
+
X (differentiable) : T
+
Input tensor
+
+ +#### Outputs + +
+
Y (differentiable) : T
+
Output tensor
+
+ +#### Type Constraints + +
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
+
Constrain input and output types to float tensors.
+
+ # ai.onnx.preview.training ## Version 1 of the 'ai.onnx.preview.training' operator set ### **ai.onnx.preview.training.Adagrad-1** diff --git a/docs/Operators.md b/docs/Operators.md index 8716cb8e54e..e1601a74d89 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -12,17 +12,17 @@ For an operator input/output's differentiability, it can be differentiable, |**Operator**|**Since version**|| |-|-|-| |Abs|13, 6, 1| -|Acos|7| -|Acosh|9| +|Acos|22, 7| +|Acosh|22, 9| |Add|14, 13, 7, 6, 1| |And|7, 1| |ArgMax|13, 12, 11, 1| |ArgMin|13, 12, 11, 1| -|Asin|7| -|Asinh|9| -|Atan|7| -|Atanh|9| -|AveragePool|19, 11, 10, 7, 1| +|Asin|22, 7| +|Asinh|22, 9| +|Atan|22, 7| +|Atanh|22, 9| +|AveragePool|22, 19, 11, 10, 7, 1| |BatchNormalization|15, 14, 9, 7, 6, 1| |BitShift|11| |BitwiseAnd|18| @@ -37,63 +37,63 @@ For an operator input/output's differentiability, it can be differentiable, |ConcatFromSequence|11| |Constant|21, 19, 13, 12, 11, 9, 1| |ConstantOfShape|21, 20, 9| -|Conv|11, 1| +|Conv|22, 11, 1| |ConvInteger|10| -|ConvTranspose|11, 1| -|Cos|7| -|Cosh|9| +|ConvTranspose|22, 11, 1| +|Cos|22, 7| +|Cosh|22, 9| |CumSum|14, 11| |DFT|20, 17| -|DeformConv|19| +|DeformConv|22, 19| |DepthToSpace|13, 11, 1| |DequantizeLinear|21, 19, 13, 10| -|Det|11| +|Det|22, 11| |Div|14, 13, 7, 6, 1| -|Dropout|13, 12, 10, 7, 6, 1| +|Dropout|22, 13, 12, 10, 7, 6, 1| |Einsum|12| |Equal|19, 13, 11, 7, 1| |Erf|13, 9| |Exp|13, 6, 1| |Expand|13, 8| -|EyeLike|9| +|EyeLike|22, 9| |Flatten|21, 13, 11, 9, 1| |Floor|13, 6, 1| -|GRU|14, 7, 3, 1| +|GRU|22, 14, 7, 3, 1| |Gather|13, 11, 1| |GatherElements|13, 11| |GatherND|13, 12, 11| |Gemm|13, 11, 9, 7, 6, 1| -|GlobalAveragePool|1| -|GlobalLpPool|2, 1| -|GlobalMaxPool|1| +|GlobalAveragePool|22, 1| +|GlobalLpPool|22, 2, 1| +|GlobalMaxPool|22, 1| |Greater|13, 9, 7, 1| -|GridSample|20, 16| +|GridSample|22, 20, 16| |Hardmax|13, 11, 1| |Identity|21, 19, 16, 14, 13, 1| |If|21, 19, 16, 13, 11, 1| |ImageDecoder|20| -|InstanceNormalization|6, 1| 
+|InstanceNormalization|22, 6, 1| |IsInf|20, 10| |IsNaN|20, 13, 9| |LRN|13, 1| -|LSTM|14, 7, 1| +|LSTM|22, 14, 7, 1| |Less|13, 9, 7, 1| |Log|13, 6, 1| |Loop|21, 19, 16, 13, 11, 1| -|LpNormalization|1| -|LpPool|18, 11, 2, 1| +|LpNormalization|22, 1| +|LpPool|22, 18, 11, 2, 1| |MatMul|13, 9, 1| |MatMulInteger|10| |Max|13, 12, 8, 6, 1| -|MaxPool|12, 11, 10, 8, 1| -|MaxRoiPool|1| -|MaxUnpool|11, 9| +|MaxPool|22, 12, 11, 10, 8, 1| +|MaxRoiPool|22, 1| +|MaxUnpool|22, 11, 9| |Mean|13, 8, 6, 1| |MelWeightMatrix|17| |Min|13, 12, 8, 6, 1| |Mod|13, 10| |Mul|14, 13, 7, 6, 1| -|Multinomial|7| +|Multinomial|22, 7| |Neg|13, 6, 1| |NonMaxSuppression|11, 10| |NonZero|13, 9| @@ -108,11 +108,11 @@ For an operator input/output's differentiability, it can be differentiable, |QLinearConv|10| |QLinearMatMul|21, 10| |QuantizeLinear|21, 19, 13, 10| -|RNN|14, 7, 1| -|RandomNormal|1| -|RandomNormalLike|1| -|RandomUniform|1| -|RandomUniformLike|1| +|RNN|22, 14, 7, 1| +|RandomNormal|22, 1| +|RandomNormalLike|22, 1| +|RandomUniform|22, 1| +|RandomUniformLike|22, 1| |Reciprocal|13, 6, 1| |ReduceMax|20, 18, 13, 12, 11, 1| |ReduceMean|18, 13, 11, 1| @@ -123,8 +123,8 @@ For an operator input/output's differentiability, it can be differentiable, |Reshape|21, 19, 14, 13, 5, 1| |Resize|19, 18, 13, 11, 10| |ReverseSequence|10| -|RoiAlign|16, 10| -|Round|11| +|RoiAlign|22, 16, 10| +|Round|22, 11| |STFT|17| |Scan|21, 19, 16, 11, 9, 8| |Scatter (deprecated)|11, 9| @@ -139,8 +139,8 @@ For an operator input/output's differentiability, it can be differentiable, |Shape|21, 19, 15, 13, 1| |Sigmoid|13, 6, 1| |Sign|13, 9| -|Sin|7| -|Sinh|9| +|Sin|22, 7| +|Sinh|22, 9| |Size|21, 19, 13, 1| |Slice|13, 11, 10, 1| |SpaceToDepth|13, 1| @@ -153,7 +153,7 @@ For an operator input/output's differentiability, it can be differentiable, |StringSplit|20| |Sub|14, 13, 7, 6, 1| |Sum|13, 8, 6, 1| -|Tan|7| +|Tan|22, 7| |Tanh|13, 6, 1| |TfIdfVectorizer|9| |Tile|13, 6, 1| @@ -167,28 +167,28 @@ For an operator input/output's 
differentiability, it can be differentiable, |Xor|7, 1| |**Function**|**Since version**|**Function version**| |AffineGrid|20|20| -|Bernoulli|15|15| +|Bernoulli|22, 15|22| |BlackmanWindow|17|17| |CastLike|21, 19, 15|21| |Celu|12|12| |CenterCropPad|18|18| |Clip|13, 12, 11, 6, 1|13| |DynamicQuantizeLinear|11|11| -|Elu|6, 1|18| +|Elu|22, 6, 1|18| |Gelu|20|20| |GreaterOrEqual|16, 12|16| |GroupNormalization|21, 18|21| |HammingWindow|17|17| |HannWindow|17|17| -|HardSigmoid|6, 1|18| -|HardSwish|14|14| +|HardSigmoid|22, 6, 1|18| +|HardSwish|22, 14|22| |LayerNormalization|17|17, 18| |LeakyRelu|16, 6, 1|16| |LessOrEqual|16, 12|16| |LogSoftmax|13, 11, 1|13, 18| |MeanVarianceNormalization|13, 9|13, 18| -|Mish|18|18| -|NegativeLogLikelihoodLoss|13, 12|13| +|Mish|22, 18|22| +|NegativeLogLikelihoodLoss|22, 13, 12|22| |PRelu|16, 9, 7, 6, 1|16| |Range|11|11| |ReduceL1|18, 13, 11, 1|18| @@ -197,14 +197,14 @@ For an operator input/output's differentiability, it can be differentiable, |ReduceLogSumExp|18, 13, 11, 1|18| |ReduceSumSquare|18, 13, 11, 1|18| |Relu|14, 13, 6, 1|18| -|Selu|6, 1|18| +|Selu|22, 6, 1|18| |SequenceMap|17|17| |Shrink|9|18| |Softmax|13, 11, 1|13, 18| |SoftmaxCrossEntropyLoss|13, 12|13| -|Softplus|1|18| -|Softsign|1|18| -|ThresholdedRelu|10|18| +|Softplus|22, 1|18| +|Softsign|22, 1|18| +|ThresholdedRelu|22, 10|18| ### ai.onnx.preview.training |**Operator**|**Since version**|| @@ -296,7 +296,9 @@ def abs(input: np.ndarray) -> np.ndarray: # noqa: A001 #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Inputs @@ -315,7 +317,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -350,7 +352,9 @@ expect(node, inputs=[x], outputs=[y], name="test_acos") #### Version -This version of the operator has been available since version 9 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 9 #### Inputs @@ -369,7 +373,7 @@ This version of the operator has been available since version 9 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -1398,7 +1402,9 @@ expect( #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Inputs @@ -1417,7 +1423,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -1452,7 +1458,9 @@ expect(node, inputs=[x], outputs=[y], name="test_asin") #### Version -This version of the operator has been available since version 9 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 9 #### Inputs @@ -1471,7 +1479,7 @@ This version of the operator has been available since version 9 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -1506,7 +1514,9 @@ expect(node, inputs=[x], outputs=[y], name="test_asinh") #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Inputs @@ -1525,7 +1535,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -1560,7 +1570,9 @@ expect(node, inputs=[x], outputs=[y], name="test_atan") #### Version -This version of the operator has been available since version 9 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 9 #### Inputs @@ -1579,7 +1591,7 @@ This version of the operator has been available since version 9 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -1645,9 +1657,9 @@ expect(node, inputs=[x], outputs=[y], name="test_atanh") #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1, 7, 10, 11 +Other versions of this operator: 1, 7, 10, 11, 19 #### Attributes @@ -1685,7 +1697,7 @@ Other versions of this operator: 1, -
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -2611,7 +2623,9 @@ expect( #### Version -This version of the operator has been available since version 15 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator:
15 #### Attributes @@ -2639,9 +2653,9 @@ This version of the operator has been available since version 15 of the default #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double)
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input types to float tensors.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bool)
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(bool)
Constrain output types to all numeric tensors and bool tensors.
@@ -3874,9 +3888,9 @@ Other versions of this operator: 15, -
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain input types. Casting from complex is not supported.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain output types. Casting to complex is not supported.
@@ -5547,9 +5561,9 @@ expect(node, inputs=[x], outputs=[y], name="test_constantofshape_int_zeros") #### Version -This version of the operator has been available since version 11 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator:
1 +Other versions of this operator: 1, 11 #### Attributes @@ -5589,7 +5603,7 @@ Other versions of this operator: 1 #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -6005,9 +6019,9 @@ expect( #### Version -This version of the operator has been available since version 11 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1 +Other versions of this operator: 1, 11 #### Attributes @@ -6051,7 +6065,7 @@ Other versions of this operator: 1 #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -6492,7 +6506,9 @@ expect(node, inputs=[x, W], outputs=[y], name="test_convtranspose_pads") #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Inputs @@ -6511,7 +6527,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -6546,7 +6562,9 @@ expect(node, inputs=[x], outputs=[y], name="test_cos") #### Version -This version of the operator has been available since version 9 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 9 #### Inputs @@ -6565,7 +6583,7 @@ This version of the operator has been available since version 9 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -6942,7 +6960,9 @@ expect( #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 19 #### Attributes @@ -6986,7 +7006,7 @@ This version of the operator has been available since version 19 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -7758,7 +7778,9 @@ expect( #### Version -This version of the operator has been available since version 11 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 11 #### Inputs @@ -7777,7 +7799,7 @@ This version of the operator has been available since version 11 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to floating-point tensors.
@@ -7927,9 +7949,9 @@ expect(node, inputs=[x, y], outputs=[z], name="test_div_bcast") #### Version -This version of the operator has been available since version 13 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1, 6, 7, 10, 12 +Other versions of this operator: 1, 6, 7, 10, 12, 13 #### Attributes @@ -7961,9 +7983,9 @@ Other versions of this operator: 1, -
T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
Constrain input and output types to float tensors.
-
T1 : tensor(float16), tensor(float), tensor(double)
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
Constrain input 'ratio' types to float tensors.
T2 : tensor(bool)
Constrain output 'mask' types to boolean tensors.
@@ -8513,9 +8535,9 @@ expect(node, inputs=[X], outputs=[Y], name="test_einsum_transpose") #### Version -This version of the operator has been available since version 6 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator:
1 +Other versions of this operator: 1, 6 #### Attributes @@ -8541,7 +8563,7 @@ Other versions of this operator: 1 #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -8779,7 +8801,7 @@ Other versions of this operator: 1, 9 #### Attributes @@ -8957,9 +8981,9 @@ This version of the operator has been available since version 9 of the default O #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool)
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(bool)
Constrain input types. Strings and complex are not supported.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool)
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(bool)
Constrain output types. Strings and complex are not supported.
@@ -9256,9 +9280,9 @@ expect(node, inputs=[x], outputs=[y], name="test_floor") #### Version -This version of the operator has been available since version 14 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1, 3, 7 +Other versions of this operator: 1, 3, 7, 14 #### Attributes @@ -9310,7 +9334,7 @@ Other versions of this operator: 1, 1 #### Inputs @@ -10451,7 +10477,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -10510,9 +10536,9 @@ expect(node, inputs=[x], outputs=[y], name="test_globalaveragepool_precomputed") #### Version -This version of the operator has been available since version 2 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1 +Other versions of this operator: 1, 2 #### Attributes @@ -10551,7 +10577,9 @@ Other versions of this operator: 1 #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 1 #### Inputs @@ -10570,7 +10598,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -10800,9 +10828,9 @@ Other versions of this operator: 12 #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 16 +Other versions of this operator: 16, 20 #### Attributes @@ -10834,9 +10862,9 @@ Other versions of this operator: 16 #### Type Constraints
-
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
Constrain input `X` and output `Y` types to all tensor types.
-
T2 : tensor(float16), tensor(float), tensor(double)
+
T2 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain grid types to float tensors.
@@ -11757,9 +11785,9 @@ expect(node, inputs=[size], outputs=[y], name="test_hannwindow_symmetric") #### Version -This version of the operator has been available since version 6 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1 +Other versions of this operator: 1, 6 #### Attributes @@ -11787,7 +11815,7 @@ Other versions of this operator: 1 #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -11841,7 +11869,9 @@ expect(node, inputs=[x], outputs=[y], name="test_hardsigmoid_default") #### Version -This version of the operator has been available since version 14 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 14 #### Inputs @@ -11860,7 +11890,7 @@ This version of the operator has been available since version 14 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -12708,9 +12738,9 @@ expect( #### Version -This version of the operator has been available since version 6 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1 +Other versions of this operator: 1, 6 #### Attributes @@ -12740,7 +12770,7 @@ Other versions of this operator: #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -13161,9 +13191,9 @@ expect(node, inputs=[x], outputs=[y], name="test_lrn") #### Version -This version of the operator has been available since version 14 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator:
1, 7 +Other versions of this operator: 1, 7, 14 #### Attributes @@ -13221,7 +13251,7 @@ Other versions of this operator: 1, 1 #### Attributes @@ -14733,7 +14765,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -14766,9 +14798,9 @@ This version of the operator has been available since version 1 of the default O #### Version -This version of the operator has been available since version 18 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1, 2, 11 +Other versions of this operator: 1, 2, 11, 18 #### Attributes @@ -14806,7 +14838,7 @@ Other versions of this operator: 1, -
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -15425,9 +15457,9 @@ for op_dtype in all_numeric_dtypes: #### Version -This version of the operator has been available since version 12 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator:
1, 8, 10, 11 +Other versions of this operator: 1, 8, 10, 11, 12 #### Attributes @@ -15467,7 +15499,7 @@ Other versions of this operator: 1, -
T : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(uint8)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(uint8)
Constrain input and output types to float and 8 bit tensors.
I : tensor(int64)
Constrain index tensor to int64
@@ -16294,7 +16326,9 @@ expect( #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator:
1 #### Attributes @@ -16324,7 +16358,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -16352,9 +16386,9 @@ This version of the operator has been available since version 1 of the default O #### Version -This version of the operator has been available since version 11 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 9 +Other versions of this operator: 9, 11 #### Attributes @@ -16388,7 +16422,7 @@ Other versions of this operator: 9 #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double)
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
T2 : tensor(int64)
Constrain index tensor to int64
@@ -16881,7 +16915,9 @@ for op_dtype in all_numeric_dtypes: #### Version -This version of the operator has been available since version 18 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 18 #### Inputs @@ -16900,7 +16936,7 @@ This version of the operator has been available since version 18 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input X and output types to float tensors.
@@ -17317,7 +17353,9 @@ expect(node, inputs=[x, y], outputs=[z], name="test_mul_bcast") #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Attributes @@ -17347,7 +17385,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double)
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input types to float tensors.
T2 : tensor(int32), tensor(int64)
Constrain output types to integral tensors.
@@ -17518,9 +17556,9 @@ expect(node, inputs=[x], outputs=[y], name="test_neg") #### Version -This version of the operator has been available since version 13 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 12 +Other versions of this operator: 12, 13 #### Attributes @@ -17552,7 +17590,7 @@ Other versions of this operator: 1, 7 +Other versions of this operator: 1, 7, 14 #### Attributes @@ -20853,7 +20891,7 @@ Other versions of this operator: 1, 1 #### Attributes @@ -21052,7 +21092,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain output types to float tensors.
@@ -21069,7 +21109,9 @@ This version of the operator has been available since version 1 of the default O #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 1 #### Attributes @@ -21101,9 +21143,9 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.
-
T2 : tensor(float16), tensor(float), tensor(double)
+
T2 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain output types to float tensors.
@@ -21119,7 +21161,9 @@ This version of the operator has been available since version 1 of the default O #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 1 #### Attributes @@ -21149,7 +21193,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain output types to float tensors.
@@ -21166,7 +21210,9 @@ This version of the operator has been available since version 1 of the default O #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 1 #### Attributes @@ -21198,9 +21244,9 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.
-
T2 : tensor(float16), tensor(float), tensor(double)
+
T2 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain output types to float tensors.
@@ -26592,9 +26638,9 @@ expect( #### Version -This version of the operator has been available since version 16 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 10 +Other versions of this operator: 10, 16 #### Attributes @@ -26634,7 +26680,7 @@ Other versions of this operator: 10 #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double)
+
T1 : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain types to float tensors.
T2 : tensor(int64)
Constrain types to int tensors.
@@ -26979,7 +27025,9 @@ expect( #### Version -This version of the operator has been available since version 11 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 11 #### Inputs @@ -26998,7 +27046,7 @@ This version of the operator has been available since version 11 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -28224,9 +28272,9 @@ expect( #### Version -This version of the operator has been available since version 6 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. -Other versions of this operator: 1 +Other versions of this operator: 1, 6 #### Attributes @@ -28254,7 +28302,7 @@ Other versions of this operator: 1 #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -29280,7 +29328,9 @@ expect(node, inputs=[x], outputs=[y], name="test_sign") #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Inputs @@ -29299,7 +29349,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -29334,7 +29384,9 @@ expect(node, inputs=[x], outputs=[y], name="test_sin") #### Version -This version of the operator has been available since version 9 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 9 #### Inputs @@ -29353,7 +29405,7 @@ This version of the operator has been available since version 9 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -31263,7 +31315,9 @@ expect( #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 1 #### Inputs @@ -31282,7 +31336,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -31319,7 +31373,9 @@ expect(node, inputs=[x], outputs=[y], name="test_softplus") #### Version -This version of the operator has been available since version 1 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 1 #### Inputs @@ -31338,7 +31394,7 @@ This version of the operator has been available since version 1 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -32938,7 +32994,9 @@ expect( #### Version -This version of the operator has been available since version 7 of the default ONNX operator set. +This version of the operator has been available since version 22 of the default ONNX operator set. + +Other versions of this operator: 7 #### Inputs @@ -32957,7 +33015,7 @@ This version of the operator has been available since version 7 of the default O #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
@@ -33013,7 +33071,7 @@ Other versions of this operator: 1, 10 #### Attributes @@ -33403,7 +33463,7 @@ This version of the operator has been available since version 10 of the default #### Type Constraints
-
T : tensor(float16), tensor(float), tensor(double)
+
T : tensor(bfloat16), tensor(float16), tensor(float), tensor(double)
Constrain input and output types to float tensors.
diff --git a/onnx/backend/test/data/node/test_acos/model.onnx b/onnx/backend/test/data/node/test_acos/model.onnx index cc7c695f558..206ab391191 100644 Binary files a/onnx/backend/test/data/node/test_acos/model.onnx and b/onnx/backend/test/data/node/test_acos/model.onnx differ diff --git a/onnx/backend/test/data/node/test_acos_example/model.onnx b/onnx/backend/test/data/node/test_acos_example/model.onnx index c986dd6ba90..8b29942f13f 100644 Binary files a/onnx/backend/test/data/node/test_acos_example/model.onnx and b/onnx/backend/test/data/node/test_acos_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_acosh/model.onnx b/onnx/backend/test/data/node/test_acosh/model.onnx index 9651625d11c..916010c59b8 100644 Binary files a/onnx/backend/test/data/node/test_acosh/model.onnx and b/onnx/backend/test/data/node/test_acosh/model.onnx differ diff --git a/onnx/backend/test/data/node/test_acosh_example/model.onnx b/onnx/backend/test/data/node/test_acosh_example/model.onnx index 42eb9a1641a..1fbfea14d9c 100644 Binary files a/onnx/backend/test/data/node/test_acosh_example/model.onnx and b/onnx/backend/test/data/node/test_acosh_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_asin/model.onnx b/onnx/backend/test/data/node/test_asin/model.onnx index a7caca1b1a5..e7d98048784 100644 Binary files a/onnx/backend/test/data/node/test_asin/model.onnx and b/onnx/backend/test/data/node/test_asin/model.onnx differ diff --git a/onnx/backend/test/data/node/test_asin_example/model.onnx b/onnx/backend/test/data/node/test_asin_example/model.onnx index a20be582129..22ddea64b64 100644 Binary files a/onnx/backend/test/data/node/test_asin_example/model.onnx and b/onnx/backend/test/data/node/test_asin_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_asinh/model.onnx b/onnx/backend/test/data/node/test_asinh/model.onnx index 922c0aef88e..96d1db46bdf 100644 Binary files a/onnx/backend/test/data/node/test_asinh/model.onnx and 
b/onnx/backend/test/data/node/test_asinh/model.onnx differ diff --git a/onnx/backend/test/data/node/test_asinh_example/model.onnx b/onnx/backend/test/data/node/test_asinh_example/model.onnx index 7d02941a675..eb477884976 100644 Binary files a/onnx/backend/test/data/node/test_asinh_example/model.onnx and b/onnx/backend/test/data/node/test_asinh_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_atan/model.onnx b/onnx/backend/test/data/node/test_atan/model.onnx index 7cd0e8e0416..4f2f5deee1d 100644 Binary files a/onnx/backend/test/data/node/test_atan/model.onnx and b/onnx/backend/test/data/node/test_atan/model.onnx differ diff --git a/onnx/backend/test/data/node/test_atan_example/model.onnx b/onnx/backend/test/data/node/test_atan_example/model.onnx index 7b9525ee935..bfe9894a60d 100644 Binary files a/onnx/backend/test/data/node/test_atan_example/model.onnx and b/onnx/backend/test/data/node/test_atan_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_atanh/model.onnx b/onnx/backend/test/data/node/test_atanh/model.onnx index 3d1716d23a5..a1db102d3e7 100644 Binary files a/onnx/backend/test/data/node/test_atanh/model.onnx and b/onnx/backend/test/data/node/test_atanh/model.onnx differ diff --git a/onnx/backend/test/data/node/test_atanh_example/model.onnx b/onnx/backend/test/data/node/test_atanh_example/model.onnx index 4e6d72d6a75..d0efa30625f 100644 Binary files a/onnx/backend/test/data/node/test_atanh_example/model.onnx and b/onnx/backend/test/data/node/test_atanh_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_1d_default/model.onnx b/onnx/backend/test/data/node/test_averagepool_1d_default/model.onnx index 14c0309706c..36f6b96a29b 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_1d_default/model.onnx and b/onnx/backend/test/data/node/test_averagepool_1d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_ceil/model.onnx 
b/onnx/backend/test/data/node/test_averagepool_2d_ceil/model.onnx index bb02ae58ca6..168a69574f5 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_ceil/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_ceil/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_default/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_default/model.onnx index bd1efdecb29..11a1b13ef75 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_default/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_dilations/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_dilations/model.onnx index 7de2313225e..ce3c86ed527 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_dilations/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_dilations/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_pads/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_pads/model.onnx index 2ed0e2e202d..7d5e02e1941 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_pads/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_pads_count_include_pad/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_pads_count_include_pad/model.onnx index af266d7ac1a..4157c3ad902 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_pads_count_include_pad/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_pads_count_include_pad/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads/model.onnx index 08b517aabfe..56e616d6457 100644 Binary files 
a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads_count_include_pad/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads_count_include_pad/model.onnx index bdd8f23928c..2c4c4d66758 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads_count_include_pad/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_pads_count_include_pad/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_same_upper/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_same_upper/model.onnx index c02c68b069a..413ec8d0ba5 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_same_upper/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_same_upper/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_strides/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_strides/model.onnx index d1d510ea42f..b77b84e4686 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_precomputed_strides/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_precomputed_strides/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_same_lower/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_same_lower/model.onnx index eba0a5ff98e..75cfab55e13 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_same_lower/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_same_lower/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_same_upper/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_same_upper/model.onnx index 2b3f8850c26..102b40fc388 100644 Binary files 
a/onnx/backend/test/data/node/test_averagepool_2d_same_upper/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_same_upper/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_2d_strides/model.onnx b/onnx/backend/test/data/node/test_averagepool_2d_strides/model.onnx index 45920377ed5..71f2ba21560 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_2d_strides/model.onnx and b/onnx/backend/test/data/node/test_averagepool_2d_strides/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_3d_default/model.onnx b/onnx/backend/test/data/node/test_averagepool_3d_default/model.onnx index 638a15930cd..4d1bcc84eb0 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_3d_default/model.onnx and b/onnx/backend/test/data/node/test_averagepool_3d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_False/model.onnx b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_False/model.onnx index a46212cff76..7a757250d57 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_False/model.onnx and b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_False/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_True/model.onnx b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_True/model.onnx index 2354fbfb7b1..894e85fdd48 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_True/model.onnx and b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_True/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_False/model.onnx b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_False/model.onnx index 79a25a22420..eafe7769f21 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_False/model.onnx and b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_False/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True/model.onnx b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True/model.onnx index 310be73f538..27f6fb1f595 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True/model.onnx and b/onnx/backend/test/data/node/test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True/model.onnx differ diff --git a/onnx/backend/test/data/node/test_averagepool_3d_dilations_small/model.onnx b/onnx/backend/test/data/node/test_averagepool_3d_dilations_small/model.onnx index 7a36e26fce9..883cdbefa60 100644 Binary files a/onnx/backend/test/data/node/test_averagepool_3d_dilations_small/model.onnx and b/onnx/backend/test/data/node/test_averagepool_3d_dilations_small/model.onnx differ diff --git a/onnx/backend/test/data/node/test_basic_conv_with_padding/model.onnx b/onnx/backend/test/data/node/test_basic_conv_with_padding/model.onnx index 061490a61a9..e7c3e41f3cf 100644 Binary files a/onnx/backend/test/data/node/test_basic_conv_with_padding/model.onnx and b/onnx/backend/test/data/node/test_basic_conv_with_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_basic_conv_without_padding/model.onnx b/onnx/backend/test/data/node/test_basic_conv_without_padding/model.onnx index 
09acab9dcf7..dbdb55f308b 100644 Binary files a/onnx/backend/test/data/node/test_basic_conv_without_padding/model.onnx and b/onnx/backend/test/data/node/test_basic_conv_without_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_basic_deform_conv_with_padding/model.onnx b/onnx/backend/test/data/node/test_basic_deform_conv_with_padding/model.onnx index 2d040f02c9c..7729eeb1f26 100644 Binary files a/onnx/backend/test/data/node/test_basic_deform_conv_with_padding/model.onnx and b/onnx/backend/test/data/node/test_basic_deform_conv_with_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_basic_deform_conv_without_padding/model.onnx b/onnx/backend/test/data/node/test_basic_deform_conv_without_padding/model.onnx index 5cbe5291d4c..1767e752c88 100644 Binary files a/onnx/backend/test/data/node/test_basic_deform_conv_without_padding/model.onnx and b/onnx/backend/test/data/node/test_basic_deform_conv_without_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_bernoulli/model.onnx b/onnx/backend/test/data/node/test_bernoulli/model.onnx index 47b791d6885..702c9960fd4 100644 Binary files a/onnx/backend/test/data/node/test_bernoulli/model.onnx and b/onnx/backend/test/data/node/test_bernoulli/model.onnx differ diff --git a/onnx/backend/test/data/node/test_bernoulli_double/model.onnx b/onnx/backend/test/data/node/test_bernoulli_double/model.onnx index f954a8dc57f..929e4196e3b 100644 Binary files a/onnx/backend/test/data/node/test_bernoulli_double/model.onnx and b/onnx/backend/test/data/node/test_bernoulli_double/model.onnx differ diff --git a/onnx/backend/test/data/node/test_bernoulli_double_expanded/model.onnx b/onnx/backend/test/data/node/test_bernoulli_double_expanded/model.onnx index fcf50609026..558e38225b0 100644 Binary files a/onnx/backend/test/data/node/test_bernoulli_double_expanded/model.onnx and b/onnx/backend/test/data/node/test_bernoulli_double_expanded/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_bernoulli_expanded/model.onnx b/onnx/backend/test/data/node/test_bernoulli_expanded/model.onnx index 785d12f46e8..c510c81bf60 100644 Binary files a/onnx/backend/test/data/node/test_bernoulli_expanded/model.onnx and b/onnx/backend/test/data/node/test_bernoulli_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_bernoulli_seed/model.onnx b/onnx/backend/test/data/node/test_bernoulli_seed/model.onnx index 85f1b82b5e2..ea69f01240e 100644 Binary files a/onnx/backend/test/data/node/test_bernoulli_seed/model.onnx and b/onnx/backend/test/data/node/test_bernoulli_seed/model.onnx differ diff --git a/onnx/backend/test/data/node/test_bernoulli_seed_expanded/model.onnx b/onnx/backend/test/data/node/test_bernoulli_seed_expanded/model.onnx index e195a6ce93c..3236cbfec81 100644 Binary files a/onnx/backend/test/data/node/test_bernoulli_seed_expanded/model.onnx and b/onnx/backend/test/data/node/test_bernoulli_seed_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_conv_with_autopad_same/model.onnx b/onnx/backend/test/data/node/test_conv_with_autopad_same/model.onnx index 4033a6b6938..47dc4a25705 100644 Binary files a/onnx/backend/test/data/node/test_conv_with_autopad_same/model.onnx and b/onnx/backend/test/data/node/test_conv_with_autopad_same/model.onnx differ diff --git a/onnx/backend/test/data/node/test_conv_with_strides_and_asymmetric_padding/model.onnx b/onnx/backend/test/data/node/test_conv_with_strides_and_asymmetric_padding/model.onnx index 144514526de..cfa8ad01787 100644 Binary files a/onnx/backend/test/data/node/test_conv_with_strides_and_asymmetric_padding/model.onnx and b/onnx/backend/test/data/node/test_conv_with_strides_and_asymmetric_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_conv_with_strides_no_padding/model.onnx b/onnx/backend/test/data/node/test_conv_with_strides_no_padding/model.onnx index 1423e48c486..d4d1961718c 100644 Binary files 
a/onnx/backend/test/data/node/test_conv_with_strides_no_padding/model.onnx and b/onnx/backend/test/data/node/test_conv_with_strides_no_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_conv_with_strides_padding/model.onnx b/onnx/backend/test/data/node/test_conv_with_strides_padding/model.onnx index 1282c6bdea6..adb8521473f 100644 Binary files a/onnx/backend/test/data/node/test_conv_with_strides_padding/model.onnx and b/onnx/backend/test/data/node/test_conv_with_strides_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose/model.onnx b/onnx/backend/test/data/node/test_convtranspose/model.onnx index 3720252596a..302c7619ed1 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose/model.onnx and b/onnx/backend/test/data/node/test_convtranspose/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_1d/model.onnx b/onnx/backend/test/data/node/test_convtranspose_1d/model.onnx index e703997baf7..a1b9bcabd41 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_1d/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_1d/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_3d/model.onnx b/onnx/backend/test/data/node/test_convtranspose_3d/model.onnx index ebdba71bf6a..44bfb7b2995 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_3d/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_3d/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_autopad_same/model.onnx b/onnx/backend/test/data/node/test_convtranspose_autopad_same/model.onnx index b65ac368787..cce8a8eccef 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_autopad_same/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_autopad_same/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_dilations/model.onnx 
b/onnx/backend/test/data/node/test_convtranspose_dilations/model.onnx index 80779a90263..58e922a6b3d 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_dilations/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_dilations/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_kernel_shape/model.onnx b/onnx/backend/test/data/node/test_convtranspose_kernel_shape/model.onnx index e0948814573..4d328e9cdf2 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_kernel_shape/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_kernel_shape/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_output_shape/model.onnx b/onnx/backend/test/data/node/test_convtranspose_output_shape/model.onnx index 344a21a8c3c..8b9b86216a3 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_output_shape/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_output_shape/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_pad/model.onnx b/onnx/backend/test/data/node/test_convtranspose_pad/model.onnx index bb593e66f07..b972e2653e2 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_pad/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_pad/model.onnx differ diff --git a/onnx/backend/test/data/node/test_convtranspose_pads/model.onnx b/onnx/backend/test/data/node/test_convtranspose_pads/model.onnx index b17ff879e67..1e7116447cb 100644 Binary files a/onnx/backend/test/data/node/test_convtranspose_pads/model.onnx and b/onnx/backend/test/data/node/test_convtranspose_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_cos/model.onnx b/onnx/backend/test/data/node/test_cos/model.onnx index 5ec0d0c3247..9e1260acfac 100644 Binary files a/onnx/backend/test/data/node/test_cos/model.onnx and b/onnx/backend/test/data/node/test_cos/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_cos_example/model.onnx b/onnx/backend/test/data/node/test_cos_example/model.onnx index 6726ce0230a..b74fcbe66e2 100644 Binary files a/onnx/backend/test/data/node/test_cos_example/model.onnx and b/onnx/backend/test/data/node/test_cos_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_cosh/model.onnx b/onnx/backend/test/data/node/test_cosh/model.onnx index 68d88a42c03..e74ac53653e 100644 Binary files a/onnx/backend/test/data/node/test_cosh/model.onnx and b/onnx/backend/test/data/node/test_cosh/model.onnx differ diff --git a/onnx/backend/test/data/node/test_cosh_example/model.onnx b/onnx/backend/test/data/node/test_cosh_example/model.onnx index db9eec4d051..7dc7ab4dc5f 100644 Binary files a/onnx/backend/test/data/node/test_cosh_example/model.onnx and b/onnx/backend/test/data/node/test_cosh_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_deform_conv_with_mask_bias/model.onnx b/onnx/backend/test/data/node/test_deform_conv_with_mask_bias/model.onnx index 3f5948c98a2..a48939459b2 100644 Binary files a/onnx/backend/test/data/node/test_deform_conv_with_mask_bias/model.onnx and b/onnx/backend/test/data/node/test_deform_conv_with_mask_bias/model.onnx differ diff --git a/onnx/backend/test/data/node/test_deform_conv_with_multiple_offset_groups/model.onnx b/onnx/backend/test/data/node/test_deform_conv_with_multiple_offset_groups/model.onnx index 2adbaa41881..bb2b554d5b0 100644 Binary files a/onnx/backend/test/data/node/test_deform_conv_with_multiple_offset_groups/model.onnx and b/onnx/backend/test/data/node/test_deform_conv_with_multiple_offset_groups/model.onnx differ diff --git a/onnx/backend/test/data/node/test_det_2d/model.onnx b/onnx/backend/test/data/node/test_det_2d/model.onnx index c1ceae61c0e..b2f1c52802f 100644 Binary files a/onnx/backend/test/data/node/test_det_2d/model.onnx and b/onnx/backend/test/data/node/test_det_2d/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_det_nd/model.onnx b/onnx/backend/test/data/node/test_det_nd/model.onnx index 07a153a1c58..26d4c871da4 100644 Binary files a/onnx/backend/test/data/node/test_det_nd/model.onnx and b/onnx/backend/test/data/node/test_det_nd/model.onnx differ diff --git a/onnx/backend/test/data/node/test_dropout_default/model.onnx b/onnx/backend/test/data/node/test_dropout_default/model.onnx index 184e4485190..be75e7dbb46 100644 Binary files a/onnx/backend/test/data/node/test_dropout_default/model.onnx and b/onnx/backend/test/data/node/test_dropout_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_dropout_default_mask/model.onnx b/onnx/backend/test/data/node/test_dropout_default_mask/model.onnx index ef7b9fa8e33..f45cd7a6080 100644 Binary files a/onnx/backend/test/data/node/test_dropout_default_mask/model.onnx and b/onnx/backend/test/data/node/test_dropout_default_mask/model.onnx differ diff --git a/onnx/backend/test/data/node/test_dropout_default_mask_ratio/model.onnx b/onnx/backend/test/data/node/test_dropout_default_mask_ratio/model.onnx index 5b1f0707495..343482d6c83 100644 Binary files a/onnx/backend/test/data/node/test_dropout_default_mask_ratio/model.onnx and b/onnx/backend/test/data/node/test_dropout_default_mask_ratio/model.onnx differ diff --git a/onnx/backend/test/data/node/test_dropout_default_ratio/model.onnx b/onnx/backend/test/data/node/test_dropout_default_ratio/model.onnx index 8deb0febfd3..3f079ba4621 100644 Binary files a/onnx/backend/test/data/node/test_dropout_default_ratio/model.onnx and b/onnx/backend/test/data/node/test_dropout_default_ratio/model.onnx differ diff --git a/onnx/backend/test/data/node/test_elu/model.onnx b/onnx/backend/test/data/node/test_elu/model.onnx index 141d895bba6..dd1003b7c4c 100644 Binary files a/onnx/backend/test/data/node/test_elu/model.onnx and b/onnx/backend/test/data/node/test_elu/model.onnx differ diff --git a/onnx/backend/test/data/node/test_elu_default/model.onnx 
b/onnx/backend/test/data/node/test_elu_default/model.onnx index 4007dc33688..3a979710e02 100644 Binary files a/onnx/backend/test/data/node/test_elu_default/model.onnx and b/onnx/backend/test/data/node/test_elu_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_elu_example/model.onnx b/onnx/backend/test/data/node/test_elu_example/model.onnx index 9ea0c1df665..99091bb5134 100644 Binary files a/onnx/backend/test/data/node/test_elu_example/model.onnx and b/onnx/backend/test/data/node/test_elu_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_eyelike_populate_off_main_diagonal/model.onnx b/onnx/backend/test/data/node/test_eyelike_populate_off_main_diagonal/model.onnx index 0b98de4160d..60a068b8e20 100644 Binary files a/onnx/backend/test/data/node/test_eyelike_populate_off_main_diagonal/model.onnx and b/onnx/backend/test/data/node/test_eyelike_populate_off_main_diagonal/model.onnx differ diff --git a/onnx/backend/test/data/node/test_eyelike_with_dtype/model.onnx b/onnx/backend/test/data/node/test_eyelike_with_dtype/model.onnx index ca799fb505d..8599bfdb16a 100644 Binary files a/onnx/backend/test/data/node/test_eyelike_with_dtype/model.onnx and b/onnx/backend/test/data/node/test_eyelike_with_dtype/model.onnx differ diff --git a/onnx/backend/test/data/node/test_eyelike_without_dtype/model.onnx b/onnx/backend/test/data/node/test_eyelike_without_dtype/model.onnx index 6c7db0b132a..584a3328605 100644 Binary files a/onnx/backend/test/data/node/test_eyelike_without_dtype/model.onnx and b/onnx/backend/test/data/node/test_eyelike_without_dtype/model.onnx differ diff --git a/onnx/backend/test/data/node/test_globalaveragepool/model.onnx b/onnx/backend/test/data/node/test_globalaveragepool/model.onnx index f270439c98f..1441d1ec365 100644 Binary files a/onnx/backend/test/data/node/test_globalaveragepool/model.onnx and b/onnx/backend/test/data/node/test_globalaveragepool/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_globalaveragepool_precomputed/model.onnx b/onnx/backend/test/data/node/test_globalaveragepool_precomputed/model.onnx index e2100fe8ce6..de1544e9697 100644 Binary files a/onnx/backend/test/data/node/test_globalaveragepool_precomputed/model.onnx and b/onnx/backend/test/data/node/test_globalaveragepool_precomputed/model.onnx differ diff --git a/onnx/backend/test/data/node/test_globalmaxpool/model.onnx b/onnx/backend/test/data/node/test_globalmaxpool/model.onnx index d641ee8bddb..007fe816ba5 100644 Binary files a/onnx/backend/test/data/node/test_globalmaxpool/model.onnx and b/onnx/backend/test/data/node/test_globalmaxpool/model.onnx differ diff --git a/onnx/backend/test/data/node/test_globalmaxpool_precomputed/model.onnx b/onnx/backend/test/data/node/test_globalmaxpool_precomputed/model.onnx index d1cac7c23be..69ac33125c2 100644 Binary files a/onnx/backend/test/data/node/test_globalmaxpool_precomputed/model.onnx and b/onnx/backend/test/data/node/test_globalmaxpool_precomputed/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample/model.onnx b/onnx/backend/test/data/node/test_gridsample/model.onnx index 334b8cca0ab..1d17074133e 100644 Binary files a/onnx/backend/test/data/node/test_gridsample/model.onnx and b/onnx/backend/test/data/node/test_gridsample/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_aligncorners_true/model.onnx b/onnx/backend/test/data/node/test_gridsample_aligncorners_true/model.onnx index 40b0e8b6fab..6def3c0df96 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_aligncorners_true/model.onnx and b/onnx/backend/test/data/node/test_gridsample_aligncorners_true/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_bicubic/model.onnx b/onnx/backend/test/data/node/test_gridsample_bicubic/model.onnx index 14cb8486881..307d6519556 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_bicubic/model.onnx and 
b/onnx/backend/test/data/node/test_gridsample_bicubic/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_0_additional_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_0_additional_1/model.onnx index ec7876ecf27..cf230353d27 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_0_additional_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_0_additional_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_1_additional_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_1_additional_1/model.onnx index 19f37d04523..a617350a9af 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_1_additional_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_bicubic_align_corners_1_additional_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_bilinear/model.onnx b/onnx/backend/test/data/node/test_gridsample_bilinear/model.onnx index af5a0847336..bb1b2ebd769 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_bilinear/model.onnx and b/onnx/backend/test/data/node/test_gridsample_bilinear/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_0_additional_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_0_additional_1/model.onnx index 188342495b8..07ac23f070b 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_0_additional_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_0_additional_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_1_additional_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_1_additional_1/model.onnx index e5686c3c37d..c17ad25317b 100644 
Binary files a/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_1_additional_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_bilinear_align_corners_1_additional_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_border_padding/model.onnx b/onnx/backend/test/data/node/test_gridsample_border_padding/model.onnx index 01fbcef9853..3c548c90ff6 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_border_padding/model.onnx and b/onnx/backend/test/data/node/test_gridsample_border_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_nearest/model.onnx b/onnx/backend/test/data/node/test_gridsample_nearest/model.onnx index bacb4086e0d..294090833db 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_nearest/model.onnx and b/onnx/backend/test/data/node/test_gridsample_nearest/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_0_additional_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_0_additional_1/model.onnx index 2003d009b6d..245dcbfba8a 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_0_additional_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_0_additional_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_1_additional_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_1_additional_1/model.onnx index 2a105639f0d..de22147ab35 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_1_additional_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_nearest_align_corners_1_additional_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_reflection_padding/model.onnx b/onnx/backend/test/data/node/test_gridsample_reflection_padding/model.onnx index 9c7f5cf9ab9..203f1434d34 
100644 Binary files a/onnx/backend/test/data/node/test_gridsample_reflection_padding/model.onnx and b/onnx/backend/test/data/node/test_gridsample_reflection_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_0/model.onnx b/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_0/model.onnx index 02d940fb147..7f90268223a 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_0/model.onnx and b/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_0/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_1/model.onnx index af0b03cd424..e3781ffb3b6 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_volumetric_bilinear_align_corners_1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_0/model.onnx b/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_0/model.onnx index d87cce0017b..706252dc073 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_0/model.onnx and b/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_0/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_1/model.onnx b/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_1/model.onnx index dd789c06a01..2eb750c678e 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_1/model.onnx and b/onnx/backend/test/data/node/test_gridsample_volumetric_nearest_align_corners_1/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_gridsample_zeros_padding/model.onnx b/onnx/backend/test/data/node/test_gridsample_zeros_padding/model.onnx index fb2b33369d9..7ed92ef5311 100644 Binary files a/onnx/backend/test/data/node/test_gridsample_zeros_padding/model.onnx and b/onnx/backend/test/data/node/test_gridsample_zeros_padding/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gru_batchwise/model.onnx b/onnx/backend/test/data/node/test_gru_batchwise/model.onnx index 8d50456cb68..6e8a50d56f3 100644 Binary files a/onnx/backend/test/data/node/test_gru_batchwise/model.onnx and b/onnx/backend/test/data/node/test_gru_batchwise/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gru_defaults/model.onnx b/onnx/backend/test/data/node/test_gru_defaults/model.onnx index 36fcba152af..07689ef4137 100644 Binary files a/onnx/backend/test/data/node/test_gru_defaults/model.onnx and b/onnx/backend/test/data/node/test_gru_defaults/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gru_seq_length/model.onnx b/onnx/backend/test/data/node/test_gru_seq_length/model.onnx index 2c9c2489c53..5f0369e7530 100644 Binary files a/onnx/backend/test/data/node/test_gru_seq_length/model.onnx and b/onnx/backend/test/data/node/test_gru_seq_length/model.onnx differ diff --git a/onnx/backend/test/data/node/test_gru_with_initial_bias/model.onnx b/onnx/backend/test/data/node/test_gru_with_initial_bias/model.onnx index db0c7de9456..d87b797d815 100644 Binary files a/onnx/backend/test/data/node/test_gru_with_initial_bias/model.onnx and b/onnx/backend/test/data/node/test_gru_with_initial_bias/model.onnx differ diff --git a/onnx/backend/test/data/node/test_hardsigmoid/model.onnx b/onnx/backend/test/data/node/test_hardsigmoid/model.onnx index b330b5ce914..5070351948d 100644 Binary files a/onnx/backend/test/data/node/test_hardsigmoid/model.onnx and b/onnx/backend/test/data/node/test_hardsigmoid/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_hardsigmoid_default/model.onnx b/onnx/backend/test/data/node/test_hardsigmoid_default/model.onnx index 2672c13398e..d0e57f315b1 100644 Binary files a/onnx/backend/test/data/node/test_hardsigmoid_default/model.onnx and b/onnx/backend/test/data/node/test_hardsigmoid_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_hardsigmoid_example/model.onnx b/onnx/backend/test/data/node/test_hardsigmoid_example/model.onnx index 4c1564a09ef..ffe09d493db 100644 Binary files a/onnx/backend/test/data/node/test_hardsigmoid_example/model.onnx and b/onnx/backend/test/data/node/test_hardsigmoid_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_hardswish/model.onnx b/onnx/backend/test/data/node/test_hardswish/model.onnx index f72506caa0f..0ed0b321208 100644 Binary files a/onnx/backend/test/data/node/test_hardswish/model.onnx and b/onnx/backend/test/data/node/test_hardswish/model.onnx differ diff --git a/onnx/backend/test/data/node/test_hardswish_expanded/model.onnx b/onnx/backend/test/data/node/test_hardswish_expanded/model.onnx index ecfe4f31ba5..abe72af6be2 100644 Binary files a/onnx/backend/test/data/node/test_hardswish_expanded/model.onnx and b/onnx/backend/test/data/node/test_hardswish_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_image_decoder_decode_jpeg2k_rgb/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_image_decoder_decode_jpeg2k_rgb/test_data_set_0/input_0.pb index 66db10c971f..5f4cb90f28a 100644 Binary files a/onnx/backend/test/data/node/test_image_decoder_decode_jpeg2k_rgb/test_data_set_0/input_0.pb and b/onnx/backend/test/data/node/test_image_decoder_decode_jpeg2k_rgb/test_data_set_0/input_0.pb differ diff --git a/onnx/backend/test/data/node/test_instancenorm_epsilon/model.onnx b/onnx/backend/test/data/node/test_instancenorm_epsilon/model.onnx index d6ac084c975..9cf84c252d0 100644 Binary files 
a/onnx/backend/test/data/node/test_instancenorm_epsilon/model.onnx and b/onnx/backend/test/data/node/test_instancenorm_epsilon/model.onnx differ diff --git a/onnx/backend/test/data/node/test_instancenorm_example/model.onnx b/onnx/backend/test/data/node/test_instancenorm_example/model.onnx index 37a901970a2..ae190ddd654 100644 Binary files a/onnx/backend/test/data/node/test_instancenorm_example/model.onnx and b/onnx/backend/test/data/node/test_instancenorm_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_1d_default/model.onnx b/onnx/backend/test/data/node/test_lppool_1d_default/model.onnx index 8734aa23c9c..d3d882afbed 100644 Binary files a/onnx/backend/test/data/node/test_lppool_1d_default/model.onnx and b/onnx/backend/test/data/node/test_lppool_1d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_2d_default/model.onnx b/onnx/backend/test/data/node/test_lppool_2d_default/model.onnx index 6d8e9eef7f9..0cd55f03c51 100644 Binary files a/onnx/backend/test/data/node/test_lppool_2d_default/model.onnx and b/onnx/backend/test/data/node/test_lppool_2d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_2d_dilations/model.onnx b/onnx/backend/test/data/node/test_lppool_2d_dilations/model.onnx index 503ccff12b9..7970480a7df 100644 Binary files a/onnx/backend/test/data/node/test_lppool_2d_dilations/model.onnx and b/onnx/backend/test/data/node/test_lppool_2d_dilations/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_2d_pads/model.onnx b/onnx/backend/test/data/node/test_lppool_2d_pads/model.onnx index 73121feab9b..4d01b667e63 100644 Binary files a/onnx/backend/test/data/node/test_lppool_2d_pads/model.onnx and b/onnx/backend/test/data/node/test_lppool_2d_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_2d_same_lower/model.onnx b/onnx/backend/test/data/node/test_lppool_2d_same_lower/model.onnx index dbc14bf1a9b..00afca88cc1 100644 Binary files 
a/onnx/backend/test/data/node/test_lppool_2d_same_lower/model.onnx and b/onnx/backend/test/data/node/test_lppool_2d_same_lower/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_2d_same_upper/model.onnx b/onnx/backend/test/data/node/test_lppool_2d_same_upper/model.onnx index 3216a2a2936..108b05ae157 100644 Binary files a/onnx/backend/test/data/node/test_lppool_2d_same_upper/model.onnx and b/onnx/backend/test/data/node/test_lppool_2d_same_upper/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_2d_strides/model.onnx b/onnx/backend/test/data/node/test_lppool_2d_strides/model.onnx index fa7064f299b..0ddc7b602cb 100644 Binary files a/onnx/backend/test/data/node/test_lppool_2d_strides/model.onnx and b/onnx/backend/test/data/node/test_lppool_2d_strides/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lppool_3d_default/model.onnx b/onnx/backend/test/data/node/test_lppool_3d_default/model.onnx index e8e653d2838..becb512e64d 100644 Binary files a/onnx/backend/test/data/node/test_lppool_3d_default/model.onnx and b/onnx/backend/test/data/node/test_lppool_3d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lstm_batchwise/model.onnx b/onnx/backend/test/data/node/test_lstm_batchwise/model.onnx index a10fa1840df..29284301f78 100644 Binary files a/onnx/backend/test/data/node/test_lstm_batchwise/model.onnx and b/onnx/backend/test/data/node/test_lstm_batchwise/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lstm_defaults/model.onnx b/onnx/backend/test/data/node/test_lstm_defaults/model.onnx index 7e6a4af6806..a7bcc594141 100644 Binary files a/onnx/backend/test/data/node/test_lstm_defaults/model.onnx and b/onnx/backend/test/data/node/test_lstm_defaults/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lstm_with_initial_bias/model.onnx b/onnx/backend/test/data/node/test_lstm_with_initial_bias/model.onnx index 1574652ffda..caba9ced475 100644 Binary files 
a/onnx/backend/test/data/node/test_lstm_with_initial_bias/model.onnx and b/onnx/backend/test/data/node/test_lstm_with_initial_bias/model.onnx differ diff --git a/onnx/backend/test/data/node/test_lstm_with_peepholes/model.onnx b/onnx/backend/test/data/node/test_lstm_with_peepholes/model.onnx index c5e3f974c4f..21a905342a2 100644 Binary files a/onnx/backend/test/data/node/test_lstm_with_peepholes/model.onnx and b/onnx/backend/test/data/node/test_lstm_with_peepholes/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_1d_default/model.onnx b/onnx/backend/test/data/node/test_maxpool_1d_default/model.onnx index d72de35c8e7..dba3be8456f 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_1d_default/model.onnx and b/onnx/backend/test/data/node/test_maxpool_1d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_ceil/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_ceil/model.onnx index 7515f602fcb..4bb1e32b06d 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_ceil/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_ceil/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_ceil_output_size_reduce_by_one/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_ceil_output_size_reduce_by_one/model.onnx index 0c136bf91b9..c0b1842bfed 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_ceil_output_size_reduce_by_one/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_ceil_output_size_reduce_by_one/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_default/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_default/model.onnx index 820d4fdc4c4..64c4aff7171 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_default/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_dilations/model.onnx 
b/onnx/backend/test/data/node/test_maxpool_2d_dilations/model.onnx index 19140d87016..3c304ff4aa4 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_dilations/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_dilations/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_pads/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_pads/model.onnx index a917b6a9094..98cbdbe0cc2 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_pads/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_precomputed_pads/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_precomputed_pads/model.onnx index 656f49679ed..331007ddad9 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_precomputed_pads/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_precomputed_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_precomputed_same_upper/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_precomputed_same_upper/model.onnx index dc39dd1813f..94ab65992fc 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_precomputed_same_upper/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_precomputed_same_upper/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_precomputed_strides/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_precomputed_strides/model.onnx index 53d8d72a344..3175fabc99f 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_precomputed_strides/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_precomputed_strides/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_same_lower/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_same_lower/model.onnx index 42cd27dcbe0..d972e13bfab 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_same_lower/model.onnx and 
b/onnx/backend/test/data/node/test_maxpool_2d_same_lower/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_same_upper/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_same_upper/model.onnx index b282f64679d..0a4514be963 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_same_upper/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_same_upper/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_strides/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_strides/model.onnx index 10a2ac5f1dc..ad67b18f51a 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_strides/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_strides/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_2d_uint8/model.onnx b/onnx/backend/test/data/node/test_maxpool_2d_uint8/model.onnx index f3e38809441..8f4fe785561 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_2d_uint8/model.onnx and b/onnx/backend/test/data/node/test_maxpool_2d_uint8/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_3d_default/model.onnx b/onnx/backend/test/data/node/test_maxpool_3d_default/model.onnx index 15d6e4c265c..c4e27bb94eb 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_3d_default/model.onnx and b/onnx/backend/test/data/node/test_maxpool_3d_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_3d_dilations/model.onnx b/onnx/backend/test/data/node/test_maxpool_3d_dilations/model.onnx index 7b2bb587f78..f9e9844cb8e 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_3d_dilations/model.onnx and b/onnx/backend/test/data/node/test_maxpool_3d_dilations/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl/model.onnx b/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl/model.onnx index b1e4d1e0f6a..381c4957c39 100644 Binary files 
a/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl/model.onnx and b/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl_large/model.onnx b/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl_large/model.onnx index 3f27feed751..9509ad6e4a9 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl_large/model.onnx and b/onnx/backend/test/data/node/test_maxpool_3d_dilations_use_ref_impl_large/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_pads/model.onnx b/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_pads/model.onnx index 352bd92939a..3abdc840c54 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_pads/model.onnx and b/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_pads/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_strides/model.onnx b/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_strides/model.onnx index 2d1d01124b6..a67af2fbf6b 100644 Binary files a/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_strides/model.onnx and b/onnx/backend/test/data/node/test_maxpool_with_argmax_2d_precomputed_strides/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxunpool_export_with_output_shape/model.onnx b/onnx/backend/test/data/node/test_maxunpool_export_with_output_shape/model.onnx index 117892ddba7..04673fcc00e 100644 Binary files a/onnx/backend/test/data/node/test_maxunpool_export_with_output_shape/model.onnx and b/onnx/backend/test/data/node/test_maxunpool_export_with_output_shape/model.onnx differ diff --git a/onnx/backend/test/data/node/test_maxunpool_export_without_output_shape/model.onnx 
b/onnx/backend/test/data/node/test_maxunpool_export_without_output_shape/model.onnx index afc56ebff14..38cc84cc23c 100644 Binary files a/onnx/backend/test/data/node/test_maxunpool_export_without_output_shape/model.onnx and b/onnx/backend/test/data/node/test_maxunpool_export_without_output_shape/model.onnx differ diff --git a/onnx/backend/test/data/node/test_mish/model.onnx b/onnx/backend/test/data/node/test_mish/model.onnx index a2218ac41b3..9b8fb3a8fb2 100644 Binary files a/onnx/backend/test/data/node/test_mish/model.onnx and b/onnx/backend/test/data/node/test_mish/model.onnx differ diff --git a/onnx/backend/test/data/node/test_mish_expanded/model.onnx b/onnx/backend/test/data/node/test_mish_expanded/model.onnx index 8e8f56c1944..465033dec4e 100644 Binary files a/onnx/backend/test/data/node/test_mish_expanded/model.onnx and b/onnx/backend/test/data/node/test_mish_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NC/model.onnx b/onnx/backend/test/data/node/test_nllloss_NC/model.onnx index 53aec2ac361..0d5f13841d9 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NC/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NC/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NC_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NC_expanded/model.onnx index fc00ac93ebe..0a13d6ffcbc 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NC_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NC_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1/model.onnx index 470693beb69..189dc6c3c00 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_expanded/model.onnx 
index 20d2ead61dd..d63471102cd 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_ii/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_ii/model.onnx index c855c5a81b3..b823ccbb2fb 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_ii_expanded/model.onnx index 34ee1d47465..c75e4ea1f78 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii/model.onnx index 31bd38f307b..e9d99c82bff 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii_expanded/model.onnx index b1693806e20..6f6a3f21ff2 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_mean_weight_negative_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_weight/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_weight/model.onnx index 360bc70a91c..902f9a0d0b0 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_weight/model.onnx and 
b/onnx/backend/test/data/node/test_nllloss_NCd1_weight/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_weight_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_weight_expanded/model.onnx index 11bcc65caf9..2ec9919837f 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_weight_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_weight_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii/model.onnx index 086297336b6..425f977a785 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii_expanded/model.onnx index 49017ae3158..4f85c65e976 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1_weight_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2/model.onnx index 76f4cfaa31e..b91f1dfac1a 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_expanded/model.onnx index 72777a5c1f1..e07381d064d 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii/model.onnx 
b/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii/model.onnx index 50b3c48cebc..d9bc385e3a6 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded/model.onnx index b07bb7d978d..59ee8e5d8a7 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean/model.onnx index f47a0d141e2..fd66cfa636a 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean_expanded/model.onnx index f8a3754b0a7..4cc60dbe042 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_mean_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum/model.onnx index 36b94d996e3..d91ed9a8371 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum_expanded/model.onnx index 447cca891c9..e2899d978ae 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_reduction_sum_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight/model.onnx index afeef04cc45..1fbc64e773b 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_expanded/model.onnx index bb187cea67d..5b9611af54a 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean/model.onnx index 19ae33e1f2e..5e928bfc0e5 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean_expanded/model.onnx index 50f66085ace..d8bd66f76a2 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean_expanded/model.onnx and 
b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_mean_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum/model.onnx index fcc48702c96..50d52ec64b8 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_expanded/model.onnx index 06e5b65e6d0..4dd5a25b1fc 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii/model.onnx index 30e926e943b..5fb3c75bfe3 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded/model.onnx index acc84e0bef1..31c93d17e97 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii/model.onnx 
b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii/model.onnx index f1022a4bd9f..f7f4ff93df4 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded/model.onnx index 4f236df7a8c..0416417bfd7 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii/model.onnx index 8bb7c542065..eebc8c5b7f1 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded/model.onnx index ea7d7177f20..4d8c1c76e33 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight/model.onnx index 248908917d7..178faf552df 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight/model.onnx and 
b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight_expanded/model.onnx index db7aea4c097..ede9e00357a 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_mean_weight_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight/model.onnx index e32ea857f6d..07005b6d702 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight/model.onnx differ diff --git a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded/model.onnx b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded/model.onnx index e919d1d37c2..89543e5650c 100644 Binary files a/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded/model.onnx and b/onnx/backend/test/data/node/test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded/model.onnx differ diff --git a/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random/test_data_set_0/output_0.pb index d84c5d549e8..10171914572 100644 --- a/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random/test_data_set_0/output_0.pb @@ -1,2 +1,2 @@ - BreducedJ0«eMIzÈ@¥&~X@}îäUŠéå¿Žc;éù^@n6œýœ"@ˆß + BreducedJ0«eMIzÈ@¥&~X@{îäUŠéå¿Žc;éù^@n6œýœ"@‰ß 1ä? 
\ No newline at end of file diff --git a/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random_expanded/test_data_set_0/output_0.pb index d84c5d549e8..10171914572 100644 --- a/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random_expanded/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_reduce_log_sum_exp_do_not_keepdims_random_expanded/test_data_set_0/output_0.pb @@ -1,2 +1,2 @@ - BreducedJ0«eMIzÈ@¥&~X@}îäUŠéå¿Žc;éù^@n6œýœ"@ˆß + BreducedJ0«eMIzÈ@¥&~X@{îäUŠéå¿Žc;éù^@n6œýœ"@‰ß 1ä? \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random/test_data_set_0/output_0.pb index e54345fc40d..f35be21b4bb 100644 --- a/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random/test_data_set_0/output_0.pb @@ -1,2 +1,2 @@ - BreducedJ0«eMIzÈ@¥&~X@}îäUŠéå¿Žc;éù^@n6œýœ"@ˆß + BreducedJ0«eMIzÈ@¥&~X@{îäUŠéå¿Žc;éù^@n6œýœ"@‰ß 1ä? \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random_expanded/test_data_set_0/output_0.pb index e54345fc40d..f35be21b4bb 100644 --- a/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random_expanded/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_reduce_log_sum_exp_keepdims_random_expanded/test_data_set_0/output_0.pb @@ -1,2 +1,2 @@ - BreducedJ0«eMIzÈ@¥&~X@}îäUŠéå¿Žc;éù^@n6œýœ"@ˆß + BreducedJ0«eMIzÈ@¥&~X@{îäUŠéå¿Žc;éù^@n6œýœ"@‰ß 1ä? 
\ No newline at end of file diff --git a/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random/test_data_set_0/output_0.pb index e54345fc40d..f35be21b4bb 100644 --- a/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random/test_data_set_0/output_0.pb @@ -1,2 +1,2 @@ - BreducedJ0«eMIzÈ@¥&~X@}îäUŠéå¿Žc;éù^@n6œýœ"@ˆß + BreducedJ0«eMIzÈ@¥&~X@{îäUŠéå¿Žc;éù^@n6œýœ"@‰ß 1ä? \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random_expanded/test_data_set_0/output_0.pb index e54345fc40d..f35be21b4bb 100644 --- a/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random_expanded/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_reduce_log_sum_exp_negative_axes_keepdims_random_expanded/test_data_set_0/output_0.pb @@ -1,2 +1,2 @@ - BreducedJ0«eMIzÈ@¥&~X@}îäUŠéå¿Žc;éù^@n6œýœ"@ˆß + BreducedJ0«eMIzÈ@¥&~X@{îäUŠéå¿Žc;éù^@n6œýœ"@‰ß 1ä? 
\ No newline at end of file diff --git a/onnx/backend/test/data/node/test_rnn_seq_length/model.onnx b/onnx/backend/test/data/node/test_rnn_seq_length/model.onnx index e8273e546cd..7a025009f9b 100644 Binary files a/onnx/backend/test/data/node/test_rnn_seq_length/model.onnx and b/onnx/backend/test/data/node/test_rnn_seq_length/model.onnx differ diff --git a/onnx/backend/test/data/node/test_roialign_aligned_false/model.onnx b/onnx/backend/test/data/node/test_roialign_aligned_false/model.onnx index 36f9ea44fdf..cd9dfe7c2de 100644 Binary files a/onnx/backend/test/data/node/test_roialign_aligned_false/model.onnx and b/onnx/backend/test/data/node/test_roialign_aligned_false/model.onnx differ diff --git a/onnx/backend/test/data/node/test_roialign_aligned_true/model.onnx b/onnx/backend/test/data/node/test_roialign_aligned_true/model.onnx index 0626de8e12c..eb22ab1cbac 100644 Binary files a/onnx/backend/test/data/node/test_roialign_aligned_true/model.onnx and b/onnx/backend/test/data/node/test_roialign_aligned_true/model.onnx differ diff --git a/onnx/backend/test/data/node/test_roialign_mode_max/model.onnx b/onnx/backend/test/data/node/test_roialign_mode_max/model.onnx index c49bcbcebde..c27207af408 100644 Binary files a/onnx/backend/test/data/node/test_roialign_mode_max/model.onnx and b/onnx/backend/test/data/node/test_roialign_mode_max/model.onnx differ diff --git a/onnx/backend/test/data/node/test_round/model.onnx b/onnx/backend/test/data/node/test_round/model.onnx index 5551b49085d..385814c703d 100644 Binary files a/onnx/backend/test/data/node/test_round/model.onnx and b/onnx/backend/test/data/node/test_round/model.onnx differ diff --git a/onnx/backend/test/data/node/test_selu/model.onnx b/onnx/backend/test/data/node/test_selu/model.onnx index 969a340a741..364ae843613 100644 Binary files a/onnx/backend/test/data/node/test_selu/model.onnx and b/onnx/backend/test/data/node/test_selu/model.onnx differ diff --git a/onnx/backend/test/data/node/test_selu_default/model.onnx 
b/onnx/backend/test/data/node/test_selu_default/model.onnx index 72e2bd3f48a..32f7c62f4cd 100644 Binary files a/onnx/backend/test/data/node/test_selu_default/model.onnx and b/onnx/backend/test/data/node/test_selu_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_selu_example/model.onnx b/onnx/backend/test/data/node/test_selu_example/model.onnx index b611f4c795f..d822ec6a6a0 100644 Binary files a/onnx/backend/test/data/node/test_selu_example/model.onnx and b/onnx/backend/test/data/node/test_selu_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_simple_rnn_batchwise/model.onnx b/onnx/backend/test/data/node/test_simple_rnn_batchwise/model.onnx index 1427cacbe9a..8a52d04d730 100644 Binary files a/onnx/backend/test/data/node/test_simple_rnn_batchwise/model.onnx and b/onnx/backend/test/data/node/test_simple_rnn_batchwise/model.onnx differ diff --git a/onnx/backend/test/data/node/test_simple_rnn_defaults/model.onnx b/onnx/backend/test/data/node/test_simple_rnn_defaults/model.onnx index d2d636dde0d..b376c00d659 100644 Binary files a/onnx/backend/test/data/node/test_simple_rnn_defaults/model.onnx and b/onnx/backend/test/data/node/test_simple_rnn_defaults/model.onnx differ diff --git a/onnx/backend/test/data/node/test_simple_rnn_with_initial_bias/model.onnx b/onnx/backend/test/data/node/test_simple_rnn_with_initial_bias/model.onnx index c87dded2272..75707202c8d 100644 Binary files a/onnx/backend/test/data/node/test_simple_rnn_with_initial_bias/model.onnx and b/onnx/backend/test/data/node/test_simple_rnn_with_initial_bias/model.onnx differ diff --git a/onnx/backend/test/data/node/test_sin/model.onnx b/onnx/backend/test/data/node/test_sin/model.onnx index 724fc74d610..23fdf09faf5 100644 Binary files a/onnx/backend/test/data/node/test_sin/model.onnx and b/onnx/backend/test/data/node/test_sin/model.onnx differ diff --git a/onnx/backend/test/data/node/test_sin_example/model.onnx b/onnx/backend/test/data/node/test_sin_example/model.onnx 
index abdb5ae6e3e..a2678a54181 100644 Binary files a/onnx/backend/test/data/node/test_sin_example/model.onnx and b/onnx/backend/test/data/node/test_sin_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_sinh/model.onnx b/onnx/backend/test/data/node/test_sinh/model.onnx index 496db2d0821..54cb1d7cdc3 100644 Binary files a/onnx/backend/test/data/node/test_sinh/model.onnx and b/onnx/backend/test/data/node/test_sinh/model.onnx differ diff --git a/onnx/backend/test/data/node/test_sinh_example/model.onnx b/onnx/backend/test/data/node/test_sinh_example/model.onnx index 640cf0a01e8..b47d538826c 100644 Binary files a/onnx/backend/test/data/node/test_sinh_example/model.onnx and b/onnx/backend/test/data/node/test_sinh_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_softplus/model.onnx b/onnx/backend/test/data/node/test_softplus/model.onnx index 8134a2da809..4c2f549a258 100644 Binary files a/onnx/backend/test/data/node/test_softplus/model.onnx and b/onnx/backend/test/data/node/test_softplus/model.onnx differ diff --git a/onnx/backend/test/data/node/test_softplus_example/model.onnx b/onnx/backend/test/data/node/test_softplus_example/model.onnx index 71e49dca204..ce4dbff6348 100644 Binary files a/onnx/backend/test/data/node/test_softplus_example/model.onnx and b/onnx/backend/test/data/node/test_softplus_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_softsign/model.onnx b/onnx/backend/test/data/node/test_softsign/model.onnx index 8da75bcf950..08a665a020b 100644 Binary files a/onnx/backend/test/data/node/test_softsign/model.onnx and b/onnx/backend/test/data/node/test_softsign/model.onnx differ diff --git a/onnx/backend/test/data/node/test_softsign_example/model.onnx b/onnx/backend/test/data/node/test_softsign_example/model.onnx index 215aa383289..08202cbeeb3 100644 Binary files a/onnx/backend/test/data/node/test_softsign_example/model.onnx and b/onnx/backend/test/data/node/test_softsign_example/model.onnx differ 
diff --git a/onnx/backend/test/data/node/test_tan/model.onnx b/onnx/backend/test/data/node/test_tan/model.onnx index 33166fd26ce..3d0edfc0314 100644 Binary files a/onnx/backend/test/data/node/test_tan/model.onnx and b/onnx/backend/test/data/node/test_tan/model.onnx differ diff --git a/onnx/backend/test/data/node/test_tan_example/model.onnx b/onnx/backend/test/data/node/test_tan_example/model.onnx index 36bb4a7a285..f03792d990b 100644 Binary files a/onnx/backend/test/data/node/test_tan_example/model.onnx and b/onnx/backend/test/data/node/test_tan_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_thresholdedrelu/model.onnx b/onnx/backend/test/data/node/test_thresholdedrelu/model.onnx index 2e7eaf545e2..32d9a52c9b9 100644 Binary files a/onnx/backend/test/data/node/test_thresholdedrelu/model.onnx and b/onnx/backend/test/data/node/test_thresholdedrelu/model.onnx differ diff --git a/onnx/backend/test/data/node/test_thresholdedrelu_default/model.onnx b/onnx/backend/test/data/node/test_thresholdedrelu_default/model.onnx index 73ecc6235fa..bc486141ff9 100644 Binary files a/onnx/backend/test/data/node/test_thresholdedrelu_default/model.onnx and b/onnx/backend/test/data/node/test_thresholdedrelu_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_thresholdedrelu_example/model.onnx b/onnx/backend/test/data/node/test_thresholdedrelu_example/model.onnx index 3a4df8b2521..9745441bd2e 100644 Binary files a/onnx/backend/test/data/node/test_thresholdedrelu_example/model.onnx and b/onnx/backend/test/data/node/test_thresholdedrelu_example/model.onnx differ diff --git a/onnx/backend/test/data/node/test_training_dropout/model.onnx b/onnx/backend/test/data/node/test_training_dropout/model.onnx index 2666f9f7eeb..2fb3db0fa68 100644 Binary files a/onnx/backend/test/data/node/test_training_dropout/model.onnx and b/onnx/backend/test/data/node/test_training_dropout/model.onnx differ diff --git 
a/onnx/backend/test/data/node/test_training_dropout_default/model.onnx b/onnx/backend/test/data/node/test_training_dropout_default/model.onnx index 3dd1fade7c5..2a02f262702 100644 Binary files a/onnx/backend/test/data/node/test_training_dropout_default/model.onnx and b/onnx/backend/test/data/node/test_training_dropout_default/model.onnx differ diff --git a/onnx/backend/test/data/node/test_training_dropout_default_mask/model.onnx b/onnx/backend/test/data/node/test_training_dropout_default_mask/model.onnx index f522d20666f..73f140c1494 100644 Binary files a/onnx/backend/test/data/node/test_training_dropout_default_mask/model.onnx and b/onnx/backend/test/data/node/test_training_dropout_default_mask/model.onnx differ diff --git a/onnx/backend/test/data/node/test_training_dropout_mask/model.onnx b/onnx/backend/test/data/node/test_training_dropout_mask/model.onnx index a6ea42104b3..7a2cf92a496 100644 Binary files a/onnx/backend/test/data/node/test_training_dropout_mask/model.onnx and b/onnx/backend/test/data/node/test_training_dropout_mask/model.onnx differ diff --git a/onnx/backend/test/data/node/test_training_dropout_zero_ratio/model.onnx b/onnx/backend/test/data/node/test_training_dropout_zero_ratio/model.onnx index dca24281827..2551754e947 100644 Binary files a/onnx/backend/test/data/node/test_training_dropout_zero_ratio/model.onnx and b/onnx/backend/test/data/node/test_training_dropout_zero_ratio/model.onnx differ diff --git a/onnx/backend/test/data/node/test_training_dropout_zero_ratio_mask/model.onnx b/onnx/backend/test/data/node/test_training_dropout_zero_ratio_mask/model.onnx index 570c0a6e991..58821f231ea 100644 Binary files a/onnx/backend/test/data/node/test_training_dropout_zero_ratio_mask/model.onnx and b/onnx/backend/test/data/node/test_training_dropout_zero_ratio_mask/model.onnx differ diff --git a/onnx/defs/generator/defs.cc b/onnx/defs/generator/defs.cc index 7269f7717eb..69babbb0bf1 100644 --- a/onnx/defs/generator/defs.cc +++ 
b/onnx/defs/generator/defs.cc @@ -126,7 +126,7 @@ ONNX_OPERATOR_SET_SCHEMA( } })); -static const char* EyeLike_ver9_doc = R"DOC( +static const char* EyeLike_ver22_doc = R"DOC( Generate a 2D tensor (matrix) with ones on the diagonal and zeros everywhere else. Only 2D tensors are supported, i.e. input T1 must be of rank 2. The shape of the output tensor is the same as the input tensor. The data type can be specified by the 'dtype' argument. If @@ -138,9 +138,9 @@ TensorProto message and be valid as an output type. ONNX_OPERATOR_SET_SCHEMA( EyeLike, - 9, + 22, OpSchema() - .SetDoc(EyeLike_ver9_doc) + .SetDoc(EyeLike_ver22_doc) .Attr( "k", "(Optional) Index of the diagonal to be populated with ones. Default is 0." @@ -159,33 +159,11 @@ ONNX_OPERATOR_SET_SCHEMA( .Output(0, "output", "Output tensor, same shape as input tensor T1.", "T2") .TypeConstraint( "T1", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)"}, + OpSchema::all_non_complex_numeric_types_plus_bool_ir4(), "Constrain input types. Strings and complex are not supported.") .TypeConstraint( "T2", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)"}, + OpSchema::all_non_complex_numeric_types_plus_bool_ir4(), "Constrain output types. Strings and complex are not supported.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { if (ctx.getAttribute("dtype") != nullptr) { @@ -202,7 +180,7 @@ ONNX_OPERATOR_SET_SCHEMA( propagateShapeFromInputToOutput(ctx, 0, 0); })); -static const char* RandomUniform_ver1_doc = R"DOC( +static const char* RandomUniform_ver22_doc = R"DOC( Generate a tensor with random values drawn from a uniform distribution. 
The shape of the tensor is specified by the `shape` argument and the range by `low` and `high`. @@ -213,9 +191,9 @@ TensorProto message. ONNX_OPERATOR_SET_SCHEMA( RandomUniform, - 1, + 22, OpSchema() - .SetDoc(RandomUniform_ver1_doc) + .SetDoc(RandomUniform_ver22_doc) .Attr("low", "Lower boundary of the output values.", AttributeProto::FLOAT, 0.0f) .Attr("high", "Upper boundary of the output values.", AttributeProto::FLOAT, 1.0f) .Attr( @@ -230,16 +208,13 @@ ONNX_OPERATOR_SET_SCHEMA( static_cast(TensorProto::FLOAT)) .Attr("shape", "The shape of the output tensor.", AttributeProto::INTS) .Output(0, "output", "Output tensor of random values drawn from uniform distribution", "T") - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain output types to float tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0, TensorProto::FLOAT); propagateShapeFromAttributeToOutput(ctx, "shape", 0); })); -static const char* RandomNormal_ver1_doc = R"DOC( +static const char* RandomNormal_ver22_doc = R"DOC( Generate a tensor with random values drawn from a normal distribution. The shape of the tensor is specified by the `shape` argument and the parameter of the normal distribution specified by `mean` and `scale`. @@ -251,9 +226,9 @@ TensorProto message. 
ONNX_OPERATOR_SET_SCHEMA( RandomNormal, - 1, + 22, OpSchema() - .SetDoc(RandomNormal_ver1_doc) + .SetDoc(RandomNormal_ver22_doc) .Attr("mean", "The mean of the normal distribution.", AttributeProto::FLOAT, 0.0f) .Attr("scale", "The standard deviation of the normal distribution.", AttributeProto::FLOAT, 1.0f) .Attr( @@ -268,16 +243,13 @@ ONNX_OPERATOR_SET_SCHEMA( static_cast(TensorProto::FLOAT)) .Attr("shape", "The shape of the output tensor.", AttributeProto::INTS) .Output(0, "output", "Output tensor of random values drawn from normal distribution", "T") - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain output types to float tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0, TensorProto::FLOAT); propagateShapeFromAttributeToOutput(ctx, "shape", 0); })); -static const char* RandomUniformLike_ver1_doc = R"DOC( +static const char* RandomUniformLike_ver22_doc = R"DOC( Generate a tensor with random values drawn from a uniform distribution. The shape of the output tensor is copied from the shape of the input tensor, and the parameters of the uniform distribution are specified by `low` and `high`. @@ -289,9 +261,9 @@ TensorProto message and be valid as an output type. ONNX_OPERATOR_SET_SCHEMA( RandomUniformLike, - 1, + 22, OpSchema() - .SetDoc(RandomUniformLike_ver1_doc) + .SetDoc(RandomUniformLike_ver22_doc) .Attr("low", "Lower boundary of the output values.", AttributeProto::FLOAT, 0.0f) .Attr("high", "Upper boundary of the output values.", AttributeProto::FLOAT, 1.0f) .Attr( @@ -309,12 +281,9 @@ ONNX_OPERATOR_SET_SCHEMA( .Output(0, "output", "Output tensor of random values drawn from uniform distribution", "T2") .TypeConstraint( "T1", - OpSchema::all_tensor_types(), + OpSchema::all_tensor_types_ir4(), "Constrain to any tensor type. 
If the dtype attribute is not provided this must be a valid output type.") - .TypeConstraint( - "T2", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain output types to float tensors.") + .TypeConstraint("T2", OpSchema::all_float_types_ir4(), "Constrain output types to float tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { if (ctx.getAttribute("dtype") != nullptr) propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0); @@ -326,7 +295,7 @@ ONNX_OPERATOR_SET_SCHEMA( propagateShapeFromInputToOutput(ctx, 0, 0); })); -static const char* RandomNormalLike_ver1_doc = R"DOC( +static const char* RandomNormalLike_ver22_doc = R"DOC( Generate a tensor with random values drawn from a normal distribution. The shape of the output tensor is copied from the shape of the input tensor, and the parameters of the normal distribution are specified by `mean` and `scale`. @@ -338,9 +307,9 @@ TensorProto message, and be valid as an output type. ONNX_OPERATOR_SET_SCHEMA( RandomNormalLike, - 1, + 22, OpSchema() - .SetDoc(RandomNormalLike_ver1_doc) + .SetDoc(RandomNormalLike_ver22_doc) .Attr("mean", "The mean of the normal distribution.", AttributeProto::FLOAT, 0.0f) .Attr("scale", "The standard deviation of the normal distribution.", AttributeProto::FLOAT, 1.0f) .Attr( @@ -358,12 +327,9 @@ ONNX_OPERATOR_SET_SCHEMA( .Output(0, "output", "Output tensor of random values drawn from normal distribution", "T2") .TypeConstraint( "T1", - OpSchema::all_tensor_types(), + OpSchema::all_tensor_types_ir4(), "Constrain to any tensor type. 
If the dtype attribute is not provided this must be a valid output type.") - .TypeConstraint( - "T2", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain output types to float tensors.") + .TypeConstraint("T2", OpSchema::all_float_types_ir4(), "Constrain output types to float tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { if (ctx.getAttribute("dtype") != nullptr) propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0); @@ -375,16 +341,16 @@ ONNX_OPERATOR_SET_SCHEMA( propagateShapeFromInputToOutput(ctx, 0, 0); })); -static const char* Multinomial_ver7_doc = R"DOC( +static const char* Multinomial_ver22_doc = R"DOC( Generate a tensor of samples from a multinomial distribution according to the probabilities of each of the possible outcomes. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Multinomial, - 7, + 22, OpSchema() - .SetDoc(Multinomial_ver7_doc) + .SetDoc(Multinomial_ver22_doc) .Attr("sample_size", "Number of times to sample.", AttributeProto::INT, static_cast(1)) .Attr( "seed", @@ -406,10 +372,7 @@ ONNX_OPERATOR_SET_SCHEMA( "output", "Output tensor with shape [batch_size, sample_size], where sample_size is the number of times to sample. Each value along the axis zero represents the outcome of the corresponding sample in a batch.", "T2") - .TypeConstraint( - "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input types to float tensors.") + .TypeConstraint("T1", OpSchema::all_float_types_ir4(), "Constrain input types to float tensors.") .TypeConstraint("T2", {"tensor(int32)", "tensor(int64)"}, "Constrain output types to integral tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { auto dtype = ctx.getAttribute("dtype"); @@ -562,7 +525,7 @@ ONNX_OPERATOR_SET_SCHEMA( } })); -static const char* Bernoulli_ver15_doc = R"DOC( +static const char* Bernoulli_ver22_doc = R"DOC( Draws binary random numbers (0 or 1) from a Bernoulli distribution. 
The input tensor should be a tensor containing probabilities p (a value in the range [0,1]) to be used for drawing the binary random number, where an output of 1 is produced with probability p and an output of 0 is produced with probability (1-p). @@ -573,9 +536,9 @@ implementations (even if a seed is specified). ONNX_OPERATOR_SET_SCHEMA( Bernoulli, - 15, + 22, OpSchema() - .SetDoc(Bernoulli_ver15_doc) + .SetDoc(Bernoulli_ver22_doc) .Attr( "seed", "(Optional) Seed to the random generator, if not specified we will auto generate one.", @@ -589,25 +552,10 @@ ONNX_OPERATOR_SET_SCHEMA( OPTIONAL_VALUE) .Input(0, "input", "All values in input have to be in the range:[0, 1].", "T1") .Output(0, "output", "The returned output tensor only has values 0 or 1, same shape as input tensor.", "T2") - .TypeConstraint( - "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input types to float tensors.") + .TypeConstraint("T1", OpSchema::all_float_types_ir4(), "Constrain input types to float tensors.") .TypeConstraint( "T2", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(bfloat16)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(bool)"}, + OpSchema::all_non_complex_numeric_types_plus_bool_ir4(), "Constrain output types to all numeric tensors and bool tensors.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { if (ctx.getAttribute("dtype") != nullptr) diff --git a/onnx/defs/generator/old.cc b/onnx/defs/generator/old.cc index a0cf5b0a22c..88b7c6fc388 100644 --- a/onnx/defs/generator/old.cc +++ b/onnx/defs/generator/old.cc @@ -5,11 +5,400 @@ #include #include +#include "onnx/defs/function.h" #include "onnx/defs/generator/utils.h" #include "onnx/defs/schema.h" namespace ONNX_NAMESPACE { +static const char* Bernoulli_ver15_doc = R"DOC( +Draws binary random numbers (0 or 1) from a Bernoulli 
distribution. The input tensor should be a tensor +containing probabilities p (a value in the range [0,1]) to be used for drawing the binary random number, +where an output of 1 is produced with probability p and an output of 0 is produced with probability (1-p). + +This operator is non-deterministic and may not produce the same values in different +implementations (even if a seed is specified). +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Bernoulli, + 15, + OpSchema() + .SetDoc(Bernoulli_ver15_doc) + .Attr( + "seed", + "(Optional) Seed to the random generator, if not specified we will auto generate one.", + AttributeProto::FLOAT, + OPTIONAL_VALUE) + .Attr( + "dtype", + "The data type for the elements of the output tensor. if not specified, we will use " + "the data type of the input tensor.", + AttributeProto::INT, + OPTIONAL_VALUE) + .Input(0, "input", "All values in input have to be in the range:[0, 1].", "T1") + .Output(0, "output", "The returned output tensor only has values 0 or 1, same shape as input tensor.", "T2") + .TypeConstraint( + "T1", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input types to float tensors.") + .TypeConstraint( + "T2", + {"tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(bfloat16)", + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(bool)"}, + "Constrain output types to all numeric tensors and bool tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + if (ctx.getAttribute("dtype") != nullptr) + propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0); + else + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } + propagateShapeFromInputToOutput(ctx, 0, 0); + }) + .SetContextDependentFunctionBodyBuilder( + [](const FunctionBodyBuildContext& ctx, const OpSchema& schema, FunctionProto& functionProto) -> bool { 
+ if (ctx.getInputType(0) == nullptr) { + // we cannot create a correct function body without knowing the input type + return false; + } + auto input_type = ctx.getInputType(0)->tensor_type().elem_type(); + auto dtype = ctx.getAttribute("dtype") != nullptr + ? static_cast(ctx.getAttribute("dtype")->i()) + : input_type; + FunctionBuilder builder(functionProto); + builder + .Add( + "X_random = RandomUniformLike (input)", + "dtype", + int64_t(input_type)) + .Add("X_greater = Greater (X_random, input)") + .Add("output = Cast (X_greater)", "to", int64_t(dtype)); + schema.BuildFunction(functionProto); + return true; + })); + +static const char* Multinomial_ver7_doc = R"DOC( +Generate a tensor of samples from a multinomial distribution according to the probabilities +of each of the possible outcomes. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Multinomial, + 7, + OpSchema() + .SetDoc(Multinomial_ver7_doc) + .Attr("sample_size", "Number of times to sample.", AttributeProto::INT, static_cast(1)) + .Attr( + "seed", + "(Optional) Seed to the random generator, if not specified we will auto generate one.", + AttributeProto::FLOAT, + OPTIONAL_VALUE) + .Attr( + "dtype", + "(Optional) The data type for the elements of the output tensor, if not specified, we will use int32.", + AttributeProto::INT, + static_cast(TensorProto::INT32)) + .Input( + 0, + "input", + "Input tensor with shape [batch_size, class_size], where class_size is the number of all possible outcomes. Each value along the axis zero represents the unnormalized log-probability of each corresponding outcome in a batch.", + "T1") + .Output( + 0, + "output", + "Output tensor with shape [batch_size, sample_size], where sample_size is the number of times to sample. 
Each value along the axis zero represents the outcome of the corresponding sample in a batch.", + "T2") + .TypeConstraint( + "T1", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input types to float tensors.") + .TypeConstraint("T2", {"tensor(int32)", "tensor(int64)"}, "Constrain output types to integral tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + auto dtype = ctx.getAttribute("dtype"); + auto dataType = TensorProto_DataType::TensorProto_DataType_INT32; + if (dtype != nullptr) { + dataType = static_cast(dtype->i()); + if (dataType != TensorProto_DataType::TensorProto_DataType_INT32 && + dataType != TensorProto_DataType::TensorProto_DataType_INT64) { + fail_type_inference("Output type must be int32 or int64"); + } + } + updateOutputElemType(ctx, 0, dataType); + + TensorShapeProto::Dimension batch_size, sample_size; + if (hasInputShape(ctx, 0)) { + auto& input_shape = getInputShape(ctx, 0); + if (input_shape.dim_size() != 2) { + fail_shape_inference("Input tensor must have rank 2"); + } + batch_size = input_shape.dim(0); + } // else statically-unknown batch-size + sample_size.set_dim_value(getAttribute(ctx, "sample_size", 1)); + updateOutputShape(ctx, 0, {batch_size, sample_size}); + })); + +static const char* RandomNormalLike_ver1_doc = R"DOC( +Generate a tensor with random values drawn from a normal distribution. +The shape of the output tensor is copied from the shape of the input tensor, +and the parameters of the normal distribution are specified by `mean` and `scale`. + +The data type is specified by the 'dtype' argument, or copied from the input tensor if not provided. +The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the +TensorProto message, and be valid as an output type. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + RandomNormalLike, + 1, + OpSchema() + .SetDoc(RandomNormalLike_ver1_doc) + .Attr("mean", "The mean of the normal distribution.", AttributeProto::FLOAT, 0.0f) + .Attr("scale", "The standard deviation of the normal distribution.", AttributeProto::FLOAT, 1.0f) + .Attr( + "seed", + "(Optional) Seed to the random generator, if not specified we will auto generate one.", + AttributeProto::FLOAT, + OPTIONAL_VALUE) + .Attr( + "dtype", + "(Optional) The data type for the elements of the output tensor, if not specified, we will use " + "the data type of the input tensor.", + AttributeProto::INT, + OPTIONAL_VALUE) + .Input(0, "input", "Input tensor to copy shape and optionally type information from.", "T1") + .Output(0, "output", "Output tensor of random values drawn from normal distribution", "T2") + .TypeConstraint( + "T1", + OpSchema::all_tensor_types(), + "Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.") + .TypeConstraint( + "T2", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain output types to float tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + if (ctx.getAttribute("dtype") != nullptr) + propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0); + else + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } + propagateShapeFromInputToOutput(ctx, 0, 0); + })); + +static const char* RandomUniformLike_ver1_doc = R"DOC( +Generate a tensor with random values drawn from a uniform distribution. +The shape of the output tensor is copied from the shape of the input tensor, +and the parameters of the uniform distribution are specified by `low` and `high`. + +The data type is specified by the 'dtype' argument, or copied from the input tensor if not provided. 
+The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the +TensorProto message and be valid as an output type. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + RandomUniformLike, + 1, + OpSchema() + .SetDoc(RandomUniformLike_ver1_doc) + .Attr("low", "Lower boundary of the output values.", AttributeProto::FLOAT, 0.0f) + .Attr("high", "Upper boundary of the output values.", AttributeProto::FLOAT, 1.0f) + .Attr( + "seed", + "(Optional) Seed to the random generator, if not specified we will auto generate one.", + AttributeProto::FLOAT, + OPTIONAL_VALUE) + .Attr( + "dtype", + "(Optional) The data type for the elements of the output tensor, if not specified, we will use " + "the data type of the input tensor.", + AttributeProto::INT, + OPTIONAL_VALUE) + .Input(0, "input", "Input tensor to copy shape and optionally type information from.", "T1") + .Output(0, "output", "Output tensor of random values drawn from uniform distribution", "T2") + .TypeConstraint( + "T1", + OpSchema::all_tensor_types(), + "Constrain to any tensor type. If the dtype attribute is not provided this must be a valid output type.") + .TypeConstraint( + "T2", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain output types to float tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + if (ctx.getAttribute("dtype") != nullptr) + propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0); + else + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } + propagateShapeFromInputToOutput(ctx, 0, 0); + })); + +static const char* RandomNormal_ver1_doc = R"DOC( +Generate a tensor with random values drawn from a normal distribution. The shape +of the tensor is specified by the `shape` argument and the parameter of the normal distribution +specified by `mean` and `scale`. + +The data type is specified by the 'dtype' argument. 
The 'dtype' argument must +be one of the data types specified in the 'DataType' enum field in the +TensorProto message. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + RandomNormal, + 1, + OpSchema() + .SetDoc(RandomNormal_ver1_doc) + .Attr("mean", "The mean of the normal distribution.", AttributeProto::FLOAT, 0.0f) + .Attr("scale", "The standard deviation of the normal distribution.", AttributeProto::FLOAT, 1.0f) + .Attr( + "seed", + "(Optional) Seed to the random generator, if not specified we will auto generate one.", + AttributeProto::FLOAT, + OPTIONAL_VALUE) + .Attr( + "dtype", + "The data type for the elements of the output tensor. Default is TensorProto::FLOAT.", + AttributeProto::INT, + static_cast(TensorProto::FLOAT)) + .Attr("shape", "The shape of the output tensor.", AttributeProto::INTS) + .Output(0, "output", "Output tensor of random values drawn from normal distribution", "T") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain output types to float tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0, TensorProto::FLOAT); + propagateShapeFromAttributeToOutput(ctx, "shape", 0); + })); + +static const char* RandomUniform_ver1_doc = R"DOC( +Generate a tensor with random values drawn from a uniform distribution. The shape +of the tensor is specified by the `shape` argument and the range by `low` and `high`. + +The data type is specified by the 'dtype' argument. The 'dtype' argument must +be one of the data types specified in the 'DataType' enum field in the +TensorProto message. 
+)DOC";
+
+ONNX_OPERATOR_SET_SCHEMA(
+    RandomUniform,
+    1,
+    OpSchema()
+        .SetDoc(RandomUniform_ver1_doc)
+        .Attr("low", "Lower boundary of the output values.", AttributeProto::FLOAT, 0.0f)
+        .Attr("high", "Upper boundary of the output values.", AttributeProto::FLOAT, 1.0f)
+        .Attr(
+            "seed",
+            "(Optional) Seed to the random generator, if not specified we will auto generate one.",
+            AttributeProto::FLOAT,
+            OPTIONAL_VALUE)
+        .Attr(
+            "dtype",
+            "The data type for the elements of the output tensor. If not specified, default is TensorProto::FLOAT.",
+            AttributeProto::INT,
+            static_cast<int64_t>(TensorProto::FLOAT))
+        .Attr("shape", "The shape of the output tensor.", AttributeProto::INTS)
+        .Output(0, "output", "Output tensor of random values drawn from uniform distribution", "T")
+        .TypeConstraint(
+            "T",
+            {"tensor(float16)", "tensor(float)", "tensor(double)"},
+            "Constrain output types to float tensors.")
+        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
+          propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0, TensorProto::FLOAT);
+          propagateShapeFromAttributeToOutput(ctx, "shape", 0);
+        }));
+
+static const char* EyeLike_ver9_doc = R"DOC(
+Generate a 2D tensor (matrix) with ones on the diagonal and zeros everywhere else. Only 2D
+tensors are supported, i.e. input T1 must be of rank 2. The shape of the output tensor is the
+same as the input tensor. The data type can be specified by the 'dtype' argument. If
+'dtype' is not specified, then the type of input tensor is used. By default, the main diagonal
+is populated with ones, but attribute 'k' can be used to populate upper or lower diagonals.
+The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the
+TensorProto message and be valid as an output type.
+)DOC";
+
+ONNX_OPERATOR_SET_SCHEMA(
+    EyeLike,
+    9,
+    OpSchema()
+        .SetDoc(EyeLike_ver9_doc)
+        .Attr(
+            "k",
+            "(Optional) Index of the diagonal to be populated with ones. Default is 0."
+            " If T2 is the output, this op sets T2[i, i+k] = 1. k = 0 populates the main diagonal, "
+            "k > 0 populates an upper diagonal, and k < 0 populates a lower diagonal.",
+            AttributeProto::INT,
+            static_cast<int64_t>(0))
+        .Attr(
+            "dtype",
+            "(Optional) The data type for the elements of the output tensor. If not specified,"
+            "the data type of the input tensor T1 is used. If input tensor T1 is also not"
+            "specified, then type defaults to 'float'.",
+            AttributeProto::INT,
+            OPTIONAL_VALUE)
+        .Input(0, "input", "2D input tensor to copy shape, and optionally, type information from.", "T1")
+        .Output(0, "output", "Output tensor, same shape as input tensor T1.", "T2")
+        .TypeConstraint(
+            "T1",
+            {"tensor(float16)",
+             "tensor(float)",
+             "tensor(double)",
+             "tensor(int8)",
+             "tensor(int16)",
+             "tensor(int32)",
+             "tensor(int64)",
+             "tensor(uint8)",
+             "tensor(uint16)",
+             "tensor(uint32)",
+             "tensor(uint64)",
+             "tensor(bool)"},
+            "Constrain input types. Strings and complex are not supported.")
+        .TypeConstraint(
+            "T2",
+            {"tensor(float16)",
+             "tensor(float)",
+             "tensor(double)",
+             "tensor(int8)",
+             "tensor(int16)",
+             "tensor(int32)",
+             "tensor(int64)",
+             "tensor(uint8)",
+             "tensor(uint16)",
+             "tensor(uint32)",
+             "tensor(uint64)",
+             "tensor(bool)"},
+            "Constrain output types. Strings and complex are not supported.")
+        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
+          if (ctx.getAttribute("dtype") != nullptr) {
+            propagateElemTypeFromAttributeToOutput(ctx, "dtype", 0);
+          } else {
+            propagateElemTypeFromInputToOutput(ctx, 0, 0);
+          }
+          if (hasInputShape(ctx, 0)) {
+            auto& input_shape = getInputShape(ctx, 0);
+            if (input_shape.dim_size() != 2) {
+              fail_shape_inference("Input tensor must be 2-dimensional");
+            }
+          }
+          propagateShapeFromInputToOutput(ctx, 0, 0);
+        }));
+
 static const char* Constant_ver19_doc = R"DOC(
 This operator produces a constant tensor. Exactly one of the provided attributes, either value, sparse_value,
 or value_* must be specified.
diff --git a/onnx/defs/math/defs.cc b/onnx/defs/math/defs.cc index b7dfe3c86ff..c315a2a7b7f 100644 --- a/onnx/defs/math/defs.cc +++ b/onnx/defs/math/defs.cc @@ -14,17 +14,6 @@ namespace ONNX_NAMESPACE { -inline int MathOpTwoIntegers(std::string op_type, int a, int b) { - if (op_type == "Add") { - return a + b; - } else if (op_type == "Sub") { - return a - b; - } else if (op_type == "Mul") { - return a * b; - } - fail_shape_inference("Wrong op_type name for running propagation: ", op_type); -} - inline void MathOpDataPropagator(DataPropagationContext& ctx, std::string op_type) { const auto input_0 = ctx.getInputData(0); const auto input_1 = ctx.getInputData(1); @@ -43,7 +32,7 @@ inline void MathOpDataPropagator(DataPropagationContext& ctx, std::string op_typ auto& input_dim_1 = input_1->dim(size_1 == 1 ? 0 : i); if (input_dim_0.has_dim_value() && input_dim_1.has_dim_value()) { tsp.mutable_dim()->Add()->set_dim_value( - MathOpTwoIntegers(op_type, input_dim_0.dim_value(), input_dim_1.dim_value())); + defs::math::utils::MathOpTwoIntegers(op_type, input_dim_0.dim_value(), input_dim_1.dim_value())); } else { // Cannot compute the value; simply add an empty dim without value and param tsp.mutable_dim()->Add(); @@ -341,7 +330,7 @@ ONNX_OPERATOR_SET_SCHEMA( } )ONNX")); -static const char* ThresholdedRelu_ver10_doc = R"DOC( +static const char* ThresholdedRelu_ver22_doc = R"DOC( ThresholdedRelu takes one input data (Tensor) and produces one output data (Tensor) where the rectified linear function, y = x for x > alpha, y = 0 otherwise, is applied to the tensor elementwise. @@ -349,16 +338,13 @@ is applied to the tensor elementwise. 
ONNX_OPERATOR_SET_SCHEMA( ThresholdedRelu, - 10, + 22, OpSchema() - .SetDoc(ThresholdedRelu_ver10_doc) + .SetDoc(ThresholdedRelu_ver22_doc) .Attr("alpha", "Threshold value", AttributeProto::FLOAT, 1.0f) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody( R"ONNX( @@ -373,7 +359,7 @@ ONNX_OPERATOR_SET_SCHEMA( )ONNX", 18)); -static const char* Selu_ver6_doc = R"DOC( +static const char* Selu_ver22_doc = R"DOC( Selu takes one input data (Tensor) and produces one output data (Tensor) where the scaled exponential linear unit function, `y = gamma * (alpha * e^x - alpha) for x <= 0`, `y = gamma * x for x > 0`, @@ -382,7 +368,7 @@ is applied to the tensor elementwise. 
ONNX_OPERATOR_SET_SCHEMA( Selu, - 6, + 22, OpSchema() .Attr( "alpha", @@ -396,13 +382,10 @@ ONNX_OPERATOR_SET_SCHEMA( "(i.e., float32 approximation of 1.0507009873554804934193349852946).", AttributeProto::FLOAT, 1.05070102214813232421875f) - .SetDoc(Selu_ver6_doc) + .SetDoc(Selu_ver22_doc) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody( R"ONNX( @@ -424,7 +407,7 @@ ONNX_OPERATOR_SET_SCHEMA( )ONNX", 18)); -static const char* Elu_ver6_doc = R"DOC( +static const char* Elu_ver22_doc = R"DOC( Elu takes one input data (Tensor) and produces one output data (Tensor) where the function `f(x) = alpha * (exp(x) - 1.) for x < 0`, `f(x) = x for x >= 0`., is applied to the tensor elementwise. 
@@ -433,16 +416,13 @@ Elu takes one input data (Tensor) and produces one output data ONNX_OPERATOR_SET_SCHEMA( Elu, - 6, + 22, OpSchema() .Attr("alpha", "Coefficient of ELU.", AttributeProto::FLOAT, 1.0f) - .SetDoc(Elu_ver6_doc) + .SetDoc(Elu_ver22_doc) .Input(0, "X", "1D input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "1D output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody( R"ONNX( @@ -462,7 +442,7 @@ ONNX_OPERATOR_SET_SCHEMA( )ONNX", 18)); -static const char* mish_ver18_doc = R"DOC( +static const char* mish_ver22_doc = R"DOC( Mish: A Self Regularized Non-Monotonic Neural Activation Function. Perform the linear unit element-wise on the input tensor X using formula: @@ -474,15 +454,12 @@ mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x})) ONNX_OPERATOR_SET_SCHEMA( Mish, - 18, + 22, OpSchema() - .SetDoc(mish_ver18_doc) + .SetDoc(mish_ver22_doc) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input X and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input X and output types to float tensors.") .FunctionBody(R"ONNX( { Softplus_X = Softplus (X) @@ -664,10 +641,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, - "Constrain input and output types to float tensors.") + 
.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); static const char* Log_ver13_doc = R"DOC( @@ -716,10 +690,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); static const char* Pow_ver15_doc = R"DOC( @@ -842,7 +813,7 @@ ONNX_OPERATOR_SET_SCHEMA( "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* HardSigmoid_ver6_doc = R"DOC( +static const char* HardSigmoid_ver22_doc = R"DOC( HardSigmoid takes one input data (Tensor) and produces one output data (Tensor) where the HardSigmoid function, y = max(0, min(1, alpha * x + beta)), is applied to the tensor elementwise. @@ -850,17 +821,14 @@ is applied to the tensor elementwise. 
ONNX_OPERATOR_SET_SCHEMA( HardSigmoid, - 6, + 22, OpSchema() .Attr("alpha", "Value of alpha.", AttributeProto::FLOAT, 0.2f) .Attr("beta", "Value of beta.", AttributeProto::FLOAT, 0.5f) - .SetDoc(HardSigmoid_ver6_doc) + .SetDoc(HardSigmoid_ver22_doc) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody( R"ONNX( @@ -881,7 +849,7 @@ ONNX_OPERATOR_SET_SCHEMA( )ONNX", 18)); -static const char* HardSwish_ver14_doc = R"DOC( +static const char* HardSwish_ver22_doc = R"DOC( HardSwish takes one input data (Tensor) and produces one output data (Tensor) where the HardSwish function, y = x * max(0, min(1, alpha * x + beta)) = x * HardSigmoid(x), where alpha = 1/6 and beta = 0.5, is applied to the tensor elementwise. @@ -889,15 +857,12 @@ where alpha = 1/6 and beta = 0.5, is applied to the tensor elementwise. 
ONNX_OPERATOR_SET_SCHEMA( HardSwish, - 14, + 22, OpSchema() - .SetDoc(HardSwish_ver14_doc) + .SetDoc(HardSwish_ver22_doc) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody(R"ONNX( { @@ -1232,15 +1197,15 @@ ONNX_OPERATOR_SET_SCHEMA( "hardmax", "Hardmax(element in input, axis) = 1 if the element is the first maximum value along the specified axis, 0 otherwise"))); -static const char* Softsign_ver1_doc = R"DOC( +static const char* Softsign_ver22_doc = R"DOC( Calculates the softsign (x/(1+|x|)) of the given input tensor element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Softsign, - 1, + 22, OpSchema() - .SetDoc(Softsign_ver1_doc) + .SetDoc(Softsign_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1251,10 +1216,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody( R"ONNX( @@ -1268,7 +1230,7 @@ ONNX_OPERATOR_SET_SCHEMA( )ONNX", 18)); -static const char* Softplus_ver1_doc = R"DOC( +static const char* Softplus_ver22_doc = R"DOC( Softplus takes one input data (Tensor) and produces one output data (Tensor) where the softplus function, y = ln(exp(x) + 1), is applied to the tensor elementwise. 
@@ -1276,15 +1238,12 @@ the tensor elementwise. ONNX_OPERATOR_SET_SCHEMA( Softplus, - 1, + 22, OpSchema() - .SetDoc(Softplus_ver1_doc) + .SetDoc(Softplus_ver22_doc) .Input(0, "X", "1D input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "1D input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) .FunctionBody( R"ONNX( @@ -1549,15 +1508,15 @@ ONNX_OPERATOR_SET_SCHEMA( return; })); -static const char* Sin_ver7_doc = R"DOC( +static const char* Sin_ver22_doc = R"DOC( Calculates the sine of the given input tensor, element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Sin, - 7, + 22, OpSchema() - .SetDoc(Sin_ver7_doc) + .SetDoc(Sin_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1569,21 +1528,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Cos_ver7_doc = R"DOC( +static const char* Cos_ver22_doc = R"DOC( Calculates the cosine of the given input tensor, element-wise. 
)DOC"; ONNX_OPERATOR_SET_SCHEMA( Cos, - 7, + 22, OpSchema() - .SetDoc(Cos_ver7_doc) + .SetDoc(Cos_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1595,21 +1551,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Tan_ver7_doc = R"DOC( +static const char* Tan_ver22_doc = R"DOC( Calculates the tangent of the given input tensor, element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Tan, - 7, + 22, OpSchema() - .SetDoc(Tan_ver7_doc) + .SetDoc(Tan_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1621,21 +1574,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Asin_ver7_doc = R"DOC( +static const char* Asin_ver22_doc = R"DOC( Calculates the arcsine (inverse of sine) of the given input tensor, element-wise. 
)DOC"; ONNX_OPERATOR_SET_SCHEMA( Asin, - 7, + 22, OpSchema() - .SetDoc(Asin_ver7_doc) + .SetDoc(Asin_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1647,21 +1597,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Acos_ver7_doc = R"DOC( +static const char* Acos_ver22_doc = R"DOC( Calculates the arccosine (inverse of cosine) of the given input tensor, element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Acos, - 7, + 22, OpSchema() - .SetDoc(Acos_ver7_doc) + .SetDoc(Acos_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1673,21 +1620,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Atan_ver7_doc = R"DOC( +static const char* Atan_ver22_doc = R"DOC( Calculates the arctangent (inverse of tangent) of the given input tensor, element-wise. 
)DOC"; ONNX_OPERATOR_SET_SCHEMA( Atan, - 7, + 22, OpSchema() - .SetDoc(Atan_ver7_doc) + .SetDoc(Atan_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1699,10 +1643,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); static const char* Expand_ver13_doc = R"DOC( @@ -1749,15 +1690,15 @@ ONNX_OPERATOR_SET_SCHEMA( } })); -static const char* Sinh_ver9_doc = R"DOC( +static const char* Sinh_ver22_doc = R"DOC( Calculates the hyperbolic sine of the given input tensor element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Sinh, - 9, + 22, OpSchema() - .SetDoc(Sinh_ver9_doc) + .SetDoc(Sinh_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1769,21 +1710,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Cosh_ver9_doc = R"DOC( +static const char* Cosh_ver22_doc = R"DOC( Calculates the hyperbolic cosine of the given input tensor element-wise. 
)DOC"; ONNX_OPERATOR_SET_SCHEMA( Cosh, - 9, + 22, OpSchema() - .SetDoc(Cosh_ver9_doc) + .SetDoc(Cosh_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1795,21 +1733,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Asinh_ver9_doc = R"DOC( +static const char* Asinh_ver22_doc = R"DOC( Calculates the hyperbolic arcsine of the given input tensor element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Asinh, - 9, + 22, OpSchema() - .SetDoc(Asinh_ver9_doc) + .SetDoc(Asinh_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1821,21 +1756,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Acosh_ver9_doc = R"DOC( +static const char* Acosh_ver22_doc = R"DOC( Calculates the hyperbolic arccosine of the given input tensor element-wise. 
)DOC"; ONNX_OPERATOR_SET_SCHEMA( Acosh, - 9, + 22, OpSchema() - .SetDoc(Acosh_ver9_doc) + .SetDoc(Acosh_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1847,21 +1779,18 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Atanh_ver9_doc = R"DOC( +static const char* Atanh_ver22_doc = R"DOC( Calculates the hyperbolic arctangent of the given input tensor element-wise. )DOC"; ONNX_OPERATOR_SET_SCHEMA( Atanh, - 9, + 22, OpSchema() - .SetDoc(Atanh_ver9_doc) + .SetDoc(Atanh_ver22_doc) .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -1873,10 +1802,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); static const char* Sign_ver13_doc = R"DOC( @@ -2154,7 +2080,7 @@ ONNX_OPERATOR_SET_SCHEMA( .TypeConstraint("T2", {"tensor(int32)", "tensor(int64)"}, "axis tensor can be int32 or int64 only") .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); -static const char* Round_ver11_doc = R"DOC( +static const char* Round_ver22_doc = R"DOC( Round takes one input Tensor and rounds the values, element-wise, meaning it finds the nearest integer for each value. In case of halves, the rule is to round them to the nearest even integer. 
@@ -2173,18 +2099,15 @@ round([-4.5]) = [-4.0] ONNX_OPERATOR_SET_SCHEMA( Round, - 11, + 22, OpSchema() - .SetDoc(Round_ver11_doc) + .SetDoc(Round_ver22_doc) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); -static const char* Det_ver11_doc = R"DOC( +static const char* Det_ver22_doc = R"DOC( Det calculates determinant of a square matrix or batches of square matrices. Det takes one input tensor of shape `[*, M, M]`, where `*` is zero or more batch dimensions, and the inner-most 2 dimensions form square matrices. @@ -2194,14 +2117,14 @@ e.g., When the input is 2-D, the output is a scalar(shape is empty: `[]`). ONNX_OPERATOR_SET_SCHEMA( Det, - 11, + 22, OpSchema() - .SetDoc(Det_ver11_doc) + .SetDoc(Det_ver22_doc) .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .TypeConstraint( "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, + OpSchema::all_float_types_ir4(), "Constrain input and output types to floating-point tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { // Type inference @@ -2235,110 +2158,6 @@ ONNX_OPERATOR_SET_SCHEMA( } })); -static const char* NegativeLogLikelihoodLoss_ver13_doc = R"DOC( -A NegativeLogLikelihoodLoss operator computes (weighted) negative log likelihood loss. -Its "input" tensor has the shape of (N, C, d1, d2, ..., dk) where k >= 0. 
-The "input" tensor contains log-probabilities for input[n, :, d_1, d_2,..., d_k] being in a class of [0, C). -The operator's "target" input tensor has the shape of (N, d1, d2, ..., dk). It encodes class labels (one of C classes) -or it may contain a special value (indicated by an attribute ignore_index) for N x d1 x d2 x ... x dk samples. -The loss value for input[n, :, d_1, d_2,...d_k] being classified as class c = target[n][d_1][d_2]...[d_k] is computed as: - -``` -loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k]. -``` - -When an optional "weight" is provided, the sample loss is calculated as: - -``` -loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c]. -``` - -loss is zero for the case when target-value equals ignore_index. - -``` -loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index -``` - -If "reduction" attribute is set to "none", the operator's output will be the above loss with shape (N, d1, d2, ..., dk). -If "reduction" attribute is set to "mean" (the default attribute value), the output loss is (weight) averaged: - -``` -mean(loss), if "weight" is not provided, -``` - -or if weight is provided, - -``` -sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples. -``` - -If "reduction" attribute is set to "sum", the output is a scalar: `sum(loss)`. - -See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss. - -Example 1: - -``` -// negative log likelihood loss, "none" reduction -N, C, d1 = 2, 3, 2 -input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], - [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] -target = [[2, 1], [0, 2]] - -loss = np.zeros((N, d1)) -for n in range(N): - for d_1 in range(d1): - c = target[n][d_1] - loss[n][d_1] = -input[n][c][d_1] - -// print(loss) -// [[-3. -2.] -// [-0. 
-2.]] -``` - -Example 2: - -``` -// weighted negative log likelihood loss, sum reduction -N, C, d1 = 2, 3, 2 -input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], - [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] -target = [[2, 1], [0, 2]] -weight = [0.2, 0.3, 0.1] -loss = np.zeros((N, d1)) -for n in range(N): - for d_1 in range(d1): - c = target[n][d_1] - loss[n][d_1] = -input[n][c][d_1] * weight[c] - -loss = np.sum(loss) -// print(loss) -// -1.1 -``` - -Example 3: - -``` -// weighted negative log likelihood loss, mean reduction -N, C, d1 = 2, 3, 2 -input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], - [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] -target = [[2, 1], [0, 2]] -weight = [0.2, 0.3, 0.1] -loss = np.zeros((N, d1)) -weight_total = 0 -for n in range(N): - for d_1 in range(d1): - c = target[n][d_1] - loss[n][d_1] = -input[n][c][d_1] * weight[c] - weight_total = weight_total + weight[c] - -loss = np.sum(loss) / weight_total -// print(loss) -// -1.57 -``` -)DOC"; - bool BuildContextDependentFunctionBody( const FunctionBodyBuildContext& ctx, const OpSchema& schema, @@ -2451,11 +2270,115 @@ bool BuildContextDependentFunctionBody( return true; } +static const char* NegativeLogLikelihoodLoss_ver22_doc = R"DOC( +A NegativeLogLikelihoodLoss operator computes (weighted) negative log likelihood loss. +Its "input" tensor has the shape of (N, C, d1, d2, ..., dk) where k >= 0. +The "input" tensor contains log-probabilities for input[n, :, d_1, d_2,..., d_k] being in a class of [0, C). +The operator's "target" input tensor has the shape of (N, d1, d2, ..., dk). It encodes class labels (one of C classes) +or it may contain a special value (indicated by an attribute ignore_index) for N x d1 x d2 x ... x dk samples. +The loss value for input[n, :, d_1, d_2,...d_k] being classified as class c = target[n][d_1][d_2]...[d_k] is computed as: + +``` +loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k]. 
+``` + +When an optional "weight" is provided, the sample loss is calculated as: + +``` +loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c]. +``` + +loss is zero for the case when target-value equals ignore_index. + +``` +loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index +``` + +If "reduction" attribute is set to "none", the operator's output will be the above loss with shape (N, d1, d2, ..., dk). +If "reduction" attribute is set to "mean" (the default attribute value), the output loss is (weight) averaged: + +``` +mean(loss), if "weight" is not provided, +``` + +or if weight is provided, + +``` +sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples. +``` + +If "reduction" attribute is set to "sum", the output is a scalar: `sum(loss)`. + +See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss. + +Example 1: + +``` +// negative log likelihood loss, "none" reduction +N, C, d1 = 2, 3, 2 +input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] +target = [[2, 1], [0, 2]] + +loss = np.zeros((N, d1)) +for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] + +// print(loss) +// [[-3. -2.] +// [-0. 
-2.]] +``` + +Example 2: + +``` +// weighted negative log likelihood loss, sum reduction +N, C, d1 = 2, 3, 2 +input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] +target = [[2, 1], [0, 2]] +weight = [0.2, 0.3, 0.1] +loss = np.zeros((N, d1)) +for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] * weight[c] + +loss = np.sum(loss) +// print(loss) +// -1.1 +``` + +Example 3: + +``` +// weighted negative log likelihood loss, mean reduction +N, C, d1 = 2, 3, 2 +input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] +target = [[2, 1], [0, 2]] +weight = [0.2, 0.3, 0.1] +loss = np.zeros((N, d1)) +weight_total = 0 +for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] * weight[c] + weight_total = weight_total + weight[c] + +loss = np.sum(loss) / weight_total +// print(loss) +// -1.57 +``` +)DOC"; + ONNX_OPERATOR_SET_SCHEMA( NegativeLogLikelihoodLoss, - 13, + 22, OpSchema() - .SetDoc(NegativeLogLikelihoodLoss_ver13_doc) + .SetDoc(NegativeLogLikelihoodLoss_ver22_doc) .Input( 0, "input", @@ -2502,7 +2425,7 @@ ONNX_OPERATOR_SET_SCHEMA( false) .TypeConstraint( "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, + OpSchema::all_float_types_ir4(), "Constrain input, weight, and output types to floating-point tensors.") .TypeConstraint("Tind", {"tensor(int32)", "tensor(int64)"}, "Constrain target to integer types") .SetContextDependentFunctionBodyBuilder(BuildContextDependentFunctionBody) diff --git a/onnx/defs/math/old.cc b/onnx/defs/math/old.cc index 00048188888..c16e0419e2e 100644 --- a/onnx/defs/math/old.cc +++ b/onnx/defs/math/old.cc @@ -4,6 +4,7 @@ #include +#include "onnx/defs/data_type_utils.h" #include "onnx/defs/function.h" #include "onnx/defs/math/utils.h" #include "onnx/defs/schema.h" @@ -11,6 +12,978 @@ namespace ONNX_NAMESPACE { +bool BuildContextDependentFunctionBody_opset13( + const 
FunctionBodyBuildContext& ctx, + const OpSchema& schema, + FunctionProto& functionProto) { + if (ctx.getInputType(0) == nullptr) { + // we cannot create a correct function body without knowing the input type + return false; + } + auto input_type = ctx.getInputType(0)->tensor_type().elem_type(); + bool float_input = input_type == TensorProto_DataType_FLOAT; + auto reduction_attr_proto = ctx.getAttribute("reduction"); + std::string reduction_attr = + reduction_attr_proto != nullptr && reduction_attr_proto->has_s() ? reduction_attr_proto->s() : "mean"; + + FunctionBuilder builder(functionProto); + builder.Const1D("const_zero", int64_t(0)) + .Const1D("const_one", int64_t(1)) + .Const1D("axes", int64_t(1)) + .Add("expanded_target = Unsqueeze (target, axes)"); + + if (ctx.getAttribute("ignore_index") == nullptr) { + builder.Add(R"( + input_gather_element = GatherElements (input, expanded_target) + loss_NCdd = Neg (input_gather_element) + loss_N1dd = Slice (loss_NCdd, const_zero, const_one, const_one) + )"); + + if (!ctx.hasInput(2)) { + if (reduction_attr == "none") { + builder.Add("loss = Squeeze (loss_N1dd, axes)"); + } else { + builder.Add("loss_Ndd = Squeeze (loss_N1dd, axes)"); + if (reduction_attr == "mean") { + builder.Add("loss = ReduceMean (loss_Ndd)"); + } else { + builder.Add("loss = ReduceSum (loss_Ndd)"); + } + } + } else { + builder.Add("weight_gather = Gather (weight, target)"); + builder.Add("loss_unweighted = Squeeze (loss_N1dd, axes)"); + if (reduction_attr == "none") { + builder.Add("loss = Mul (loss_unweighted, weight_gather)"); + } else { + builder.Add("loss_Ndd = Mul (loss_unweighted, weight_gather)"); + if (reduction_attr == "mean") { + builder.Add(R"( + loss_sum = ReduceSum (loss_Ndd) + weight_gather_sum = ReduceSum (weight_gather) + loss = Div (loss_sum, weight_gather_sum) + )"); + } else { + builder.Add("loss = ReduceSum (loss_Ndd)"); + } + } + } + } else { + builder.Const1D("const_ignore_index", ctx.getAttribute("ignore_index")->i()); + 
builder.Add(R"( + const_zero_target_typed = Sub (expanded_target, expanded_target) + expanded_target_int64 = Cast (expanded_target) + mask = Equal (expanded_target_int64, const_ignore_index) + transform_targets = Where (mask, const_zero_target_typed, expanded_target) + )"); + builder.Add("input_gather_element = GatherElements (input, transform_targets)"); + builder.Const1D("const_zero_float", 0.0f); + if (!float_input) { + builder.Add("const_zero_casted = Cast (const_zero_float)", "to", static_cast(input_type)) + .Add("input_gather_element_transform = Where (mask, const_zero_casted, input_gather_element)"); + } else + builder.Add("input_gather_element_transform = Where (mask, const_zero_float, input_gather_element)"); + builder.Add("loss_NCdd = Neg (input_gather_element_transform)"); + builder.Add("loss_N1dd = Slice (loss_NCdd, const_zero, const_one, const_one)"); + + if (!ctx.hasInput(2)) { + builder.Add("squeeze_mask = Squeeze (mask, axes)"); + builder.Const1D("const_one_float", 1.0f); + if (!float_input) { + builder.Add("const_one_casted = Cast (const_one_float)", "to", static_cast(input_type)) + .Add("weight_gather = Where (squeeze_mask, const_zero_casted, const_one_casted)"); + } else + builder.Add("weight_gather = Where (squeeze_mask, const_zero_float, const_one_float)"); + + } else { + builder.Add("weight_gather_temp = Gather (weight, transform_targets)"); + builder.Add( + float_input ? 
"weight_gather_temp_1 = Where (mask, const_zero_float, weight_gather_temp)" + : "weight_gather_temp_1 = Where (mask, const_zero_casted, weight_gather_temp)"); + builder.Add("weight_gather = Squeeze (weight_gather_temp_1, axes)"); + } + + builder.Add("loss_unweighted = Squeeze (loss_N1dd, axes)"); + if (reduction_attr == "none") { + builder.Add("loss = Mul (loss_unweighted, weight_gather)"); + } else { + builder.Add("loss_Ndd = Mul (loss_unweighted, weight_gather)"); + if (reduction_attr == "mean") { + builder.Add(R"( + loss_sum = ReduceSum (loss_Ndd) + weight_gather_sum = ReduceSum (weight_gather) + loss = Div (loss_sum, weight_gather_sum) + )"); + } else { + builder.Add("loss = ReduceSum (loss_Ndd)"); + } + } + } + + schema.BuildFunction(functionProto); + return true; +} + +static const char* NegativeLogLikelihoodLoss_ver13_doc = R"DOC( +A NegativeLogLikelihoodLoss operator computes (weighted) negative log likelihood loss. +Its "input" tensor has the shape of (N, C, d1, d2, ..., dk) where k >= 0. +The "input" tensor contains log-probabilities for input[n, :, d_1, d_2,..., d_k] being in a class of [0, C). +The operator's "target" input tensor has the shape of (N, d1, d2, ..., dk). It encodes class labels (one of C classes) +or it may contain a special value (indicated by an attribute ignore_index) for N x d1 x d2 x ... x dk samples. +The loss value for input[n, :, d_1, d_2,...d_k] being classified as class c = target[n][d_1][d_2]...[d_k] is computed as: + +``` +loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k]. +``` + +When an optional "weight" is provided, the sample loss is calculated as: + +``` +loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c]. +``` + +loss is zero for the case when target-value equals ignore_index. + +``` +loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index +``` + +If "reduction" attribute is set to "none", the operator's output will be the above loss with shape (N, d1, d2, ..., dk). 
+If "reduction" attribute is set to "mean" (the default attribute value), the output loss is (weight) averaged: + +``` +mean(loss), if "weight" is not provided, +``` + +or if weight is provided, + +``` +sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples. +``` + +If "reduction" attribute is set to "sum", the output is a scalar: `sum(loss)`. + +See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss. + +Example 1: + +``` +// negative log likelihood loss, "none" reduction +N, C, d1 = 2, 3, 2 +input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] +target = [[2, 1], [0, 2]] + +loss = np.zeros((N, d1)) +for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] + +// print(loss) +// [[-3. -2.] +// [-0. -2.]] +``` + +Example 2: + +``` +// weighted negative log likelihood loss, sum reduction +N, C, d1 = 2, 3, 2 +input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] +target = [[2, 1], [0, 2]] +weight = [0.2, 0.3, 0.1] +loss = np.zeros((N, d1)) +for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] * weight[c] + +loss = np.sum(loss) +// print(loss) +// -1.1 +``` + +Example 3: + +``` +// weighted negative log likelihood loss, mean reduction +N, C, d1 = 2, 3, 2 +input = [[[1.0, 2.0], [2.0, 2.0], [3.0, 2.0]], + [[0.0, 1.0], [2.0, 2.0], [1.0, 2]]] +target = [[2, 1], [0, 2]] +weight = [0.2, 0.3, 0.1] +loss = np.zeros((N, d1)) +weight_total = 0 +for n in range(N): + for d_1 in range(d1): + c = target[n][d_1] + loss[n][d_1] = -input[n][c][d_1] * weight[c] + weight_total = weight_total + weight[c] + +loss = np.sum(loss) / weight_total +// print(loss) +// -1.57 +``` +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + NegativeLogLikelihoodLoss, + 13, + OpSchema() + .SetDoc(NegativeLogLikelihoodLoss_ver13_doc) + .Input( + 0, + "input", + "Input tensor of shape (N, C) or (N, C, d1, d2, ..., dk).", + "T", + OpSchema::Single, + 
true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "target", + "Target tensor of shape (N) or (N, d1, d2, ..., dk). Target element value shall be in range of [0, C). " + "If ignore_index is specified, it may have a value outside [0, C) and the target values should either be " + "in the range [0, C) or have the value ignore_index.", + "Tind", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Input( + 2, + "weight", + "Optional rescaling weight tensor. " + "If given, it has to be a tensor of size C. Otherwise, it is treated as if having all ones.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .Output(0, "loss", "The negative log likelihood loss", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Attr( + "reduction", + "Type of reduction to apply to loss: none, sum, mean (default). " + "'none': the output is the loss for each sample. " + "'sum': the output will be summed. " + "'mean': the sum of the output will be divided by the sum of applied weights.", + AttributeProto::STRING, + std::string("mean")) + .Attr( + "ignore_index", + "Specifies a target value that is ignored and does not contribute to the input gradient. 
It's an optional value.", + AttributeProto::INT, + false) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input, weight, and output types to floating-point tensors.") + .TypeConstraint("Tind", {"tensor(int32)", "tensor(int64)"}, "Constrain target to integer types") + .SetContextDependentFunctionBodyBuilder(BuildContextDependentFunctionBody_opset13) + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + // Type inference + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // Shape inference + if (hasNInputShapes(ctx, 2)) { + const TensorShapeProto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + const TensorShapeProto& target_shape = ctx.getInputType(1)->tensor_type().shape(); + + const int input_rank = static_cast(input_shape.dim_size()); + const int target_rank = static_cast(target_shape.dim_size()); + + if (input_rank < 2) { + fail_shape_inference("Input rank must be >= 2.") + } + if (target_rank != input_rank - 1) { + fail_shape_inference("Target rank must be 1 less than the input rank."); + } + + // match input dimensions (N, C, d1, ..., dk) with target + // dimensions of (C, d1, ..., dk) + for (int dim = 0; dim < target_rank; dim++) { + const auto input_dim = dim == 0 ? 
input_shape.dim(dim) : input_shape.dim(dim + 1); + const auto target_dim = target_shape.dim(dim); + if (input_dim.has_dim_value() && target_dim.has_dim_value() && + input_dim.dim_value() != target_dim.dim_value()) + fail_shape_inference("Input and target dimension value mismatch."); + } + + if (ctx.getNumInputs() == 3 && hasInputShape(ctx, 2)) { + const TensorShapeProto& weight_shape = ctx.getInputType(2)->tensor_type().shape(); + if (weight_shape.dim_size() != 1) { + fail_shape_inference("Weight rank must be 1."); + } + } + + TensorShapeProto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + if (getAttribute(ctx, "reduction", "mean") == "none") { + // output tensor is of shape (N, d1, d2, ..., dk) if + // reduction attribute is "none". + for (int i = 0; i < input_rank - 1; i++) { + auto* dim = output_shape->add_dim(); + if (i == 0) + *dim = input_shape.dim(i); + else + *dim = input_shape.dim(i + 1); + } + } + // otherwise output is a scalar. + } + })); + +static const char* Det_ver11_doc = R"DOC( +Det calculates determinant of a square matrix or batches of square matrices. +Det takes one input tensor of shape `[*, M, M]`, where `*` is zero or more batch dimensions, +and the inner-most 2 dimensions form square matrices. +The output is a tensor of shape `[*]`, containing the determinants of all input submatrices. +e.g., When the input is 2-D, the output is a scalar(shape is empty: `[]`). 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Det, + 11, + OpSchema() + .SetDoc(Det_ver11_doc) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to floating-point tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + // Type inference + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // Shape inference + if (hasInputShape(ctx, 0)) { + const TensorShapeProto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + TensorShapeProto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + const int rank = static_cast(input_shape.dim_size()); + + if (rank < 2) { + fail_shape_inference("Input rank must be >= 2."); + } + + const auto mat_w = input_shape.dim(rank - 1); + const auto mat_h = input_shape.dim(rank - 2); + if (mat_w.has_dim_value() && mat_h.has_dim_value() && (mat_w.dim_value() != mat_h.dim_value())) { + fail_shape_inference( + "The inner-most 2 dimensions must have the same size (mat_w:", + mat_w.dim_value(), + " != mat_h:", + mat_h.dim_value(), + ")."); + } + + for (int i = 0; i < rank - 2; ++i) { + auto* dim = output_shape->add_dim(); + *dim = input_shape.dim(i); + } + } + })); + +static const char* Round_ver11_doc = R"DOC( +Round takes one input Tensor and rounds the values, element-wise, meaning +it finds the nearest integer for each value. +In case of halves, the rule is to round them to the nearest even integer. +If input x is integral, +0, -0, NaN, or infinite, x itself is returned. +The output tensor has the same shape and type as the input. 
+ +Examples: +``` +round([0.9]) = [1.0] +round([2.5]) = [2.0] +round([2.3]) = [2.0] +round([1.5]) = [2.0] +round([-4.5]) = [-4.0] +``` +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Round, + 11, + OpSchema() + .SetDoc(Round_ver11_doc) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Atanh_ver9_doc = R"DOC( +Calculates the hyperbolic arctangent of the given input tensor element-wise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Atanh, + 9, + OpSchema() + .SetDoc(Atanh_ver9_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The hyperbolic arctangent values of the input tensor " + "computed element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Acosh_ver9_doc = R"DOC( +Calculates the hyperbolic arccosine of the given input tensor element-wise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Acosh, + 9, + OpSchema() + .SetDoc(Acosh_ver9_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The hyperbolic arccosine values of the input tensor " + "computed element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Asinh_ver9_doc = R"DOC( +Calculates the hyperbolic arcsine of the given input tensor element-wise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Asinh, + 9, + OpSchema() + .SetDoc(Asinh_ver9_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The hyperbolic arcsine values of the input tensor " + "computed element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Cosh_ver9_doc = R"DOC( +Calculates the hyperbolic cosine of the given input tensor element-wise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Cosh, + 9, + OpSchema() + .SetDoc(Cosh_ver9_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The hyperbolic cosine values of the input tensor " + "computed element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Sinh_ver9_doc = R"DOC( +Calculates the hyperbolic sine of the given input tensor element-wise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Sinh, + 9, + OpSchema() + .SetDoc(Sinh_ver9_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The hyperbolic sine values of the input tensor " + "computed element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Atan_ver7_doc = R"DOC( +Calculates the arctangent (inverse of tangent) of the given input tensor, element-wise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Atan, + 7, + OpSchema() + .SetDoc(Atan_ver7_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The arctangent of the input tensor computed " + "element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Acos_ver7_doc = R"DOC( +Calculates the arccosine (inverse of cosine) of the given input tensor, element-wise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Acos, + 7, + OpSchema() + .SetDoc(Acos_ver7_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The arccosine of the input tensor computed " + "element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Asin_ver7_doc = R"DOC( +Calculates the arcsine (inverse of sine) of the given input tensor, element-wise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Asin, + 7, + OpSchema() + .SetDoc(Asin_ver7_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The arcsine of the input tensor computed " + "element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Tan_ver7_doc = R"DOC( +Calculates the tangent of the given input tensor, element-wise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Tan, + 7, + OpSchema() + .SetDoc(Tan_ver7_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The tangent of the input tensor computed " + "element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Cos_ver7_doc = R"DOC( +Calculates the cosine of the given input tensor, element-wise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Cos, + 7, + OpSchema() + .SetDoc(Cos_ver7_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The cosine of the input tensor computed " + "element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Sin_ver7_doc = R"DOC( +Calculates the sine of the given input tensor, element-wise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Sin, + 7, + OpSchema() + .SetDoc(Sin_ver7_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The sine of the input tensor computed " + "element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Softplus_ver1_doc = R"DOC( +Softplus takes one input data (Tensor) and produces one output data +(Tensor) where the softplus function, y = ln(exp(x) + 1), is applied to +the tensor elementwise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Softplus, + 1, + OpSchema() + .SetDoc(Softplus_ver1_doc) + .Input(0, "X", "1D input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "1D input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody( + R"ONNX( + { + exp_x = Exp (X) + one = Constant () + one_cast = CastLike (one, X) + exp_x_add_one = Add (exp_x, one_cast) + Y = Log (exp_x_add_one) + } + )ONNX", + 18)); + +static const char* Softsign_ver1_doc = R"DOC( +Calculates the softsign (x/(1+|x|)) of the given input tensor element-wise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Softsign, + 1, + OpSchema() + .SetDoc(Softsign_ver1_doc) + .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "The softsign (x/(1+|x|)) values of the input tensor computed element-wise", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody( + R"ONNX( + { + One = Constant () + OneCast = CastLike (One, input) + AbsInput = Abs(input) + OneAddAbsInput = Add (OneCast, AbsInput) + output = Div(input, OneAddAbsInput) + } + )ONNX", + 18)); + +static const char* HardSwish_ver14_doc = R"DOC( +HardSwish takes one input data (Tensor) and produces one output data (Tensor) where +the HardSwish function, y = x * max(0, min(1, alpha * x + beta)) = x * HardSigmoid(x), +where alpha = 1/6 and beta = 0.5, is applied to the tensor elementwise. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + HardSwish, + 14, + OpSchema() + .SetDoc(HardSwish_ver14_doc) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody(R"ONNX( + { + HS_X = HardSigmoid(X) + Y = Mul (X, HS_X) + } + )ONNX")); + +static const char* HardSigmoid_ver6_doc = R"DOC( +HardSigmoid takes one input data (Tensor) and produces one output data +(Tensor) where the HardSigmoid function, y = max(0, min(1, alpha * x + beta)), +is applied to the tensor elementwise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + HardSigmoid, + 6, + OpSchema() + .Attr("alpha", "Value of alpha.", AttributeProto::FLOAT, 0.2f) + .Attr("beta", "Value of beta.", AttributeProto::FLOAT, 0.5f) + .SetDoc(HardSigmoid_ver6_doc) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody( + R"ONNX( + { + Alpha = Constant () + AlphaCast = CastLike (Alpha, X) + Beta = Constant () + BetaCast = CastLike (Beta, X) + Zero = Constant () + ZeroCast = CastLike (Zero, X) + One = Constant () + OneCast = CastLike (One, X) + AlphaMulX = Mul (X, AlphaCast) + AlphaMulXAddBeta = Add (AlphaMulX, BetaCast) + MinOneOrAlphaMulXAddBeta = Min (AlphaMulXAddBeta, OneCast) + Y = Max(MinOneOrAlphaMulXAddBeta, ZeroCast) + } + )ONNX", + 18)); + +static const char* mish_ver18_doc = R"DOC( +Mish: A Self Regularized Non-Monotonic Neural Activation Function. 
+ +Perform the linear unit element-wise on the input tensor X using formula: + +``` +mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x})) +``` +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Mish, + 18, + OpSchema() + .SetDoc(mish_ver18_doc) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input X and output types to float tensors.") + .FunctionBody(R"ONNX( + { + Softplus_X = Softplus (X) + TanHSoftplusX = Tanh (Softplus_X) + Y = Mul (X, TanHSoftplusX) + } + )ONNX") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + +static const char* Elu_ver6_doc = R"DOC( +Elu takes one input data (Tensor) and produces one output data +(Tensor) where the function `f(x) = alpha * (exp(x) - 1.) for x < +0`, `f(x) = x for x >= 0`., is applied to the tensor elementwise. 
+ +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Elu, + 6, + OpSchema() + .Attr("alpha", "Coefficient of ELU.", AttributeProto::FLOAT, 1.0f) + .SetDoc(Elu_ver6_doc) + .Input(0, "X", "1D input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "1D output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody( + R"ONNX( + { + Alpha = Constant () + AlphaCast = CastLike (Alpha, X) + Zero = Constant () + ZeroCast = CastLike (Zero, X) + One = Constant () + OneCast = CastLike (One, X) + XLessThanZero = Less (X, ZeroCast) + ExpX = Exp (X) + ExpXSubOne = Sub (ExpX, OneCast) + AlphaMulExpXSubOne = Mul (AlphaCast, ExpXSubOne) + Y = Where(XLessThanZero, AlphaMulExpXSubOne, X) + } + )ONNX", + 18)); + +static const char* Selu_ver6_doc = R"DOC( +Selu takes one input data (Tensor) and produces one output data +(Tensor) where the scaled exponential linear unit function, +`y = gamma * (alpha * e^x - alpha) for x <= 0`, `y = gamma * x for x > 0`, +is applied to the tensor elementwise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Selu, + 6, + OpSchema() + .Attr( + "alpha", + "Coefficient of SELU default to 1.67326319217681884765625 " + "(i.e., float32 approximation of 1.6732632423543772848170429916717).", + AttributeProto::FLOAT, + 1.67326319217681884765625f) + .Attr( + "gamma", + "Coefficient of SELU default to 1.05070102214813232421875 " + "(i.e., float32 approximation of 1.0507009873554804934193349852946).", + AttributeProto::FLOAT, + 1.05070102214813232421875f) + .SetDoc(Selu_ver6_doc) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody( + R"ONNX( + { + Alpha = Constant () + AlphaCast = CastLike (Alpha, X) + Gamma = Constant () + GammaCast = CastLike (Gamma, X) + Zero = Constant () + ZeroCast = CastLike (Zero, X) + ExpX = Exp (X) + AlphaMulExpX = Mul(AlphaCast, ExpX) + AlphaMulExpXSubAlpha = Sub (AlphaMulExpX, AlphaCast) + Neg = Mul (GammaCast, AlphaMulExpXSubAlpha) + Pos = Mul (GammaCast, X) + XLessThanZero = Less (X, ZeroCast) + Y = Where(XLessThanZero, Neg, Pos) + } + )ONNX", + 18)); + +static const char* ThresholdedRelu_ver10_doc = R"DOC( +ThresholdedRelu takes one input data (Tensor) and produces one output data +(Tensor) where the rectified linear function, y = x for x > alpha, y = 0 otherwise, +is applied to the tensor elementwise. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + ThresholdedRelu, + 10, + OpSchema() + .SetDoc(ThresholdedRelu_ver10_doc) + .Attr("alpha", "Threshold value", AttributeProto::FLOAT, 1.0f) + .Input(0, "X", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "Y", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput) + .FunctionBody( + R"ONNX( + { + Alpha = Constant () + AlphaCast = CastLike (Alpha, X) + Zero = Constant () + ZeroCast = CastLike (Zero, X) + AlphaLessThanX = Less(AlphaCast, X) + Y = Where(AlphaLessThanX, X, ZeroCast) + } + )ONNX", + 18)); + std::function MathDocGenerator_opset13(const char* name) { return [=](OpSchema& schema) { std::string doc; diff --git a/onnx/defs/math/utils.cc b/onnx/defs/math/utils.cc index e2c6b809673..6b96bace164 100644 --- a/onnx/defs/math/utils.cc +++ b/onnx/defs/math/utils.cc @@ -11,6 +11,17 @@ namespace defs { namespace math { namespace utils { +int MathOpTwoIntegers(std::string op_type, int a, int b) { + if (op_type == "Add") { + return a + b; + } else if (op_type == "Sub") { + return a - b; + } else if (op_type == "Mul") { + return a * b; + } + fail_shape_inference("Wrong op_type name for running propagation: ", op_type); +} + void MatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx) { if (!hasInputShape(ctx, input1Idx) || !hasInputShape(ctx, input2Idx)) { return; diff --git a/onnx/defs/math/utils.h b/onnx/defs/math/utils.h index 9bbb66be7c6..1151c55ef24 100644 --- a/onnx/defs/math/utils.h +++ b/onnx/defs/math/utils.h @@ -40,6 +40,8 @@ void QLinearMatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx); const char* QLinearMatMulDoc(); +int MathOpTwoIntegers(std::string op_type, int a, int b); + } // namespace utils } // namespace 
math } // namespace defs diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc index 9757edb3a69..be6a851dc25 100644 --- a/onnx/defs/nn/defs.cc +++ b/onnx/defs/nn/defs.cc @@ -192,9 +192,9 @@ void convPoolShapeInference( std::vector GetSupportedDataTypesForPoolingOps(bool supports8bit) { if (supports8bit) { - return {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", "tensor(uint8)"}; + return OpSchema::all_float_types_plus_Xint8_ir4(); } - return {"tensor(float16)", "tensor(float)", "tensor(double)"}; + return OpSchema::all_float_types_ir4(); } std::function PoolOpSchemaGenerator( @@ -313,7 +313,7 @@ std::function PoolOpSchemaGenerator( ONNX_OPERATOR_SET_SCHEMA( AveragePool, - 19, + 22, OpSchema() .FillUsing(PoolOpSchemaGenerator( "AveragePool", @@ -334,7 +334,7 @@ ONNX_OPERATOR_SET_SCHEMA( ONNX_OPERATOR_SET_SCHEMA( MaxPool, - 12, + 22, OpSchema() .FillUsing(PoolOpSchemaGenerator( "MaxPool", @@ -454,7 +454,7 @@ void maxUnpoolShapeInference(InferenceContext& ctx) { } } -static const char* MaxUnpool_ver11_doc = R"DOC( +static const char* MaxUnpool_ver22_doc = R"DOC( MaxUnpool essentially computes the partial inverse of the MaxPool op. The input information to this op is typically the output information from a MaxPool op. 
The first input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output) @@ -477,9 +477,9 @@ In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape ONNX_OPERATOR_SET_SCHEMA( MaxUnpool, - 11, + 22, OpSchema() - .SetDoc(MaxUnpool_ver11_doc) + .SetDoc(MaxUnpool_ver22_doc) .Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS) .Attr( "strides", @@ -541,10 +541,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T1", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeConstraint("T2", {"tensor(int64)"}, "Constrain index tensor to int64") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { maxUnpoolShapeInference(ctx); })); @@ -624,10 +621,7 @@ std::function LpPoolOpSchemaGenerator(const char* name) { true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors."); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); convPoolShapeInference(ctx, true, true, 0, 1); @@ -635,7 +629,7 @@ std::function LpPoolOpSchemaGenerator(const char* name) { }; } -ONNX_OPERATOR_SET_SCHEMA(LpPool, 18, OpSchema().FillUsing(LpPoolOpSchemaGenerator("LpPool"))); +ONNX_OPERATOR_SET_SCHEMA(LpPool, 22, OpSchema().FillUsing(LpPoolOpSchemaGenerator("LpPool"))); // For ROI pool operations. 
void roiPoolTypeShapeInference(InferenceContext& ctx) { @@ -725,15 +719,12 @@ std::function RoiPoolOpSchemaGenerator(const char* name) { true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors."); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { roiPoolTypeShapeInference(ctx); }); }; } -ONNX_OPERATOR_SET_SCHEMA(MaxRoiPool, 1, OpSchema().FillUsing(RoiPoolOpSchemaGenerator("max"))); +ONNX_OPERATOR_SET_SCHEMA(MaxRoiPool, 22, OpSchema().FillUsing(RoiPoolOpSchemaGenerator("max"))); std::function ConvOpSchemaGenerator(const char* filter_desc) { return [=](OpSchema& schema) { @@ -806,10 +797,7 @@ computes the output.)DOC"; true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors."); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); schema.Attr( "kernel_shape", "The shape of the convolution kernel. 
If not present, should be inferred from input W.", @@ -839,7 +827,7 @@ computes the output.)DOC"; }; } -ONNX_OPERATOR_SET_SCHEMA(Conv, 11, OpSchema().FillUsing(ConvOpSchemaGenerator("a filter"))); +ONNX_OPERATOR_SET_SCHEMA(Conv, 22, OpSchema().FillUsing(ConvOpSchemaGenerator("a filter"))); static const char* QLinearConv_ver10_doc = R"DOC( The convolution operator consumes a quantized input tensor, its scale and zero point, @@ -1322,10 +1310,7 @@ output_shape can also be explicitly specified in which case pads values are auto true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors."); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); schema.Attr( "kernel_shape", "The shape of the convolution kernel. If not present, should be inferred from input W.", @@ -1373,18 +1358,18 @@ output_shape can also be explicitly specified in which case pads values are auto }; } -ONNX_OPERATOR_SET_SCHEMA(ConvTranspose, 11, OpSchema().FillUsing(ConvTransposeOpSchemaGenerator("a filter"))); +ONNX_OPERATOR_SET_SCHEMA(ConvTranspose, 22, OpSchema().FillUsing(ConvTransposeOpSchemaGenerator("a filter"))); -static const char* DeformConv_ver19_doc = R"DOC( +static const char* DeformConv_ver22_doc = R"DOC( Performs deformable convolution as described in https://arxiv.org/abs/1703.06211 and https://arxiv.org/abs/1811.11168. This operator specification supports the general N-D case. Note that most common use cases have 2D or 3D data. )DOC"; ONNX_OPERATOR_SET_SCHEMA( DeformConv, - 19, + 22, OpSchema() - .SetDoc(DeformConv_ver19_doc) + .SetDoc(DeformConv_ver22_doc) .Input( 0, "X", @@ -1429,10 +1414,7 @@ ONNX_OPERATOR_SET_SCHEMA( "Output data tensor that contains the result of convolution. 
It has shape (N, oC, oH, oW) " "for 2D data or (N, oC, o1, o2, ..., on) for nD data", "T") - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .Attr( "dilations", "Dilation value along each spatial axis of the kernel. Default is 1 along each axis.", @@ -1537,18 +1519,15 @@ std::function GlobalPoolingOpSchemaGenerator(const char* op_typ true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors."); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { globalPoolTypeShapeInference(ctx); }); }; } ONNX_OPERATOR_SET_SCHEMA( GlobalAveragePool, - 1, + 22, OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator("AveragePool", "average"))); -ONNX_OPERATOR_SET_SCHEMA(GlobalMaxPool, 1, OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator("MaxPool", "max"))); +ONNX_OPERATOR_SET_SCHEMA(GlobalMaxPool, 22, OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator("MaxPool", "max"))); std::function GlobalLpPoolingOpSchemaGenerator(const char* op_type, const char* op) { return [=](OpSchema& schema) { @@ -1597,7 +1576,7 @@ std::function GlobalLpPoolingOpSchemaGenerator(const char* op_t }; } -ONNX_OPERATOR_SET_SCHEMA(GlobalLpPool, 2, OpSchema().FillUsing(GlobalLpPoolingOpSchemaGenerator("LpPool", "lp pool"))); +ONNX_OPERATOR_SET_SCHEMA(GlobalLpPool, 22, OpSchema().FillUsing(GlobalLpPoolingOpSchemaGenerator("LpPool", "lp pool"))); static const char* BatchNormalization_ver15_doc = R"DOC( Carries out batch normalization as described in the paper @@ -1779,7 +1758,7 @@ ONNX_OPERATOR_SET_SCHEMA( } })); -static const char* InstanceNormalization_ver6_doc = R"DOC( 
+static const char* InstanceNormalization_ver22_doc = R"DOC( Carries out instance normalization as described in the paper https://arxiv.org/abs/1607.08022. @@ -1790,9 +1769,9 @@ where mean and variance are computed per instance per channel. ONNX_OPERATOR_SET_SCHEMA( InstanceNormalization, - 6, + 22, OpSchema() - .SetDoc(InstanceNormalization_ver6_doc) + .SetDoc(InstanceNormalization_ver22_doc) .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) .Input( 0, @@ -1837,27 +1816,21 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateShapeAndTypeFromFirstInput(ctx); })); -static const char* LpNormalization_ver1_doc = R"DOC( +static const char* LpNormalization_ver22_doc = R"DOC( Given a matrix, apply Lp-normalization along the provided axis. 
)DOC"; ONNX_OPERATOR_SET_SCHEMA( LpNormalization, - 1, + 22, OpSchema() .Input(0, "input", "Input matrix", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "output", "Matrix after normalization", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors.") - .SetDoc(LpNormalization_ver1_doc) + .TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.") + .SetDoc(LpNormalization_ver22_doc) .Attr( "axis", "The axis on which to apply normalization, -1 mean last axis.", @@ -1870,7 +1843,7 @@ ONNX_OPERATOR_SET_SCHEMA( static_cast(2)) .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateShapeAndTypeFromFirstInput(ctx); })); -static const char* Dropout_ver13_doc = R"DOC( +static const char* Dropout_ver22_doc = R"DOC( Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs, output (floating-point tensor) and mask (optional `Tensor`). If `training_mode` is true then the output Y will be a random dropout; Note that this Dropout scales the masked input data by the following equation, so to convert the trained model into inference mode, @@ -1886,9 +1859,9 @@ scale = 1. / (1. - ratio). 
ONNX_OPERATOR_SET_SCHEMA( Dropout, - 13, + 22, OpSchema() - .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver13_doc) + GenerateOptionalArgumentsDoc())) + .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver22_doc) + GenerateOptionalArgumentsDoc())) .Attr( "seed", "(Optional) Seed to the random generator, if not specified we will auto generate one.", @@ -1920,14 +1893,8 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::NonDifferentiable) .Output(0, "output", "The output.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(1, "mask", "The output mask.", "T2", OpSchema::Optional, true, 1, OpSchema::NonDifferentiable) - .TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, - "Constrain input and output types to float tensors.") - .TypeConstraint( - "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input 'ratio' types to float tensors.") + .TypeConstraint("T", OpSchema::all_float_types_ir10(), "Constrain input and output types to float tensors.") + .TypeConstraint("T1", OpSchema::all_float_types_ir10(), "Constrain input 'ratio' types to float tensors.") .TypeConstraint("T2", {"tensor(bool)"}, "Constrain output 'mask' types to boolean tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc index 4c12dc94b23..57f8e2a4fdf 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc @@ -9,6 +9,1537 @@ namespace ONNX_NAMESPACE { +// For GlobalPool operations. +void globalPoolTypeShapeInference_opset2(InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // needs at least one input with shape. + if (!hasNInputShapes(ctx, 1)) { + return; + } + + auto input_shape = ctx.getInputType(0)->tensor_type().shape(); + if (input_shape.dim_size() < 2) { + return; + } + + // first dim is the batch axis and the next is the number of channels. 
+ size_t n_input_dims = static_cast(input_shape.dim_size() - 2); + + // (N, C, 1, 1, ..., 1) + auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + *output_shape->add_dim() = input_shape.dim(0); + *output_shape->add_dim() = input_shape.dim(1); + + for (size_t i = 0; i < n_input_dims; ++i) { + output_shape->add_dim()->set_dim_value(1); + } +} + +std::function GlobalLpPoolingOpSchemaGenerator_opset2(const char* op_type, const char* op) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR(doc = R"DOC( + Global{op_type} consumes an input tensor X and applies {op} pooling across + the values in the same channel. This is equivalent to {op_type} with kernel size + equal to the spatial dimension of input tensor.)DOC"; + ReplaceAll(doc, "{op_type}", op_type); + ReplaceAll(doc, "{op}", op);); + schema.SetDoc(doc); + schema.Attr( + "p", "p value of the Lp norm used to pool over the input data.", AttributeProto::INT, static_cast(2)); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the width " + "of the data. For non image case, the dimensions are " + "in the form of (N x C x D1 x D2 ... Dn), " + "where N is the batch size.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 0, + "Y", + "Output data tensor from pooling across the input " + "tensor. The output tensor has the same rank as the input. 
" + "The first two dimensions of output shape are the same as " + "the input (N x C), while the other dimensions are all 1.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); + schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { globalPoolTypeShapeInference_opset2(ctx); }); + }; +} + +ONNX_OPERATOR_SET_SCHEMA( + GlobalLpPool, + 2, + OpSchema().FillUsing(GlobalLpPoolingOpSchemaGenerator_opset2("LpPool", "lp pool"))); + +const char* pads_doc_opset11 = + "Padding for the beginning and ending along each spatial axis, it can take any value greater " + "than or equal to 0. The value represent the number of pixels added to the beginning " + "and end part of the corresponding axis. `pads` format should be as follow " + "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels " + "added at the beginning of axis `i` and xi_end, the number of pixels added at " + "the end of axis `i`. This attribute cannot be used simultaneously with " + "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis."; +const char* conv_auto_pad_doc_opset19 = + "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where " + "default value is NOTSET, which means explicit padding is used. " + "SAME_UPPER or SAME_LOWER mean pad the input so that " + "`output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. " + "The padding is split between the two sides equally or almost equally (depending " + "on whether it is even or odd). In case the padding is an odd number, the extra " + "padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER."; +const char* conv_transpose_auto_pad_doc_opset19 = + "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where " + "default value is NOTSET, which means explicit padding is used. 
" + "SAME_UPPER or SAME_LOWER mean pad the input so that " + "`output_shape[i] = input_shape[i] * strides[i]` for each axis `i`. " + "The padding is split between the two sides equally or almost equally (depending " + "on whether it is even or odd). In case the padding is an odd number, the extra " + "padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER."; + +void convPoolShapeInference_opset19( + InferenceContext& ctx, + bool use_dilation, + bool require_kernel_shape, + int input1Idx, + int input2Idx) { + // we need the first input shape for this inference. + if (!hasInputShape(ctx, input1Idx)) { + return; + } + + // if kernel shape is an input (and not attribute) + // we need the shape of the second input. + if (!require_kernel_shape && !hasInputShape(ctx, input2Idx)) { + return; + } + + auto input_shape = ctx.getInputType(input1Idx)->tensor_type().shape(); + if (input_shape.dim_size() < 2) { + fail_shape_inference("Input tensor must have at least 2 dimensions"); + } + + // first dim is the batch axis and the next is the number of channels. + size_t n_input_dims = static_cast(input_shape.dim_size() - 2); + + // Only MaxPool and Conv support dilation. For + // simplicity of the code, we just treat the rest of them as having all-1s + // dilation. 
+ std::vector dilations; + if (use_dilation && getRepeatedAttribute(ctx, "dilations", dilations)) { + if (dilations.size() != n_input_dims) { + fail_shape_inference("Attribute dilations has incorrect size"); + } + } else { + dilations.assign(n_input_dims, 1); + } + + std::vector strides; + if (getRepeatedAttribute(ctx, "strides", strides)) { + if (strides.size() != n_input_dims) { + fail_shape_inference("Attribute strides has incorrect size"); + } + } else { + strides.assign(n_input_dims, 1); + } + + std::vector kernel_shape; + if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) { + if (kernel_shape.size() != n_input_dims) { + fail_shape_inference("Attribute kernel_shape has incorrect size"); + } + } else if (require_kernel_shape) { + fail_shape_inference("Attribute kernel_shape must be specified"); + } else { + auto second_input_shape = ctx.getInputType(input2Idx)->tensor_type().shape(); + for (int i = 2; i < second_input_shape.dim_size(); ++i) { + if (!second_input_shape.dim(i).has_dim_value()) { + return; + } + kernel_shape.push_back(second_input_shape.dim(i).dim_value()); + } + } + + std::vector effective_kernel_shape = kernel_shape; + for (int i = 0; i < static_cast(kernel_shape.size()); i++) { + // accounting for dilation, how big is the kernel in this dimension + effective_kernel_shape[i] = (effective_kernel_shape[i] - 1) * dilations[i] + 1; + } + + std::vector pads; + if (getRepeatedAttribute(ctx, "pads", pads)) { + if (pads.size() != n_input_dims * 2) { + fail_shape_inference("Attribute pads has incorrect size"); + } + } else { + pads.assign(n_input_dims * 2, 0); + const auto* auto_pad_attr = ctx.getAttribute("auto_pad"); + if ((nullptr != auto_pad_attr) && (auto_pad_attr->s() != "VALID")) { + int input_dims_size = static_cast(n_input_dims); + for (int i = 0; i < input_dims_size; ++i) { + int64_t residual = 0; + int64_t stride = strides[i]; + if (stride > 1) { + if (!input_shape.dim(2 + i).has_dim_value()) { + continue; + } + residual = 
input_shape.dim(2 + i).dim_value(); + while (residual >= stride) { + residual -= stride; + } + } + int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual; + if (total_pad < 0) + total_pad = 0; + int64_t half_pad_small = total_pad >> 1; + int64_t half_pad_big = total_pad - half_pad_small; + if (auto_pad_attr->s() == "SAME_UPPER") { + pads[i] = half_pad_small; + pads[i + input_dims_size] = half_pad_big; + } else if (auto_pad_attr->s() == "SAME_LOWER") { + pads[i] = half_pad_big; + pads[i + input_dims_size] = half_pad_small; + } + } + } + } + + auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + if (require_kernel_shape) { + // add the first two dimensions from the input. + *output_shape->add_dim() = input_shape.dim(0); + *output_shape->add_dim() = input_shape.dim(1); + } else { + *output_shape->add_dim() = input_shape.dim(0); + auto& second_input_shape = getInputShape(ctx, input2Idx); + if (second_input_shape.dim_size() < 1) { + fail_shape_inference("Second input tensor has wrong dimension"); + } + *output_shape->add_dim() = second_input_shape.dim(0); + } + + int kernel_shape_size = static_cast(kernel_shape.size()); + for (int i = 0; i < kernel_shape_size; ++i) { + auto newdim = output_shape->add_dim(); + if (!input_shape.dim(2 + i).has_dim_value()) { + continue; + } + // how big is the input, including padding + int64_t effective_input_size = input_shape.dim(2 + i).dim_value(); + effective_input_size += pads[i]; + effective_input_size += pads[i + kernel_shape_size]; + + // default is floor mode .i.e. 
ceil_mode is set to 0 + auto ceil_mode = getAttribute(ctx, "ceil_mode", 0); + + // how many times we can move the kernel from it's initial position, based + // on the stride + int64_t strided_kernel_positions; + + if (ceil_mode == 1) + strided_kernel_positions = + (int64_t)(std::ceil((effective_input_size - effective_kernel_shape[i]) / float(strides[i]))); + else + strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) / strides[i]; + + // add in the initial position + newdim->set_dim_value(1 + strided_kernel_positions); + } + + if (ctx.getNumOutputs() > 1) { + // MaxPool with two outputs case. + auto second_output_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape(); + second_output_shape->CopyFrom(*output_shape); + } +} + +static const char* Dropout_ver13_doc = R"DOC( +Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs, +output (floating-point tensor) and mask (optional `Tensor`). If `training_mode` is true then the output Y will be a random dropout; +Note that this Dropout scales the masked input data by the following equation, so to convert the trained model into inference mode, +the user can simply not pass `training_mode` input or set it to false. +``` +output = scale * data * mask, +``` +where +``` +scale = 1. / (1. - ratio). +``` +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Dropout, + 13, + OpSchema() + .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver13_doc) + GenerateOptionalArgumentsDoc())) + .Attr( + "seed", + "(Optional) Seed to the random generator, if not specified we will auto generate one.", + AttributeProto::INT, + OPTIONAL_VALUE) + .Input(0, "data", "The input data as Tensor.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Input( + 1, + "ratio", + "The ratio of random dropout, with value in [0, 1). 
If this input was not set, " + "or if it was set to 0, the output would be a simple copy of the input. " + "If it's non-zero, output will be a random dropout of the scaled input, which is typically " + "the case during training. It is an optional value, if not specified it will default to 0.5.", + "T1", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .Input( + 2, + "training_mode", + "If set to true then it indicates dropout is being used for training. It is an optional value hence unless " + "specified explicitly, it is false. If it is false, ratio is ignored and the operation mimics inference mode where " + "nothing will be dropped from the input data and if mask is requested as output it will contain all ones.", + "T2", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .Output(0, "output", "The output.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(1, "mask", "The output mask.", "T2", OpSchema::Optional, true, 1, OpSchema::NonDifferentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, + "Constrain input and output types to float tensors.") + .TypeConstraint( + "T1", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input 'ratio' types to float tensors.") + .TypeConstraint("T2", {"tensor(bool)"}, "Constrain output 'mask' types to boolean tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (hasInputShape(ctx, 0)) { + propagateShapeFromInputToOutput(ctx, 0, 0); + } + + if (ctx.getNumInputs() > 1 && hasInputShape(ctx, 1)) { + auto& ratio_input_shape = getInputShape(ctx, 1); + if (static_cast(ratio_input_shape.dim_size()) != 0) { + fail_shape_inference("Ratio of Dropout must be a scalar."); + } + } + + if (ctx.getNumInputs() > 2 && hasInputShape(ctx, 2)) { + auto& training_mode_input_shape = getInputShape(ctx, 2); + if 
(static_cast(training_mode_input_shape.dim_size()) != 0) { + fail_shape_inference("training_mode of Dropout must be a scalar."); + } + } + + if (ctx.getNumOutputs() == 2) { + updateOutputElemType(ctx, 1, TensorProto::BOOL); + if (hasNInputShapes(ctx, 1)) { + propagateShapeFromInputToOutput(ctx, 0, 1); + } + } + })); + +static const char* LpNormalization_ver1_doc = R"DOC( +Given a matrix, apply Lp-normalization along the provided axis. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + LpNormalization, + 1, + OpSchema() + .Input(0, "input", "Input matrix", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "output", "Matrix after normalization", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .SetDoc(LpNormalization_ver1_doc) + .Attr( + "axis", + "The axis on which to apply normalization, -1 mean last axis.", + AttributeProto::INT, + static_cast(-1)) + .Attr( + "p", + "The order of the normalization, only 1 or 2 are supported.", + AttributeProto::INT, + static_cast(2)) + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateShapeAndTypeFromFirstInput(ctx); })); + +static const char* InstanceNormalization_ver6_doc = R"DOC( +Carries out instance normalization as described in the paper +https://arxiv.org/abs/1607.08022. + +y = scale * (x - mean) / sqrt(variance + epsilon) + B, +where mean and variance are computed per instance per channel. 
+ +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + InstanceNormalization, + 6, + OpSchema() + .SetDoc(InstanceNormalization_ver6_doc) + .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f) + .Input( + 0, + "input", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the " + "width of the data. For non image case, the " + "dimensions are in the form of " + "(N x C x D1 x D2 ... Dn), where N is the batch " + "size.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "scale", + "The input 1-dimensional scale tensor of size C.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 2, + "B", + "The input 1-dimensional bias tensor of size C.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Output( + 0, + "output", + "The output tensor of the same shape as input.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateShapeAndTypeFromFirstInput(ctx); })); + +void maxUnpoolShapeInference_opset11(InferenceContext& ctx) { + // we need at least two inputs to have a shape for this inference. + if (ctx.getNumInputs() != 2 && ctx.getNumInputs() != 3) { + fail_type_inference("MaxUnpool op must have either two or three inputs."); + } + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasInputShape(ctx, 0)) { + return; // If first input does not have shape, we cannot infer much. 
+ } + auto input_shape = ctx.getInputType(0)->tensor_type().shape(); + if (input_shape.dim_size() < 2) { + fail_shape_inference("Input tensor X must have at least 2 dimensions."); + } + + // first dim is the batch axis and the next is the number of channels. + size_t n_input_dims = static_cast(input_shape.dim_size() - 2); + + std::vector pads; + if (getRepeatedAttribute(ctx, "pads", pads)) { + if (pads.size() != n_input_dims * 2) { + fail_shape_inference("Attribute pads has incorrect size."); + } + } else { + pads.assign(n_input_dims * 2, 0); + } + + std::vector strides; + if (getRepeatedAttribute(ctx, "strides", strides)) { + if (strides.size() != n_input_dims) { + fail_shape_inference("Attribute strides has incorrect size."); + } + } else { + strides.assign(n_input_dims, 1); + } + + std::vector kernel_shape; + if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) { + if (kernel_shape.size() != n_input_dims) { + fail_shape_inference("Attribute kernel_shape has incorrect size."); + } + } else { + fail_shape_inference("Attribute kernel_shape must be specified."); + } + + if (ctx.getNumInputs() == 3) { + // If the third input, output_size, is specified, then use that instead + // of inferring shape from inputs. + if (hasInputShape(ctx, 2)) { + auto& output_shape = getInputShape(ctx, 2); + if (output_shape.dim_size() != 1) { + fail_type_inference("'output_shape' must be rank 1 tensor."); + } + if (output_shape.dim(static_cast(0)).has_dim_value() && + static_cast(output_shape.dim(static_cast(0)).dim_value()) != input_shape.dim_size()) { + fail_shape_inference("'output_shape' must have same number of elements as the shape of input tensor X."); + } + } + return; // 'output_shape' is specified as input. Actual shape will be + // determined at runtime. 
+ } + + auto final_output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + *final_output_shape->add_dim() = input_shape.dim(0); + *final_output_shape->add_dim() = + ctx.getInputType(1)->tensor_type().shape().dim(1); // channels should be the second dim of second input. + + int kernel_shape_size = static_cast(kernel_shape.size()); + for (int i = 0; i < kernel_shape_size; ++i) { + auto newdim = final_output_shape->add_dim(); + if (!input_shape.dim(2 + i).has_dim_value()) { + continue; + } + + int64_t newdim_value = strides[i] * (input_shape.dim(2 + i).dim_value() - 1); + newdim_value += kernel_shape[i]; + newdim_value -= pads[i]; + newdim_value -= pads[i + kernel_shape_size]; + + // add in the initial position + newdim->set_dim_value(newdim_value); + } +} + +// For GlobalPool operations. +void globalPoolTypeShapeInference_opset1(InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // needs at least one input with shape. + if (!hasNInputShapes(ctx, 1)) { + return; + } + + auto input_shape = ctx.getInputType(0)->tensor_type().shape(); + if (input_shape.dim_size() < 2) { + return; + } + + // first dim is the batch axis and the next is the number of channels. + size_t n_input_dims = static_cast(input_shape.dim_size() - 2); + + // (N, C, 1, 1, ..., 1) + auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + *output_shape->add_dim() = input_shape.dim(0); + *output_shape->add_dim() = input_shape.dim(1); + + for (size_t i = 0; i < n_input_dims; ++i) { + output_shape->add_dim()->set_dim_value(1); + } +} + +std::function GlobalPoolingOpSchemaGenerator_opset1(const char* op_type, const char* op) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR(doc = R"DOC( + Global{op_type} consumes an input tensor X and applies {op} pooling across + the values in the same channel. 
This is equivalent to {op_type} with kernel size + equal to the spatial dimension of input tensor.)DOC"; + ReplaceAll(doc, "{op_type}", op_type); + ReplaceAll(doc, "{op}", op);); + schema.SetDoc(doc); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the width " + "of the data. For non image case, the dimensions are " + "in the form of (N x C x D1 x D2 ... Dn), " + "where N is the batch size.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 0, + "Y", + "Output data tensor from pooling across the input " + "tensor. The output tensor has the same rank as the input. " + "The first two dimensions of output shape are the same as " + "the input (N x C), while the other dimensions are all 1.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors."); + schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { globalPoolTypeShapeInference_opset1(ctx); }); + }; +} +ONNX_OPERATOR_SET_SCHEMA( + GlobalAveragePool, + 1, + OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator_opset1("AveragePool", "average"))); +ONNX_OPERATOR_SET_SCHEMA( + GlobalMaxPool, + 1, + OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator_opset1("MaxPool", "max"))); + +void convTransposeShapeInference_opset11(InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // we need at least two inputs to have a shape for this inference. + if (!hasNInputShapes(ctx, 2)) { + return; + } + + int64_t group = getAttribute(ctx, "group", 1); + + auto input_shape = ctx.getInputType(0)->tensor_type().shape(); + if (input_shape.dim_size() < 2) { + return; // Input tensor should have at least two dimensions. 
+ } + + // first dim is the batch axis and the next is the number of channels. + size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2); + + std::vector<int64_t> dilations; + if (getRepeatedAttribute(ctx, "dilations", dilations)) { + if (dilations.size() != n_input_dims) { + return; + } + } else { + dilations.assign(n_input_dims, 1); + } + + std::vector<int64_t> strides; + if (getRepeatedAttribute(ctx, "strides", strides)) { + if (strides.size() != n_input_dims) { + return; + } + } else { + strides.assign(n_input_dims, 1); + } + + std::vector<int64_t> kernel_shape; + if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) { + if (kernel_shape.size() != n_input_dims) { + return; + } + } else { + auto second_input_shape = ctx.getInputType(1)->tensor_type().shape(); + for (int i = 2; i < second_input_shape.dim_size(); ++i) { + if (!second_input_shape.dim(i).has_dim_value()) { + return; + } + kernel_shape.push_back(second_input_shape.dim(i).dim_value()); + } + } + + std::vector<int64_t> effective_kernel_shape = kernel_shape; + for (int i = 0; i < static_cast<int>(kernel_shape.size()); i++) { + // accounting for dilation, how big is the kernel in this dimension + effective_kernel_shape[i] = (effective_kernel_shape[i] - 1) * dilations[i] + 1; + } + + std::vector<int64_t> pads; + if (getRepeatedAttribute(ctx, "pads", pads)) { + if (pads.size() != n_input_dims * 2) { + fail_shape_inference("Attribute pads has incorrect size"); + } + const auto* auto_pad_attr = ctx.getAttribute("auto_pad"); + if (nullptr != auto_pad_attr && auto_pad_attr->s() != "NOTSET") { + fail_shape_inference("The pads attribute cannot be used simultaneously with auto_pad attribute"); + } + } else { + pads.assign(n_input_dims * 2, 0); + const auto* auto_pad_attr = ctx.getAttribute("auto_pad"); + if ((nullptr != auto_pad_attr) && (auto_pad_attr->s() != "VALID")) { + int input_dims_size = static_cast<int>(n_input_dims); + for (int i = 0; i < input_dims_size; ++i) { + int64_t total_pad = effective_kernel_shape[i] - strides[i]; + if (total_pad < 0) 
+ total_pad = 0; + int64_t half_pad_small = total_pad >> 1; + int64_t half_pad_big = total_pad - half_pad_small; + if (auto_pad_attr->s() == "SAME_UPPER") { + pads[i] = half_pad_small; + pads[i + input_dims_size] = half_pad_big; + } else if (auto_pad_attr->s() == "SAME_LOWER") { + pads[i] = half_pad_big; + pads[i + input_dims_size] = half_pad_small; + } + } + } + } + + std::vector<int64_t> output_shape; + bool output_shape_presented = true; + if (getRepeatedAttribute(ctx, "output_shape", output_shape)) { + if (output_shape.size() != n_input_dims) { + return; + } + } else { + output_shape_presented = false; + } + + std::vector<int64_t> output_padding; + if (getRepeatedAttribute(ctx, "output_padding", output_padding)) { + if (output_padding.size() != n_input_dims) { // Added only to one side. + return; + } + } else { + output_padding.assign(n_input_dims, 0); + } + + auto final_output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + *final_output_shape->add_dim() = input_shape.dim(0); + *final_output_shape->add_dim() = + ctx.getInputType(1)->tensor_type().shape().dim(1) * group; // channels should be the second dim of second input + // multiply group. + + int size_of_output; + if (output_shape_presented) { + size_of_output = static_cast<int>(output_shape.size()); + for (int i = 0; i < size_of_output; ++i) { + if (input_shape.dim(i + 2).has_dim_value()) { + if (output_shape[i] < input_shape.dim(i + 2).dim_value()) { + // TODO: throw exception? 
+ return; // output shape value cannot be smaller than the input shape + // value + } + } + final_output_shape->add_dim()->set_dim_value(output_shape[i]); + } + return; + } else { + size_of_output = input_shape.dim_size() - 2; + for (int i = 0; i < size_of_output; ++i) { + if (input_shape.dim(i + 2).has_dim_value()) { + int64_t output_shape_dim = strides[i] * (input_shape.dim(i + 2).dim_value() - 1) + output_padding[i] + + effective_kernel_shape[i] - pads[i] - pads[i + n_input_dims]; + final_output_shape->add_dim()->set_dim_value(output_shape_dim); + } else { + final_output_shape->add_dim(); + } + } + return; + } +} + +static const char* DeformConv_ver19_doc = R"DOC( +Performs deformable convolution as described in https://arxiv.org/abs/1703.06211 and https://arxiv.org/abs/1811.11168. +This operator specification supports the general N-D case. Note that most common use cases have 2D or 3D data. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + DeformConv, + 19, + OpSchema() + .SetDoc(DeformConv_ver19_doc) + .Input( + 0, + "X", + "Input data tensor. For 2D image data, it has shape (N, C, H, W) where N is the batch size, " + "C is the number of input channels, and H and W are the height and width. " + "In general, the shape is (N, C, D1, D2, ... , Dn) for n-dimensional data, where " + "D1 to Dn are the spatial dimension sizes. Most common use cases have n = 2 or 3.", + "T") + .Input( + 1, + "W", + "Weight tensor that will be used in the convolutions. It has shape (oC, C/group, kH, kW), " + "where oC is the number of output channels and kH and kW are the kernel height and width. " + "For more than 2 dimensions, it has shape (oC, C/group, k1, k2, ... , kn).", + "T") + .Input( + 2, + "offset", + "Offset tensor denoting the offset for the sampling locations in the convolution kernel. " + "It has shape (N, offset_group * kH * kW * 2, oH, oW) for 2D data or " + "(N, offset_group * k1 * k2 * ... * kn * n, o1, o2, ... , on) for nD data. 
Use linear interpolation " + "for fractional offset values. Sampling locations outside of the padded input tensor gives zero.", + "T") + .Input( + 3, + "B", + "Optional 1D bias of length oC to be added to the convolution. Default is a tensor of zeros.", + "T", + OpSchema::Optional) + .Input( + 4, + "mask", + "The mask tensor to be applied to each position in the convolution kernel. " + "It has shape (N, offset_group * kH * kW, oH, oW) for 2D data or " + "(N, offset_group * k1 * k2 * ... * kn * n, o1, o2, ... , on) for nD data. Default is a " + "tensor of ones.", + "T", + OpSchema::Optional) + .Output( + 0, + "Y", + "Output data tensor that contains the result of convolution. It has shape (N, oC, oH, oW) " + "for 2D data or (N, oC, o1, o2, ..., on) for nD data", + "T") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .Attr( + "dilations", + "Dilation value along each spatial axis of the kernel. Default is 1 along each axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "group", + "Number of groups the input and output channels, C and oC, are divided into. C and oC must both " + "be divisible by group. Default is 1.", + AttributeProto::INT, + static_cast<int64_t>(1)) + .Attr( + "kernel_shape", + "Shape of the convolution kernel. If not present, it is inferred from the shape of input W.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "offset_group", + "Number of groups of offset. C must be divisible by offset_group. Default is 1.", + AttributeProto::INT, + static_cast<int64_t>(1)) + .Attr( + "pads", + "Padding for the beginning and end along each spatial axis. The values represent the number of pixels " + "added to the beginning and end of the corresponding axis and can take any nonnegative value. 
" + "The format should be as follows: [x1_begin, x2_begin, ..., x1_end, x2_end, ...], where xi_begin " + "is the number of pixels added at the beginning of axis `i` and xi_end is the number of pixels " + "added at the end of axis `i`. Default is 0 along each axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "strides", + "Stride along each spatial axis. Default is 1 along each axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + convPoolShapeInference_opset19(ctx, true, false, 0, 1); + })); + +std::function ConvTransposeOpSchemaGenerator_opset11(const char* filter_desc) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR(doc = R"DOC( +The convolution transpose operator consumes an input tensor and {filter_desc}, +and computes the output. + +If the pads parameter is provided the shape of the output is calculated via the following equation: + + output_shape[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - pads[start_i] - pads[end_i] + +output_shape can also be explicitly specified in which case pads values are auto generated using these equations: + + total_padding[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i] + If (auto_pads == SAME_UPPER): pads[start_i] = total_padding[i]/2; pads[end_i] = total_padding[i] - (total_padding[i]/2) + Else: pads[start_i] = total_padding[i] - (total_padding[i]/2); pads[end_i] = (total_padding[i]/2). + + )DOC"; + ReplaceAll(doc, "{filter_desc}", filter_desc);); + schema.SetDoc(doc); + schema.Input( + 0, + "X", + "Input data tensor from previous layer; has size (N x C x H x W)" + ", where N is the batch size, C is the number of channels, and" + " H and W are the height and width. Note that this is for the 2D image. " + "Otherwise the size is (N x C x D1 x D2 ... 
x Dn)", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Input( + 1, + "W", + "The weight tensor that will be used in the " + "convolutions; has size (C x M/group x kH x kW), where C " + "is the number of channels, and kH and kW are the " + "height and width of the kernel, and M is the number " + "of feature maps. For more than 2 dimensions, the " + "weight shape will be (C x M/group x k1 x k2 x ... x kn), " + "where (k1 x k2 x ... x kn) is the dimension of the kernel. " + "The number of channels in the output should be equal to W.shape[1] * group " + "(assuming zero based indices of the shape array)", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Input( + 2, + "B", + "Optional 1D bias to be added to the convolution, has size of M.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 0, + "Y", + "Output data tensor that contains the result of the convolution. The " + "output dimensions are functions of the kernel size, stride size, " + "pad lengths and group count. " + "The number of channels in the output should be equal to W.shape[1] * group " + "(assuming zero based indices of the shape array)", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors."); + schema.Attr( + "kernel_shape", + "The shape of the convolution kernel. If not present, should be inferred from input W.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "output_shape", + "The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified " + "pads values are ignored. See doc for details for equations to generate pads. 
Note that the output_shape attribute value " + "should not include dimensions for batch size and channels, which are automatically inferred.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "output_padding", + "Additional elements added to the side with higher coordinate indices in the output. " + "Each padding value in \"output_padding\" must be less than the corresponding stride/dilation dimension. " + "By default, this attribute is a zero vector. " + "Note that this attribute doesn't directly affect the computed output values. " + "It only controls the selection of the computed values, " + "so changing this attribute only adds or removes output elements. " + "If \"output_shape\" is explicitly provided, " + "\"output_padding\" does not contribute additional size to \"output_shape\" but " + "participates in the computation of the needed padding amount. " + "This is also called adjs or adjustment in some frameworks.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "dilations", + "dilation value along each spatial axis of the filter. If not present, the dilation defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "strides", + "Stride along each spatial axis. 
If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr("auto_pad", conv_transpose_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET")); + schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE); + schema.Attr( + "group", + "number of groups input channels and output channels are divided into.", + AttributeProto::INT, + static_cast(1)); + schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { convTransposeShapeInference_opset11(ctx); }); + }; +} + +ONNX_OPERATOR_SET_SCHEMA(ConvTranspose, 11, OpSchema().FillUsing(ConvTransposeOpSchemaGenerator_opset11("a filter"))); + +std::function ConvOpSchemaGenerator_opset11(const char* filter_desc) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR(doc = R"DOC( +The convolution operator consumes an input tensor and {filter_desc}, and +computes the output.)DOC"; + ReplaceAll(doc, "{filter_desc}", filter_desc);); + schema.SetDoc(doc); + schema.Input( + 0, + "X", + "Input data tensor from previous layer; " + "has size (N x C x H x W), where N is the batch size, " + "C is the number of channels, and H and W are the " + "height and width. Note that this is for the 2D image. " + "Otherwise the size is (N x C x D1 x D2 ... x Dn). " + "Optionally, if dimension denotation is " + "in effect, the operation expects input data tensor " + "to arrive with the dimension denotation of [DATA_BATCH, " + "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Input( + 1, + "W", + "The weight tensor that will be used in the " + "convolutions; has size (M x C/group x kH x kW), where C " + "is the number of channels, and kH and kW are the " + "height and width of the kernel, and M is the number " + "of feature maps. For more than 2 dimensions, the " + "kernel shape will be (M x C/group x k1 x k2 x ... x kn), " + "where (k1 x k2 x ... 
kn) is the dimension of the kernel. " + "Optionally, if dimension denotation is in effect, " + "the operation expects the weight tensor to arrive " + "with the dimension denotation of [FILTER_OUT_CHANNEL, " + "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. " + "Assuming zero based indices for the shape array, " + "X.shape[1] == (W.shape[1] * group) == C and " + "W.shape[0] mod G == 0. Or in other words " + "FILTER_IN_CHANNEL multiplied by the number of groups " + "should be equal to DATA_CHANNEL and the number of " + "feature maps M should be a multiple of the number of " + "groups G.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Input( + 2, + "B", + "Optional 1D bias to be added to the convolution, has size of M.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 0, + "Y", + "Output data tensor that contains the result of the " + "convolution. The output dimensions are functions " + "of the kernel size, stride size, and pad lengths.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors."); + schema.Attr( + "kernel_shape", + "The shape of the convolution kernel. If not present, should be inferred from input W.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "dilations", + "dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "strides", + "Stride along each spatial axis. 
If not present, the stride defaults is 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr("auto_pad", conv_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET")); + schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE); + schema.Attr( + "group", + "number of groups input channels and output channels are divided into.", + AttributeProto::INT, + static_cast(1)); + schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + convPoolShapeInference_opset19(ctx, true, false, 0, 1); + }); + }; +} + +ONNX_OPERATOR_SET_SCHEMA(Conv, 11, OpSchema().FillUsing(ConvOpSchemaGenerator_opset11("a filter"))); + +void roiPoolTypeShapeInference_opset1(InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // rois is the second input. + if (!hasNInputShapes(ctx, 2)) { + return; + } + + auto input_shape = ctx.getInputType(0)->tensor_type().shape(); + auto rios_shape = ctx.getInputType(1)->tensor_type().shape(); + + if (input_shape.dim_size() < 2) { + fail_shape_inference("Input tensor must have at least 2 dimensions"); + } + if (rios_shape.dim_size() != 2) { + fail_shape_inference("RoIs tensor must have 2 dimensions"); + } + + // first dim is the batch axis and the next is the number of channels. 
+ size_t n_input_dims = static_cast(input_shape.dim_size() - 2); + + std::vector pooled_shape; + if (getRepeatedAttribute(ctx, "pooled_shape", pooled_shape)) { + if (pooled_shape.size() != n_input_dims) { + fail_shape_inference("Attribute pooled_shape has incorrect length"); + } + } else { + fail_shape_inference("Attribute pooled_shape must be specified"); + } + + // (num_rois, channels, pooled_shape[0], pooled_shape[1]) + auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + *output_shape->add_dim() = rios_shape.dim(0); + *output_shape->add_dim() = input_shape.dim(1); + output_shape->add_dim()->set_dim_value(pooled_shape[0]); + output_shape->add_dim()->set_dim_value(pooled_shape[1]); +} + +std::function RoiPoolOpSchemaGenerator_opset1(const char* name) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR(doc = R"DOC( + ROI {name} pool consumes an input tensor X and region of interests (RoIs) to + apply {name} pooling across each RoI, to produce output 4-D tensor of shape + (num_rois, channels, pooled_shape[0], pooled_shape[1]).)DOC"; + ReplaceAll(doc, "{name}", name);); + schema.SetDoc(doc); + schema.Attr("pooled_shape", "ROI pool output shape (height, width).", AttributeProto::INTS); + schema.Attr( + "spatial_scale", + "Multiplicative spatial scale factor to translate ROI coordinates from their input scale to the scale used when pooling.", + AttributeProto::FLOAT, + 1.f); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the " + "width of the data.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Input( + 1, + "rois", + "RoIs (Regions of Interest) to pool over. 
Should " + "be a 2-D tensor of shape (num_rois, 5) given as " + "[[batch_id, x1, y1, x2, y2], ...].", + "T", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable); + schema.Output( + 0, + "Y", + "RoI pooled output 4-D tensor of shape (num_rois, channels, pooled_shape[0], pooled_shape[1]).", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors."); + schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { roiPoolTypeShapeInference_opset1(ctx); }); + }; +} + +ONNX_OPERATOR_SET_SCHEMA(MaxRoiPool, 1, OpSchema().FillUsing(RoiPoolOpSchemaGenerator_opset1("max"))); + +std::function LpPoolOpSchemaGenerator_opset18(const char* name) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR(doc = R"DOC( + {name} consumes an input tensor X and applies Lp pooling across + the tensor according to kernel sizes, stride sizes, and pad lengths. + Lp pooling consisting of computing the Lp norm on all values of a subset + of the input tensor according to the kernel size and downsampling the + data into the output tensor Y for further processing. The output spatial shape will be following: + ``` + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1) + ``` + or + ``` + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1) + ``` + if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. + + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following: + ``` + VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: + ``` + pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i] + ```)DOC"; + ReplaceAll(doc, "{name}", name);); + schema.SetDoc(doc); + schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS); + schema.Attr( + "strides", + "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr( + "dilations", + "dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr("auto_pad", conv_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET")); + schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE); + schema.Attr( + "p", "p value of the Lp norm used to pool over the input data.", AttributeProto::INT, static_cast(2)); + schema.Attr( + "ceil_mode", + "Whether to use ceil or floor (default) to compute the output shape.", + AttributeProto::INT, + static_cast(0)); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the " + "width of the data. For non image case, the " + "dimensions are in the form of " + "(N x C x D1 x D2 ... 
Dn), where N is the " + "batch size.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 0, + "Y", + "Output data tensor from Lp pooling across the input " + "tensor. Dimensions will vary based on various kernel, stride, and pad " + "sizes.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors."); + schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + convPoolShapeInference_opset19(ctx, true, true, 0, 1); + }); + }; +} + +ONNX_OPERATOR_SET_SCHEMA(LpPool, 18, OpSchema().FillUsing(LpPoolOpSchemaGenerator_opset18("LpPool"))); + +static const char* MaxUnpool_ver11_doc = R"DOC( +MaxUnpool essentially computes the partial inverse of the MaxPool op. + The input information to this op is typically the output information from a MaxPool op. The first + input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output) + from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding + to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op. + The third (optional) input is a tensor that specifies the output size of the unpooling operation. + +MaxUnpool is intended to do 'partial' inverse of the MaxPool op. 'Partial' because all the non-maximal + values from the original input to MaxPool are set to zero in the output of the MaxUnpool op. Pooling + the result of an unpooling operation should give back the original input to the unpooling op. + +MaxUnpool can produce the same output size for several input sizes, which makes unpooling op ambiguous. + The third input argument, output_size, is meant to disambiguate the op and produce output tensor of + known/predictable size. 
+ +In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads, + which define the exact unpooling op. The attributes typically have the same values as the corresponding + pooling op that the unpooling op is trying to invert. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + MaxUnpool, + 11, + OpSchema() + .SetDoc(MaxUnpool_ver11_doc) + .Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS) + .Attr( + "strides", + "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE) + .Input( + 0, + "X", + "Input data tensor that has to be unpooled. " + "This tensor is typically the first output of the MaxPool op." + "Dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the " + "width of the data. For non-image case, the " + "dimensions are in the form of " + "(N x C x D1 x D2 ... Dn), where N is the batch " + "size. Optionally, if dimension denotation is " + "in effect, the operation expects the input " + "data tensor to arrive with the dimension denotation " + "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "I", + "Input data tensor containing the indices corresponding to " + "elements in the first input tensor X." + "This tensor is typically the second output of the MaxPool op." + "Dimensions must be the same as input tensor X. " + "The indices are linear, i.e. computed considering the tensor as flattened 1-D tensor, " + "assuming row-major storage. Also, the linear indices should not consider padding. " + "So the values in indices are in the range [0, N x C x D1 x ... 
x Dn).", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Input( + 2, + "output_shape", + "The shape of the output can be explicitly set which will cause pads values to be auto generated. If 'output_shape' is specified, " + "'pads' values are ignored.", + "T2", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "output", + "Output data tensor that contains the result of the unpooling.", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T1", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeConstraint("T2", {"tensor(int64)"}, "Constrain index tensor to int64") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { maxUnpoolShapeInference_opset11(ctx); })); + +std::vector GetSupportedDataTypesForPoolingOps_opset19(bool supports8bit) { + if (supports8bit) { + return {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", "tensor(uint8)"}; + } + return {"tensor(float16)", "tensor(float)", "tensor(double)"}; +} + +std::function PoolOpSchemaGenerator_opset19( + const char* name, + const char* opName, + const char* additionalDescription, + bool use_dilation, + bool supports8bit = false) { + return [=](OpSchema& schema) { + std::string doc; + POPULATE_OP_DOC_STR( + doc = R"DOC( + {name} consumes an input tensor X and applies {opName} pooling across + the tensor according to kernel sizes, stride sizes, and pad lengths. + {opName} pooling consisting of computing the {opName} on all values of a + subset of the input tensor according to the kernel size and downsampling the + data into the output tensor Y for further processing. The output spatial shape is calculated differently + depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. 
+ With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): + ``` + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + ``` + or + ``` + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) + ``` + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. + + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: + ``` + VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): + ``` + VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i]) + 1 + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: + ``` + pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i] + ``` + {additionalDescription} + )DOC"; + ReplaceAll(doc, "{name}", name); + ReplaceAll(doc, "{opName}", opName); + ReplaceAll(doc, "{additionalDescription}", additionalDescription); + ReplaceAll( + doc, + "{kernelSpatialShape}", + use_dilation ? "((kernel_spatial_shape[i] - 1) * dilations[i] + 1)" : "kernel_spatial_shape[i]");); + schema.SetDoc(doc); + schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS); + schema.Attr( + "strides", + "Stride along each spatial axis. 
If not present, the stride defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE); + schema.Attr("auto_pad", conv_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET")); + schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE); + schema.Attr( + "ceil_mode", + "Whether to use ceil or floor (default) to compute the output shape.", + AttributeProto::INT, + static_cast(0)); + schema.Input( + 0, + "X", + "Input data tensor from the previous operator; " + "dimensions for image case are (N x C x H x W), " + "where N is the batch size, C is the number of " + "channels, and H and W are the height and the " + "width of the data. For non image case, the " + "dimensions are in the form of " + "(N x C x D1 x D2 ... Dn), where N is the batch " + "size. Optionally, if dimension denotation is " + "in effect, the operation expects the input " + "data tensor to arrive with the dimension denotation " + "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 0, + "Y", + "Output data tensor from average or max pooling across " + "the input tensor. Dimensions will vary based " + "on various kernel, stride, and pad sizes. Floor value of " + "the dimension is used", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + GetSupportedDataTypesForPoolingOps_opset19(supports8bit), + supports8bit ? "Constrain input and output types to float and 8 bit tensors." + : "Constrain input and output types to float tensors."); + schema.TypeAndShapeInferenceFunction([use_dilation](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (ctx.getNumOutputs() > 1) { + // MaxPool with two outputs case. 
+ auto output_type = ctx.getOutputType(1); + if (output_type->value_case() == TypeProto::kTensorType || + output_type->value_case() == TypeProto::VALUE_NOT_SET) { + output_type->mutable_tensor_type()->set_elem_type(TensorProto::INT64); + } + } + convPoolShapeInference_opset19(ctx, use_dilation, true, 0, 1); + }); + }; +} + +ONNX_OPERATOR_SET_SCHEMA( + AveragePool, + 19, + OpSchema() + .FillUsing(PoolOpSchemaGenerator_opset19( + "AveragePool", + "average", + "The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).", + true, /* use_dilation: dilations attribute has been added in opset 19. */ + false /* supports8bit: does not support 8bit. */)) + .Attr( + "dilations", + "Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Attr( + "count_include_pad", + "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.", + AttributeProto::INT, + static_cast(0))); + +ONNX_OPERATOR_SET_SCHEMA( + MaxPool, + 12, + OpSchema() + .FillUsing(PoolOpSchemaGenerator_opset19( + "MaxPool", + "max", + "The output of each pooling window is maximum number of elements exclude pad. ", + true, + true)) + .Attr( + "storage_order", + "The storage order of the tensor. 0 is row major, and 1 is column major. " + "This attribute is used only to convert an n-tuple index value into " + "a single integer value for producing the second output. ", + AttributeProto::INT, + static_cast(0)) + .Attr( + "dilations", + "Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Output( + 1, + "Indices", + "Indices tensor from max pooling across the input tensor. " + "The dimensions of indices are the same as output tensor. 
" + "The values in indices of are the indices of the selected values during pooling. " + "The indices are computed as flatten 1-D tensor, " + "and the indices do not consider padding. " + "So the values in indices are in [0, N x C x D1 x ... x Dn).", + "I", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .TypeConstraint("I", {"tensor(int64)"}, "Constrain index tensor to int64")); + static const char* Dropout_ver12_doc = R"DOC( Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs, output (floating-point tensor) and mask (optional `Tensor`). If `training_mode` is true then the output Y will be a random dropout; @@ -956,8 +2487,8 @@ void maxUnpoolShapeInference1(InferenceContext& ctx) { if (output_shape.dim_size() != 1) { fail_type_inference("'output_shape' must be rank 1 tensor."); } - if (output_shape.dim((int)0).has_dim_value() && - static_cast(output_shape.dim((int)0).dim_value()) != input_shape.dim_size()) { + if (output_shape.dim(static_cast(0)).has_dim_value() && + static_cast(output_shape.dim(static_cast(0)).dim_value()) != input_shape.dim_size()) { fail_shape_inference("'output_shape' must have same number of elements as the shape of input tensor X."); } } diff --git a/onnx/defs/object_detection/defs.cc b/onnx/defs/object_detection/defs.cc index 82bdf15440b..6a0ea98b832 100644 --- a/onnx/defs/object_detection/defs.cc +++ b/onnx/defs/object_detection/defs.cc @@ -7,7 +7,7 @@ using namespace ONNX_NAMESPACE; namespace ONNX_NAMESPACE { -static const char* RoiAlign_ver16_doc = R"DOC( +static const char* RoiAlign_ver22_doc = R"DOC( Region of Interest (RoI) align operation described in the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870). RoiAlign consumes an input tensor X and region of interests (rois) @@ -23,9 +23,9 @@ through bilinear interpolation. 
ONNX_OPERATOR_SET_SCHEMA( RoiAlign, - 16, + 22, OpSchema() - .SetDoc(RoiAlign_ver16_doc) + .SetDoc(RoiAlign_ver22_doc) .Attr( "spatial_scale", "Multiplicative spatial scale factor to translate ROI coordinates " @@ -89,10 +89,7 @@ ONNX_OPERATOR_SET_SCHEMA( "(num_rois, C, output_height, output_width). The r-th batch element Y[r-1] " "is a pooled feature map corresponding to the r-th RoI X[r-1].", "T1") - .TypeConstraint( - "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain types to float tensors.") + .TypeConstraint("T1", OpSchema::all_float_types_ir4(), "Constrain types to float tensors.") .TypeConstraint("T2", {"tensor(int64)"}, "Constrain types to int tensors.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); diff --git a/onnx/defs/object_detection/old.cc b/onnx/defs/object_detection/old.cc index 28ae0c5009f..65659bb35db 100644 --- a/onnx/defs/object_detection/old.cc +++ b/onnx/defs/object_detection/old.cc @@ -7,6 +7,123 @@ using namespace ONNX_NAMESPACE; namespace ONNX_NAMESPACE { +static const char* RoiAlign_ver16_doc = R"DOC( +Region of Interest (RoI) align operation described in the +[Mask R-CNN paper](https://arxiv.org/abs/1703.06870). +RoiAlign consumes an input tensor X and region of interests (rois) +to apply pooling across each RoI; it produces a 4-D tensor of shape +(num_rois, C, output_height, output_width). + +RoiAlign is proposed to avoid the misalignment by removing +quantizations while converting from original image into feature +map and from feature map into RoI feature; in each ROI bin, +the value of the sampled locations are computed directly +through bilinear interpolation. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + RoiAlign, + 16, + OpSchema() + .SetDoc(RoiAlign_ver16_doc) + .Attr( + "spatial_scale", + "Multiplicative spatial scale factor to translate ROI coordinates " + "from their input spatial scale to the scale used when pooling, " + "i.e., spatial scale of the input feature map X relative to the " + "input image. E.g.; default is 1.0f. ", + AttributeProto::FLOAT, + 1.f) + .Attr("output_height", "default 1; Pooled output Y's height.", AttributeProto::INT, static_cast(1)) + .Attr("output_width", "default 1; Pooled output Y's width.", AttributeProto::INT, static_cast(1)) + .Attr( + "sampling_ratio", + "Number of sampling points in the interpolation grid used to compute " + "the output value of each pooled output bin. If > 0, then exactly " + "sampling_ratio x sampling_ratio grid points are used. If == 0, then " + "an adaptive number of grid points are used (computed as " + "ceil(roi_width / output_width), and likewise for height). Default is 0.", + AttributeProto::INT, + static_cast(0)) + .Attr( + "mode", + "The pooling method. Two modes are supported: 'avg' and 'max'. " + "Default is 'avg'.", + AttributeProto::STRING, + std::string("avg")) + .Attr( + "coordinate_transformation_mode", + "Allowed values are 'half_pixel' and 'output_half_pixel'. " + "Use the value 'half_pixel' to pixel shift the input coordinates by -0.5 (the recommended behavior). " + "Use the value 'output_half_pixel' to omit the pixel shift for the input (use this for a " + "backward-compatible behavior).", + AttributeProto::STRING, + std::string("half_pixel")) + .Input( + 0, + "X", + "Input data tensor from the previous operator; " + "4-D feature map of shape (N, C, H, W), " + "where N is the batch size, C is the number of channels, " + "and H and W are the height and the width of the data.", + "T1") + .Input( + 1, + "rois", + "RoIs (Regions of Interest) to pool over; rois is " + "2-D input of shape (num_rois, 4) given as " + "[[x1, y1, x2, y2], ...]. 
" + "The RoIs' coordinates are in the coordinate system of the input image. " + "Each coordinate set has a 1:1 correspondence with the 'batch_indices' input.", + "T1") + .Input( + 2, + "batch_indices", + "1-D tensor of shape (num_rois,) with each element denoting " + "the index of the corresponding image in the batch.", + "T2") + .Output( + 0, + "Y", + "RoI pooled output, 4-D tensor of shape " + "(num_rois, C, output_height, output_width). The r-th batch element Y[r-1] " + "is a pooled feature map corresponding to the r-th RoI X[r-1].", + "T1") + .TypeConstraint( + "T1", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain types to float tensors.") + .TypeConstraint("T2", {"tensor(int64)"}, "Constrain types to int tensors.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + size_t input_param = 0, rois_param = 1, batch_index_param = 2; + + checkInputRank(ctx, input_param, 4); + checkInputRank(ctx, rois_param, 2); + checkInputRank(ctx, batch_index_param, 1); + + // Output dimensions, initialized to an unknown-dimension-value + Dim num_rois, C, ht, width; + + // Get value of C from dim 1 of input_param, if available + unifyInputDim(ctx, input_param, 1, C); + + // Get value of num_rois from dim 0 of rois_param, if available + unifyInputDim(ctx, rois_param, 0, num_rois); + // ... or from dim 0 of batch_index_param, if available + unifyInputDim(ctx, batch_index_param, 0, num_rois); + + // Get height from attribute, using default-value of 1 + unifyDim(ht, getAttribute(ctx, "output_height", 1)); + + // Get width from attribute, using default-value of 1 + unifyDim(width, getAttribute(ctx, "output_width", 1)); + + // set output shape: + updateOutputShape(ctx, 0, {num_rois, C, ht, width}); + })); + static const char* RoiAlign_ver10_doc = R"DOC( Region of Interest (RoI) align operation described in the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870). 
diff --git a/onnx/defs/operator_sets.h b/onnx/defs/operator_sets.h index 552ddce622d..ad2791524e7 100644 --- a/onnx/defs/operator_sets.h +++ b/onnx/defs/operator_sets.h @@ -1186,13 +1186,107 @@ class OpSet_Onnx_ver21 { }; // Forward declarations for ai.onnx version 22 +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, EyeLike); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, RandomUniform); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, RandomNormal); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, RandomUniformLike); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, RandomNormalLike); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Multinomial); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Bernoulli); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, ThresholdedRelu); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Selu); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Elu); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Mish); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, HardSigmoid); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, HardSwish); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Softsign); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Softplus); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Sin); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Cos); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Tan); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Asin); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Acos); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Atan); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Sinh); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Cosh); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Asinh); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Acosh); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Atanh); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Round); +class 
ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Det); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, NegativeLogLikelihoodLoss); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, AveragePool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, MaxPool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, MaxUnpool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, LpPool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, MaxRoiPool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Conv); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, ConvTranspose); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, DeformConv); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, GlobalAveragePool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, GlobalMaxPool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, GlobalLpPool); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, InstanceNormalization); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, LpNormalization); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, Dropout); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, RoiAlign); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, RNN); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, GRU); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, LSTM); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 22, GridSample); // Iterate over schema from ai.onnx version 22 class OpSet_Onnx_ver22 { public: static void ForEachSchema(std::function fn) { - // TODO: Remove after introducing the first schema to opset 22 - (void)fn; + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + 
fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); + fn(GetOpSchema()); } }; @@ -1227,7 +1321,7 @@ inline void RegisterOnnxOperatorSetSchema(int target_version, bool fail_duplicat // Update here if opset_version bumps // These calls for schema registration here are required to be in descending order for this to work correctly // - // Version-sepcific registration sees duplicate schema version request as error if fail_duplicate_schema + // Version-specific registration sees duplicate schema version request as error if fail_duplicate_schema RegisterOpSetSchema(target_version, fail_duplicate_schema); RegisterOpSetSchema(target_version, fail_duplicate_schema); RegisterOpSetSchema(target_version, fail_duplicate_schema); diff --git a/onnx/defs/rnn/defs.cc b/onnx/defs/rnn/defs.cc index b5844444ca8..a28f2354b02 100644 --- a/onnx/defs/rnn/defs.cc +++ b/onnx/defs/rnn/defs.cc @@ -167,16 +167,13 @@ std::function RNNDocGenerator(const char* /*name*/) { true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain input and output types to float tensors."); + schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors."); schema.TypeConstraint("T1", {"tensor(int32)"}, "Constrain seq_lens to integer tensor."); schema.TypeAndShapeInferenceFunction(RNNShapeInference); }; } -static const char* RNN_ver14_doc = R"DOC( +static const 
char* RNN_ver22_doc = R"DOC( Computes an one-layer simple RNN. This operator is usually supported via some custom implementation such as CuDNN. @@ -220,9 +217,9 @@ Equations (Default: f=Tanh): ONNX_OPERATOR_SET_SCHEMA( RNN, - 14, + 22, OpSchema() - .SetDoc(GET_OP_DOC_STR(std::string(RNN_ver14_doc) + GenerateOptionalArgumentsDoc())) + .SetDoc(GET_OP_DOC_STR(std::string(RNN_ver22_doc) + GenerateOptionalArgumentsDoc())) .Attr( "activations", "One (or two if bidirectional) activation function for " @@ -266,7 +263,7 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::Differentiable) .FillUsing(RNNDocGenerator("RNN"))); -static const char* GRU_ver14_doc = R"DOC( +static const char* GRU_ver22_doc = R"DOC( Computes an one-layer GRU. This operator is usually supported via some custom implementation such as CuDNN. @@ -317,9 +314,9 @@ Equations (Default: f=Sigmoid, g=Tanh): ONNX_OPERATOR_SET_SCHEMA( GRU, - 14, + 22, OpSchema() - .SetDoc(GET_OP_DOC_STR(std::string(GRU_ver14_doc) + GenerateOptionalArgumentsDoc())) + .SetDoc(GET_OP_DOC_STR(std::string(GRU_ver22_doc) + GenerateOptionalArgumentsDoc())) .Attr( "activations", "A list of 2 (or 4 if bidirectional) activation functions " @@ -371,7 +368,7 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::Differentiable) .FillUsing(RNNDocGenerator("GRU"))); -static const char* LSTM_ver14_doc = R"DOC( +static const char* LSTM_ver22_doc = R"DOC( Computes an one-layer LSTM. This operator is usually supported via some custom implementation such as CuDNN. 
@@ -425,9 +422,9 @@ Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): ONNX_OPERATOR_SET_SCHEMA( LSTM, - 14, + 22, OpSchema() - .SetDoc(GET_OP_DOC_STR(std::string(LSTM_ver14_doc) + GenerateOptionalArgumentsDoc())) + .SetDoc(GET_OP_DOC_STR(std::string(LSTM_ver22_doc) + GenerateOptionalArgumentsDoc())) .Attr( "activations", "A list of 3 (or 6 if bidirectional) activation functions " diff --git a/onnx/defs/rnn/old.cc b/onnx/defs/rnn/old.cc index 51fe5960f8b..36371ac67b5 100644 --- a/onnx/defs/rnn/old.cc +++ b/onnx/defs/rnn/old.cc @@ -5,6 +5,521 @@ #include "onnx/defs/schema.h" namespace ONNX_NAMESPACE { + +void RNNShapeInference_opset14(InferenceContext& ctx) { + TensorShapeProto::Dimension num_directions, seq_length, batch_size, hidden_size; + + auto direction = getAttribute(ctx, "direction", "forward"); + if ((direction == "forward") || (direction == "reverse")) + num_directions.set_dim_value(1); + else if (direction == "bidirectional") + num_directions.set_dim_value(2); + // else leave num_directions unknown in case of incorrect attribute value + + auto hidden_size_value = getAttribute(ctx, "hidden_size", -1); + if (hidden_size_value > 0) + hidden_size.set_dim_value(hidden_size_value); + + auto layout_value = getAttribute(ctx, "layout", 0); + + if (hasInputShape(ctx, 0)) { + auto& first_input_shape = getInputShape(ctx, 0); + if (first_input_shape.dim_size() != 3) { + fail_shape_inference("First input tensor must have rank 3"); + } + seq_length = first_input_shape.dim((layout_value == 0) ? 0 : 1); + batch_size = first_input_shape.dim((layout_value == 0) ? 
1 : 0); + } + + auto num_outputs = ctx.getNumOutputs(); + + if (num_outputs > 0) { + // Y + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + if (layout_value == 0) { + auto dims = {seq_length, num_directions, batch_size, hidden_size}; + updateOutputShape(ctx, 0, dims); + } else { + auto dims = {batch_size, seq_length, num_directions, hidden_size}; + updateOutputShape(ctx, 0, dims); + } + } + + if (num_outputs > 1) { + // Y_h + propagateElemTypeFromInputToOutput(ctx, 0, 1); + + if (layout_value == 0) { + auto dims = {num_directions, batch_size, hidden_size}; + updateOutputShape(ctx, 1, dims); + } else { + auto dims = {batch_size, num_directions, hidden_size}; + updateOutputShape(ctx, 1, dims); + } + } + + if (num_outputs > 2) { + // Y_c : only in the case of LSTM + propagateElemTypeFromInputToOutput(ctx, 0, 2); + + if (layout_value == 0) { + auto dims = {num_directions, batch_size, hidden_size}; + updateOutputShape(ctx, 2, dims); + } else { + auto dims = {batch_size, num_directions, hidden_size}; + updateOutputShape(ctx, 2, dims); + } + } +} +std::function RNNDocGenerator_opset14(const char* /*name*/) { + return [=](OpSchema& schema) { + schema.Attr( + "direction", + "Specify if the RNN is forward, reverse, or bidirectional. " + "Must be one of forward (default), reverse, or bidirectional.", + AttributeProto::STRING, + std::string("forward")); + schema.Attr( + "layout", + "The shape format of inputs X, initial_h and outputs Y, Y_h. " + "If 0, the following shapes are expected: " + "X.shape = [seq_length, batch_size, input_size], " + "Y.shape = [seq_length, num_directions, batch_size, hidden_size], " + "initial_h.shape = Y_h.shape = [num_directions, batch_size, hidden_size]. 
" + "If 1, the following shapes are expected: " + "X.shape = [batch_size, seq_length, input_size], " + "Y.shape = [batch_size, seq_length, num_directions, hidden_size], " + "initial_h.shape = Y_h.shape = [batch_size, num_directions, hidden_size].", + AttributeProto::INT, + static_cast(0)); + schema.Attr("hidden_size", "Number of neurons in the hidden layer", AttributeProto::INT, OPTIONAL_VALUE); + schema.Attr( + "activation_alpha", + "Optional scaling values used by some activation functions. The values " + "are consumed in the order of activation functions, for example (f, g, h) " + "in LSTM. Default values are the same as of corresponding ONNX operators." + "For example with LeakyRelu, the default alpha is 0.01.", + AttributeProto::FLOATS, + OPTIONAL_VALUE); + schema.Attr( + "activation_beta", + "Optional scaling values used by some activation functions. The values " + "are consumed in the order of activation functions, for example (f, g, h) " + "in LSTM. Default values are the same as of corresponding ONNX operators.", + AttributeProto::FLOATS, + OPTIONAL_VALUE); + schema.Attr( + "clip", + "Cell clip threshold. Clipping bounds the elements of a tensor " + "in the range of [-threshold, +threshold] and is applied to the input " + "of activations. No clip if not specified.", + AttributeProto::FLOAT, + OPTIONAL_VALUE); + schema.Input( + 0, + "X", + "The input sequences packed (and potentially padded) into one 3-D " + "tensor with the shape of `[seq_length, batch_size, input_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable); + schema.Input( + 4, + "sequence_lens", + "Optional tensor specifying lengths of the sequences in a batch. " + "If not specified - assumed all sequences in the batch to have " + "length `seq_length`. It has shape `[batch_size]`.", + "T1", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable); + schema.Input( + 5, + "initial_h", + "Optional initial value of the hidden. 
If not specified - assumed " + "to be 0. It has shape `[num_directions, batch_size, hidden_size]`.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable); + schema.Output( + 0, + "Y", + "A tensor that concats all the intermediate output values of the hidden. " + "It has shape `[seq_length, num_directions, batch_size, hidden_size]`. ", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable); + schema.Output( + 1, + "Y_h", + "The last output value of the hidden. It has shape " + "`[num_directions, batch_size, hidden_size]`.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable); + schema.TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors."); + schema.TypeConstraint("T1", {"tensor(int32)"}, "Constrain seq_lens to integer tensor."); + schema.TypeAndShapeInferenceFunction(RNNShapeInference_opset14); + }; +} + +static const char* GRU_ver14_doc = R"DOC( +Computes an one-layer GRU. This operator is usually supported via some custom +implementation such as CuDNN. 
+ +Notations: + +* `X` - input tensor +* `z` - update gate +* `r` - reset gate +* `h` - hidden gate +* `t` - time step (t-1 means previous time step) +* `W[zrh]` - W parameter weight matrix for update, reset, and hidden gates +* `R[zrh]` - R recurrence weight matrix for update, reset, and hidden gates +* `Wb[zrh]` - W bias vectors for update, reset, and hidden gates +* `Rb[zrh]` - R bias vectors for update, reset, and hidden gates +* `WB[zrh]` - W parameter weight matrix for backward update, reset, and hidden gates +* `RB[zrh]` - R recurrence weight matrix for backward update, reset, and hidden gates +* `WBb[zrh]` - W bias vectors for backward update, reset, and hidden gates +* `RBb[zrh]` - R bias vectors for backward update, reset, and hidden gates +* `H` - Hidden state +* `num_directions` - 2 if direction == bidirectional else 1 + +Activation functions: + +* Relu(x) - max(0, x) +* Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) +* Sigmoid(x) - 1/(1 + e^{-x}) + +NOTE: + Below are optional + +* Affine(x) - alpha * x + beta +* LeakyRelu(x) - x if x >= 0 else alpha * x +* ThresholdedRelu(x) - x if x >= alpha else 0 +* ScaledTanh(x) - alpha * Tanh(beta * x) +* HardSigmoid(x) - min(max(alpha * x + beta, 0), 1) +* Elu(x) - x if x >= 0 else alpha * (e^x - 1) +* Softsign(x) - x/(1 + |x|) +* Softplus(x) - log(1 + e^x) + +Equations (Default: f=Sigmoid, g=Tanh): + +* zt = f(Xt*(Wz^T) + Ht-1*(Rz^T) + Wbz + Rbz) +* rt = f(Xt*(Wr^T) + Ht-1*(Rr^T) + Wbr + Rbr) +* ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh) # default, when linear_before_reset = 0 +* ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh) # when linear_before_reset != 0 +* Ht = (1 - zt) (.) ht + zt (.) Ht-1 +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + GRU, + 14, + OpSchema() + .SetDoc(GET_OP_DOC_STR(std::string(GRU_ver14_doc) + GenerateOptionalArgumentsDoc())) + .Attr( + "activations", + "A list of 2 (or 4 if bidirectional) activation functions " + "for update, reset, and hidden gates. 
The activation functions must be one " + "of the activation functions specified above. Optional: See the equations " + "for default if not specified.", + AttributeProto::STRINGS, + OPTIONAL_VALUE) + .Attr( + "linear_before_reset", + "When computing the output of the hidden gate, " + "apply the linear transformation before multiplying by the output of the " + "reset gate.", + AttributeProto::INT, + static_cast(0)) + .Input( + 1, + "W", + "The weight tensor for the gates. Concatenation of `W[zrh]` and `WB[zrh]` " + "(if bidirectional) along dimension 0. This tensor has shape " + "`[num_directions, 3*hidden_size, input_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 2, + "R", + "The recurrence weight tensor. Concatenation of `R[zrh]` and `RB[zrh]` " + "(if bidirectional) along dimension 0. This tensor has shape " + "`[num_directions, 3*hidden_size, hidden_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 3, + "B", + "The bias tensor for the gates. Concatenation of `[Wb[zrh], Rb[zrh]]` and " + "`[WBb[zrh], RBb[zrh]]` (if bidirectional) along dimension 0. This tensor " + "has shape `[num_directions, 6*hidden_size]`. Optional: If not specified " + "- assumed to be 0", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable) + .FillUsing(RNNDocGenerator_opset14("GRU"))); + +static const char* LSTM_ver14_doc = R"DOC( +Computes an one-layer LSTM. This operator is usually supported via some +custom implementation such as CuDNN. 
+ +Notations: + +* `X` - input tensor +* `i` - input gate +* `o` - output gate +* `f` - forget gate +* `c` - cell gate +* `t` - time step (t-1 means previous time step) +* `W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates +* `R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates +* `Wb[iofc]` - W bias vectors for input, output, forget, and cell gates +* `Rb[iofc]` - R bias vectors for input, output, forget, and cell gates +* `P[iof]` - P peephole weight vector for input, output, and forget gates +* `WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates +* `RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates +* `WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates +* `RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates +* `PB[iof]` - P peephole weight vector for backward input, output, and forget gates +* `H` - Hidden state +* `num_directions` - 2 if direction == bidirectional else 1 + +Activation functions: + +* Relu(x) - max(0, x) +* Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) +* Sigmoid(x) - 1/(1 + e^{-x}) + +NOTE: Below are optional + +* Affine(x) - alpha*x + beta +* LeakyRelu(x) - x if x >= 0 else alpha * x +* ThresholdedRelu(x) - x if x >= alpha else 0 +* ScaledTanh(x) - alpha*Tanh(beta*x) +* HardSigmoid(x) - min(max(alpha*x + beta, 0), 1) +* Elu(x) - x if x >= 0 else alpha*(e^x - 1) +* Softsign(x) - x/(1 + |x|) +* Softplus(x) - log(1 + e^x) + +Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): + +* it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) +* ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) +* ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) +* Ct = ft (.) Ct-1 + it (.) ct +* ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) +* Ht = ot (.) 
h(Ct) +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + LSTM, + 14, + OpSchema() + .SetDoc(GET_OP_DOC_STR(std::string(LSTM_ver14_doc) + GenerateOptionalArgumentsDoc())) + .Attr( + "activations", + "A list of 3 (or 6 if bidirectional) activation functions " + "for input, output, forget, cell, and hidden. The activation functions must " + "be one of the activation functions specified above. Optional: See the equations " + "for default if not specified.", + AttributeProto::STRINGS, + OPTIONAL_VALUE) + .Attr( + "layout", + "The shape format of inputs X, initial_h, initial_c and outputs Y, Y_h, Y_c. " + "If 0, the following shapes are expected: " + "X.shape = [seq_length, batch_size, input_size], " + "Y.shape = [seq_length, num_directions, batch_size, hidden_size], " + "initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = " + "[num_directions, batch_size, hidden_size]. " + "If 1, the following shapes are expected: " + "X.shape = [batch_size, seq_length, input_size], " + "Y.shape = [batch_size, seq_length, num_directions, hidden_size], " + "initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = " + "[batch_size, num_directions, hidden_size].", + AttributeProto::INT, + static_cast(0)) + .Attr("input_forget", "Couple the input and forget gates if 1.", AttributeProto::INT, static_cast(0)) + .Input( + 1, + "W", + "The weight tensor for the gates. Concatenation of `W[iofc]` and " + "`WB[iofc]` (if bidirectional) along dimension 0. The tensor has shape " + "`[num_directions, 4*hidden_size, input_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 2, + "R", + "The recurrence weight tensor. Concatenation of `R[iofc]` and " + "`RB[iofc]` (if bidirectional) along dimension 0. This tensor has shape " + "`[num_directions, 4*hidden_size, hidden_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 3, + "B", + "The bias tensor for input gate. 
Concatenation of `[Wb[iofc], Rb[iofc]]`, " + "and `[WBb[iofc], RBb[iofc]]` (if bidirectional) along dimension 0. This " + "tensor has shape `[num_directions, 8*hidden_size]`. Optional: If not " + "specified - assumed to be 0.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable) + .Input( + 6, + "initial_c", + "Optional initial value of the cell. If not specified - assumed " + "to be 0. It has shape `[num_directions, batch_size, hidden_size]`.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .Input( + 7, + "P", + "The weight tensor for peepholes. Concatenation of `P[iof]` and " + "`PB[iof]` (if bidirectional) along dimension 0. It has shape " + "`[num_directions, 3*hidde_size]`. Optional: If not specified - " + "assumed to be 0.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable) + .FillUsing(RNNDocGenerator_opset14("LSTM")) + .Output( + 2, + "Y_c", + "The last output value of the cell. It has shape " + "`[num_directions, batch_size, hidden_size]`.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable)); + +static const char* RNN_ver14_doc = R"DOC( +Computes an one-layer simple RNN. This operator is usually supported +via some custom implementation such as CuDNN. 
+ +Notations: + +* `X` - input tensor +* `i` - input gate +* `t` - time step (t-1 means previous time step) +* `Wi` - W parameter weight matrix for input gate +* `Ri` - R recurrence weight matrix for input gate +* `Wbi` - W parameter bias vector for input gate +* `Rbi` - R parameter bias vector for input gate +* `WBi` - W parameter weight matrix for backward input gate +* `RBi` - R recurrence weight matrix for backward input gate +* `WBbi` - WR bias vectors for backward input gate +* `RBbi` - RR bias vectors for backward input gate +* `H` - Hidden state +* `num_directions` - 2 if direction == bidirectional else 1 + +Activation functions: + +* Relu(x) - max(0, x) +* Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) +* Sigmoid(x) - 1/(1 + e^{-x}) + +NOTE: Below are optional + +* Affine(x) - alpha*x + beta +* LeakyRelu(x) - x if x >= 0 else alpha * x +* ThresholdedRelu(x) - x if x >= alpha else 0 +* ScaledTanh(x) - alpha*Tanh(beta*x) +* HardSigmoid(x) - min(max(alpha*x + beta, 0), 1) +* Elu(x) - x if x >= 0 else alpha*(e^x - 1) +* Softsign(x) - x/(1 + |x|) +* Softplus(x) - log(1 + e^x) + +Equations (Default: f=Tanh): + +* Ht = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Wbi + Rbi) +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + RNN, + 14, + OpSchema() + .SetDoc(GET_OP_DOC_STR(std::string(RNN_ver14_doc) + GenerateOptionalArgumentsDoc())) + .Attr( + "activations", + "One (or two if bidirectional) activation function for " + "input gate. The activation function must be one of the activation " + "functions specified above. Optional: Default `Tanh` if not specified.", + AttributeProto::STRINGS, + std::vector{"Tanh", "Tanh"}) + .Input( + 1, + "W", + "The weight tensor for input gate. Concatenation of `Wi` and `WBi` " + "(if bidirectional). The tensor has shape " + "`[num_directions, hidden_size, input_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 2, + "R", + "The recurrence weight tensor. Concatenation of `Ri` and `RBi` " + "(if bidirectional). 
The tensor has shape " + "`[num_directions, hidden_size, hidden_size]`.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 3, + "B", + "The bias tensor for input gate. Concatenation of `[Wbi, Rbi]` " + "and `[WBbi, RBbi]` (if bidirectional). The tensor has shape " + "`[num_directions, 2*hidden_size]`. Optional: If not specified - assumed " + "to be 0.", + "T", + OpSchema::Optional, + true, + 1, + OpSchema::Differentiable) + .FillUsing(RNNDocGenerator_opset14("RNN"))); + std::function RNNDocGeneratorOld(const char* /*name*/) { return [=](OpSchema& schema) { schema.Attr( @@ -243,8 +758,8 @@ void RNNShapeInference1(InferenceContext& ctx) { // Documentation suggests that the output Y is absent in this case // Different tests seem to disagree on whether Y_h and Y_c, if present, // should be in positions 0 & 1 or 1 & 2. updateOutputShape(ctx, 0, - // {num_directions, batch_size, hidden_size}); // Y_h if (num_outputs > 1) - // updateOutputShape(ctx, 1, {num_directions, batch_size, hidden_size}); // + // {num_directions, batch_size, hidden_size}); // Y_h if (num_outputs > 1) + // updateOutputShape(ctx, 1, {num_directions, batch_size, hidden_size}); // // Y_c } } diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h index c012e9e8f13..54e1b93fdde 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -764,12 +764,36 @@ class OpSchema final { return all_tensor_types_ir4; } + static const std::vector& all_non_complex_numeric_types_plus_bool_ir4() { + static const std::vector all_non_complex_numeric_types_plus_bool_ir4 = { + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(bfloat16)", + "tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(bool)"}; + return all_non_complex_numeric_types_plus_bool_ir4; + } + static const std::vector& all_float_types_ir4() { static const std::vector all_float_types_ir4 = { 
"tensor(bfloat16)", "tensor(float16)", "tensor(float)", "tensor(double)"}; return all_float_types_ir4; } + static const std::vector& all_float_types_plus_Xint8_ir4() { + static const std::vector all_float_types_plus_Xint8_ir4 = { + "tensor(bfloat16)", "tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", "tensor(uint8)"}; + return all_float_types_plus_Xint8_ir4; + } + static const std::vector& all_float_types_ir9() { static const std::vector all_float_types_ir9 = { "tensor(bfloat16)", @@ -810,6 +834,16 @@ class OpSchema final { return all_tensor_types_ir10; } + static const std::vector& all_non_complex_tensor_types_ir10() { + static const std::vector all_non_complex_tensor_types_ir10 = { + "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", + "tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", + "tensor(bfloat16)", "tensor(float16)", "tensor(float)", "tensor(double)", + "tensor(string)", "tensor(bool)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}; + return all_non_complex_tensor_types_ir10; + } + static const std::vector& all_tensor_sequence_types() { static const std::vector all_tensor_sequence_types = { "seq(tensor(uint8))", @@ -1176,7 +1210,7 @@ class OpSchemaRegistry final : public ISchemaRegistry { // operator schema on specific domain. Update the lowest version when it's // determined to remove too old version history. map_[ONNX_DOMAIN] = std::make_pair(1, 22); - map_[AI_ONNX_ML_DOMAIN] = std::make_pair(1, 5); + map_[AI_ONNX_ML_DOMAIN] = std::make_pair(1, 6); map_[AI_ONNX_TRAINING_DOMAIN] = std::make_pair(1, 1); // ONNX's preview domain contains operators subject to change, so // versining is not meaningful and that domain should have only one @@ -1185,8 +1219,8 @@ class OpSchemaRegistry final : public ISchemaRegistry { // Version corresponding last release of ONNX. 
Update this to match with // the max version above in a *release* version of ONNX. But in other // versions, the max version may be ahead of the last-release-version. - last_release_version_map_[ONNX_DOMAIN] = 21; - last_release_version_map_[AI_ONNX_ML_DOMAIN] = 5; + last_release_version_map_[ONNX_DOMAIN] = 22; + last_release_version_map_[AI_ONNX_ML_DOMAIN] = 6; last_release_version_map_[AI_ONNX_TRAINING_DOMAIN] = 1; last_release_version_map_[AI_ONNX_PREVIEW_TRAINING_DOMAIN] = 2; } diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc index 5cad024803d..c27620db5ee 100644 --- a/onnx/defs/tensor/defs.cc +++ b/onnx/defs/tensor/defs.cc @@ -136,7 +136,7 @@ ONNX_OPERATOR_SET_SCHEMA( PropagateShapeDataFromInputToOutput(ctx, 0); })); -static const char* CastLike_ver19_doc = R"DOC( +static const char* CastLike_ver21_doc = R"DOC( The operator casts the elements of a given input tensor (the first input) to the same data type as the elements of the second input tensor. See documentation of the Cast operator for further details. @@ -146,7 +146,7 @@ ONNX_OPERATOR_SET_SCHEMA( CastLike, 21, OpSchema() - .SetDoc(CastLike_ver19_doc) + .SetDoc(CastLike_ver21_doc) .Attr( "saturate", "The parameter defines how the conversion behaves if an input value is out of " @@ -176,19 +176,11 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::Differentiable) .TypeConstraint( "T1", - {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", - "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", - "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(bool)", - "tensor(string)", "tensor(bfloat16)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}, + OpSchema::all_non_complex_tensor_types_ir10(), "Constrain input types. 
Casting from complex is not supported.") .TypeConstraint( "T2", - {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", - "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", - "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(bool)", - "tensor(string)", "tensor(bfloat16)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}, + OpSchema::all_non_complex_tensor_types_ir10(), "Constrain output types. Casting to complex is not supported.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 1, 0); @@ -2324,7 +2316,7 @@ ONNX_OPERATOR_SET_SCHEMA( .SetDoc(Resize_ver19_doc) .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { resizeShapeInference_opset18_to_19(ctx); })); -static const char* GridSample_ver20_doc = R"DOC( +static const char* GridSample_ver22_doc = R"DOC( Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`. For spatial input `X` with shape (N, C, H, W), the `grid` will have shape (N, H_out, W_out, 2), the output `Y` will have shape (N, C, H_out, W_out). 
For volumetric input `X` with shape (N, C, D, H, W), @@ -2347,7 +2339,7 @@ See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/ge ONNX_OPERATOR_SET_SCHEMA( GridSample, - 20, + 22, OpSchema() .Attr( "mode", @@ -2413,13 +2405,10 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::Differentiable) .TypeConstraint( "T1", - OpSchema::all_tensor_types(), + OpSchema::all_tensor_types_ir4(), "Constrain input `X` and output `Y` types to all tensor types.") - .TypeConstraint( - "T2", - {"tensor(float16)", "tensor(float)", "tensor(double)"}, - "Constrain grid types to float tensors.") - .SetDoc(GridSample_ver20_doc) + .TypeConstraint("T2", OpSchema::all_float_types_ir4(), "Constrain grid types to float tensors.") + .SetDoc(GridSample_ver22_doc) .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { gridSampleShapeInference(ctx); })); static const char* AffineGrid_ver20_doc = R"DOC( diff --git a/onnx/defs/tensor/old.cc b/onnx/defs/tensor/old.cc index b7188b84b46..3dbcc756018 100644 --- a/onnx/defs/tensor/old.cc +++ b/onnx/defs/tensor/old.cc @@ -13,6 +13,104 @@ namespace ONNX_NAMESPACE { +static const char* GridSample_ver20_doc = R"DOC( +Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`. +For spatial input `X` with shape (N, C, H, W), the `grid` will have shape (N, H_out, W_out, 2), +the output `Y` will have shape (N, C, H_out, W_out). For volumetric input `X` with shape (N, C, D, H, W), +the `grid` will have shape (N, D_out, H_out, W_out, 3), the output `Y` will have shape (N, C, D_out, H_out, W_out). +More generally, for an input `X` of rank r+2 with shape (N, C, d1, d2, ..., dr), +the `grid` will have shape (N, D1_out, D2_out, ..., Dr_out, r), the output `Y` will have shape (N, C, D1_out, D2_out, ..., Dr_out). + +The tensor `X` contains values at centers of square pixels (voxels, etc) locations such as (n, c, d1_in, d2_in, ..., dr_in). 
+The (n, d1_out, d2_out, ..., dr_out, :) values from the tensor `grid` are the normalized positions for interpolating the values +at the (n, c, d1_out, d2_out, ..., dr_out) locations from the output tensor `Y` using a specified interpolation method (the mode) +and a padding mode (for `grid` positions falling outside the 2-dimensional image). + +For example, the values in `grid[n, h_out, w_out, :]` are size-2 vectors specifying normalized positions in the 2-dimensional space of `X`. +They are used to interpolate output values of `Y[n, c, h_out, w_out]`. + +The GridSample operator is often used in doing grid generator and sampler in the +[Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). +See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html). +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + GridSample, + 20, + OpSchema() + .Attr( + "mode", + "Three interpolation modes: linear (default), nearest and cubic. " + "The \"linear\" mode includes linear and N-linear interpolation modes depending on the number of spatial dimensions " + "of the input tensor (i.e. linear for 1 spatial dimension, bilinear for 2 spatial dimensions, etc.). " + "The \"cubic\" mode also includes N-cubic interpolation modes following the same rules. The \"nearest\" mode rounds " + "to the nearest even index when the sampling point falls halfway between two indices.", + AttributeProto::STRING, + std::string("linear")) + .Attr( + "padding_mode", + "Support padding modes for outside grid values: `zeros`(default), `border`, `reflection`. " + "zeros: use 0 for out-of-bound grid locations, " + "border: use border values for out-of-bound grid locations, " + "reflection: use values at locations reflected by the border for out-of-bound grid locations. " + "If index 0 represents the margin pixel, the reflected value at index -1 will be the same as the value at index 1. 
" + "For location far away from the border, it will keep being reflected until becoming in bound. " + "If pixel location x = -3.5 reflects by border -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' = 0.5.", + AttributeProto::STRING, + std::string("zeros")) + .Attr( + "align_corners", + "If align_corners=1, the extrema (-1 and 1) are considered as referring to the center points of the input's corner pixels (voxels, etc.). " + "If align_corners=0, they are instead considered as referring to the corner points of the input's corner pixels (voxels, etc.), " + "making the sampling more resolution agnostic.", + AttributeProto::INT, + static_cast(0)) + .Input( + 0, + "X", + "Input tensor of rank r+2 that has shape (N, C, D1, D2, ..., Dr), where N is the batch size, " + "C is the number of channels, D1, D2, ..., Dr are the spatial dimensions.", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "grid", + "Input offset of shape (N, D1_out, D2_out, ..., Dr_out, r), where D1_out, D2_out, ..., " + "Dr_out are the spatial dimensions of the grid and output, and r is the number of spatial dimensions. " + "Grid specifies the sampling locations normalized by the input spatial dimensions. " + "Therefore, it should have most values in the range of [-1, 1]. If the grid has values outside the range of [-1, 1], " + "the corresponding outputs will be handled as defined by padding_mode. Following computer vision convention, " + "the coordinates in the length-r location vector are listed from the innermost tensor dimension to the outermost, " + "the opposite of regular tensor indexing.", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "Y", + "Output tensor of rank r+2 that has shape (N, C, D1_out, D2_out, ..., Dr_out) of the sampled values. 
" + "For integer input types, intermediate values are computed as floating point and cast to integer at the end.", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T1", + OpSchema::all_tensor_types(), + "Constrain input `X` and output `Y` types to all tensor types.") + .TypeConstraint( + "T2", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain grid types to float tensors.") + .SetDoc(GridSample_ver20_doc) + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { gridSampleShapeInference(ctx); })); + static const char* Cast_ver19_doc = R"DOC( The operator casts the elements of a given input tensor to a data type specified by the 'to' argument and returns an output tensor of the same size in diff --git a/onnx/helper.py b/onnx/helper.py index 0b6512c769b..fc824c09907 100644 --- a/onnx/helper.py +++ b/onnx/helper.py @@ -76,7 +76,7 @@ ("1.14.1", 9, 19, 3, 1), ("1.15.0", 9, 20, 4, 1), ("1.16.0", 10, 21, 5, 1), - ("1.17.0", 10, 21, 5, 1), + ("1.17.0", 10, 22, 6, 1), ] VersionMapType = Dict[Tuple[str, int], int] diff --git a/onnx/test/cpp/schema_registration_test.cc b/onnx/test/cpp/schema_registration_test.cc index 5ca90cf85ef..3745cb7c2ab 100644 --- a/onnx/test/cpp/schema_registration_test.cc +++ b/onnx/test/cpp/schema_registration_test.cc @@ -73,11 +73,11 @@ TEST(SchemaRegistrationTest, RegisterAndDeregisterAllOpsetSchemaVersion) { // Acos-7 // Add-1,6,7,13,14 // Trilu-14 - auto schema = OpSchemaRegistry::Schema("Acos"); + auto schema = OpSchemaRegistry::Schema("Acos", 7); EXPECT_NE(nullptr, schema); EXPECT_EQ(schema->SinceVersion(), 7); - schema = OpSchemaRegistry::Schema("Add"); + schema = OpSchemaRegistry::Schema("Add", 14); EXPECT_NE(nullptr, schema); EXPECT_EQ(schema->SinceVersion(), 14); @@ -121,7 +121,7 @@ TEST(SchemaRegistrationTest, RegisterSpecifiedOpsetSchemaVersion) { EXPECT_EQ(nullptr, opSchema); // Acos-7 is the latest Acos before specified 13 - opSchema = OpSchemaRegistry::Schema("Acos"); + 
opSchema = OpSchemaRegistry::Schema("Acos", 13); EXPECT_NE(nullptr, opSchema); EXPECT_EQ(opSchema->SinceVersion(), 7); #endif diff --git a/onnx/test/test_backend_onnxruntime.py b/onnx/test/test_backend_onnxruntime.py index df701a052de..4fdc5c7d834 100644 --- a/onnx/test/test_backend_onnxruntime.py +++ b/onnx/test/test_backend_onnxruntime.py @@ -185,6 +185,268 @@ def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs): "|test_transpose_" "|test_unsqueeze_" "|test_wrap_pad_" + "|test_acos_cpu" + "|test_acos_example_cpu" + "|test_acosh_cpu" + "|test_acosh_example_cpu" + "|test_asin_cpu" + "|test_asin_example_cpu" + "|test_asinh_cpu" + "|test_asinh_example_cpu" + "|test_atan_cpu" + "|test_atan_example_cpu" + "|test_atanh_cpu" + "|test_atanh_example_cpu" + "|test_averagepool_1d_default_cpu" + "|test_averagepool_2d_ceil_cpu" + "|test_averagepool_2d_default_cpu" + "|test_averagepool_2d_dilations_cpu" + "|test_averagepool_2d_pads_count_include_pad_cpu" + "|test_averagepool_2d_pads_cpu" + "|test_averagepool_2d_precomputed_pads_count_include_pad_cpu" + "|test_averagepool_2d_precomputed_pads_cpu" + "|test_averagepool_2d_precomputed_same_upper_cpu" + "|test_averagepool_2d_precomputed_strides_cpu" + "|test_averagepool_2d_same_lower_cpu" + "|test_averagepool_2d_same_upper_cpu" + "|test_averagepool_2d_strides_cpu" + "|test_averagepool_3d_default_cpu" + "|test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_False_cpu" + "|test_averagepool_3d_dilations_large_count_include_pad_is_0_ceil_mode_is_True_cpu" + "|test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_False_cpu" + "|test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True_cpu" + "|test_averagepool_3d_dilations_small_cpu" + "|test_basic_conv_with_padding_cpu" + "|test_basic_conv_without_padding_cpu" + "|test_conv_with_autopad_same_cpu" + "|test_conv_with_strides_and_asymmetric_padding_cpu" + "|test_conv_with_strides_no_padding_cpu" + 
"|test_conv_with_strides_padding_cpu" + "|test_convtranspose_1d_cpu" + "|test_convtranspose_3d_cpu" + "|test_convtranspose_autopad_same_cpu" + "|test_convtranspose_cpu" + "|test_convtranspose_dilations_cpu" + "|test_convtranspose_kernel_shape_cpu" + "|test_convtranspose_output_shape_cpu" + "|test_convtranspose_pad_cpu" + "|test_convtranspose_pads_cpu" + "|test_cos_cpu" + "|test_cos_example_cpu" + "|test_cosh_cpu" + "|test_cosh_example_cpu" + "|test_det_2d_cpu" + "|test_det_nd_cpu" + "|test_dropout_default_cpu" + "|test_dropout_default_mask_cpu" + "|test_dropout_default_mask_ratio_cpu" + "|test_dropout_default_ratio_cpu" + "|test_elu_cpu" + "|test_elu_default_cpu" + "|test_elu_example_cpu" + "|test_eyelike_populate_off_main_diagonal_cpu" + "|test_eyelike_with_dtype_cpu" + "|test_eyelike_without_dtype_cpu" + "|test_globalaveragepool_cpu" + "|test_globalaveragepool_precomputed_cpu" + "|test_gridsample_aligncorners_true_cpu" + "|test_gridsample_bicubic_align_corners_0_additional_1_cpu" + "|test_gridsample_bicubic_align_corners_1_additional_1_cpu" + "|test_gridsample_bicubic_cpu" + "|test_gridsample_bilinear_align_corners_0_additional_1_cpu" + "|test_gridsample_bilinear_align_corners_1_additional_1_cpu" + "|test_gridsample_bilinear_cpu" + "|test_gridsample_border_padding_cpu" + "|test_gridsample_cpu" + "|test_gridsample_nearest_align_corners_0_additional_1_cpu" + "|test_gridsample_nearest_align_corners_1_additional_1_cpu" + "|test_gridsample_nearest_cpu" + "|test_gridsample_reflection_padding_cpu" + "|test_gridsample_volumetric_bilinear_align_corners_0_cpu" + "|test_gridsample_volumetric_bilinear_align_corners_1_cpu" + "|test_gridsample_volumetric_nearest_align_corners_0_cpu" + "|test_gridsample_volumetric_nearest_align_corners_1_cpu" + "|test_gridsample_zeros_padding_cpu" + "|test_gru_defaults_cpu" + "|test_gru_seq_length_cpu" + "|test_gru_with_initial_bias_cpu" + "|test_hardsigmoid_cpu" + "|test_hardsigmoid_default_cpu" + "|test_hardsigmoid_example_cpu" + 
"|test_hardswish_cpu" + "|test_hardswish_expanded_cpu" + "|test_lppool_1d_default_cpu" + "|test_lppool_2d_default_cpu" + "|test_lppool_2d_dilations_cpu" + "|test_lppool_2d_pads_cpu" + "|test_lppool_2d_same_lower_cpu" + "|test_lppool_2d_same_upper_cpu" + "|test_lppool_2d_strides_cpu" + "|test_lppool_3d_default_cpu" + "|test_lstm_defaults_cpu" + "|test_lstm_with_initial_bias_cpu" + "|test_lstm_with_peepholes_cpu" + "|test_maxpool_1d_default_cpu" + "|test_maxpool_2d_ceil_cpu" + "|test_maxpool_2d_default_cpu" + "|test_maxpool_2d_dilations_cpu" + "|test_maxpool_2d_pads_cpu" + "|test_maxpool_2d_precomputed_pads_cpu" + "|test_maxpool_2d_precomputed_same_upper_cpu" + "|test_maxpool_2d_precomputed_strides_cpu" + "|test_maxpool_2d_same_lower_cpu" + "|test_maxpool_2d_same_upper_cpu" + "|test_maxpool_2d_strides_cpu" + "|test_maxpool_2d_uint8_cpu" + "|test_maxpool_3d_default_cpu" + "|test_maxpool_3d_dilations_cpu" + "|test_maxpool_3d_dilations_use_ref_impl_cpu" + "|test_maxpool_3d_dilations_use_ref_impl_large_cpu" + "|test_maxpool_with_argmax_2d_precomputed_pads_cpu" + "|test_maxpool_with_argmax_2d_precomputed_strides_cpu" + "|test_maxunpool_export_without_output_shape_cpu" + "|test_mish_cpu" + "|test_mish_expanded_cpu" + "|test_nllloss_NC_cpu" + "|test_nllloss_NC_expanded_cpu" + "|test_nllloss_NCd1_cpu" + "|test_nllloss_NCd1_expanded_cpu" + "|test_nllloss_NCd1_ii_cpu" + "|test_nllloss_NCd1_ii_expanded_cpu" + "|test_nllloss_NCd1_mean_weight_negative_ii_cpu" + "|test_nllloss_NCd1_mean_weight_negative_ii_expanded_cpu" + "|test_nllloss_NCd1_weight_cpu" + "|test_nllloss_NCd1_weight_expanded_cpu" + "|test_nllloss_NCd1_weight_ii_cpu" + "|test_nllloss_NCd1_weight_ii_expanded_cpu" + "|test_nllloss_NCd1d2_cpu" + "|test_nllloss_NCd1d2_expanded_cpu" + "|test_nllloss_NCd1d2_no_weight_reduction_mean_ii_cpu" + "|test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded_cpu" + "|test_nllloss_NCd1d2_reduction_mean_cpu" + "|test_nllloss_NCd1d2_reduction_mean_expanded_cpu" + 
"|test_nllloss_NCd1d2_reduction_sum_cpu" + "|test_nllloss_NCd1d2_reduction_sum_expanded_cpu" + "|test_nllloss_NCd1d2_with_weight_cpu" + "|test_nllloss_NCd1d2_with_weight_expanded_cpu" + "|test_nllloss_NCd1d2_with_weight_reduction_mean_cpu" + "|test_nllloss_NCd1d2_with_weight_reduction_mean_expanded_cpu" + "|test_nllloss_NCd1d2_with_weight_reduction_sum_cpu" + "|test_nllloss_NCd1d2_with_weight_reduction_sum_expanded_cpu" + "|test_nllloss_NCd1d2_with_weight_reduction_sum_ii_cpu" + "|test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded_cpu" + "|test_nllloss_NCd1d2d3_none_no_weight_negative_ii_cpu" + "|test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded_cpu" + "|test_nllloss_NCd1d2d3_sum_weight_high_ii_cpu" + "|test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded_cpu" + "|test_nllloss_NCd1d2d3d4d5_mean_weight_cpu" + "|test_nllloss_NCd1d2d3d4d5_mean_weight_expanded_cpu" + "|test_nllloss_NCd1d2d3d4d5_none_no_weight_cpu" + "|test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded_cpu" + "|test_rnn_seq_length_cpu" + "|test_roialign_aligned_false_cpu" + "|test_roialign_aligned_true_cpu" + "|test_roialign_mode_max_cpu" + "|test_round_cpu" + "|test_selu_cpu" + "|test_selu_default_cpu" + "|test_selu_example_cpu" + "|test_simple_rnn_defaults_cpu" + "|test_simple_rnn_with_initial_bias_cpu" + "|test_sin_cpu" + "|test_sin_example_cpu" + "|test_sinh_cpu" + "|test_sinh_example_cpu" + "|test_softplus_cpu" + "|test_softplus_example_cpu" + "|test_softsign_cpu" + "|test_softsign_example_cpu" + "|test_tan_cpu" + "|test_tan_example_cpu" + "|test_thresholdedrelu_cpu" + "|test_thresholdedrelu_default_cpu" + "|test_thresholdedrelu_example_cpu" + "|test_resize_downsample_scales_cubic_A_n0p5_exclude_outside_cpu" + "|test_resize_downsample_scales_cubic_antialias_cpu" + "|test_resize_downsample_scales_cubic_cpu" + "|test_resize_downsample_scales_linear_antialias_cpu" + "|test_resize_downsample_scales_linear_cpu" + "|test_resize_downsample_scales_linear_half_pixel_symmetric_cpu" + 
"|test_resize_downsample_scales_nearest_cpu" + "|test_resize_downsample_sizes_cubic_antialias_cpu" + "|test_resize_downsample_sizes_cubic_cpu" + "|test_resize_downsample_sizes_linear_antialias_cpu" + "|test_resize_downsample_sizes_linear_pytorch_half_pixel_cpu" + "|test_resize_downsample_sizes_nearest_cpu" + "|test_resize_downsample_sizes_nearest_not_larger_cpu" + "|test_resize_downsample_sizes_nearest_not_smaller_cpu" + "|test_resize_tf_crop_and_resize_axes_2_3_cpu" + "|test_resize_tf_crop_and_resize_axes_3_2_cpu" + "|test_resize_tf_crop_and_resize_cpu" + "|test_resize_upsample_scales_cubic_A_n0p5_exclude_outside_cpu" + "|test_resize_upsample_scales_cubic_align_corners_cpu" + "|test_resize_upsample_scales_cubic_asymmetric_cpu" + "|test_resize_upsample_scales_cubic_cpu" + "|test_resize_upsample_scales_linear_align_corners_cpu" + "|test_resize_upsample_scales_linear_cpu" + "|test_resize_upsample_scales_linear_half_pixel_symmetric_cpu" + "|test_resize_upsample_scales_nearest_axes_2_3_cpu" + "|test_resize_upsample_scales_nearest_axes_3_2_cpu" + "|test_resize_upsample_scales_nearest_cpu" + "|test_resize_upsample_sizes_cubic_cpu" + "|test_resize_upsample_sizes_nearest_axes_2_3_cpu" + "|test_resize_upsample_sizes_nearest_axes_3_2_cpu" + "|test_resize_upsample_sizes_nearest_ceil_half_pixel_cpu" + "|test_resize_upsample_sizes_nearest_cpu" + "|test_resize_upsample_sizes_nearest_floor_align_corners_cpu" + "|test_resize_upsample_sizes_nearest_not_larger_cpu" + "|test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric_cpu" + "|test_qlinearmatmul_2D_uint8_float32_cuda" + "|test_qlinearmatmul_2D_int8_float32_cpu" + "|test_image_decoder_decode_jpeg_rgb_cpu" + "|test_basic_deform_conv_without_padding_cuda" + "|test_qlinearmatmul_3D_int8_float16_cuda" + "|test_image_decoder_decode_bmp_rgb_cuda" + "|test_qlinearmatmul_2D_uint8_float16_cpu" + "|test_image_decoder_decode_jpeg2k_rgb_cuda" + "|test_image_decoder_decode_jpeg_bgr_cuda" + "|test_qlinearmatmul_3D_uint8_float32_cpu" 
+ "|test_qlinearmatmul_3D_uint8_float16_cuda" + "|test_deform_conv_with_mask_bias_cpu" + "|test_qlinearmatmul_2D_int8_float16_cuda" + "|test_image_decoder_decode_jpeg_grayscale_cpu" + "|test_basic_deform_conv_without_padding_cpu" + "|test_qlinearmatmul_3D_int8_float32_cuda" + "|test_qlinearmatmul_3D_int8_float16_cpu" + "|test_qlinearmatmul_2D_int8_float32_cuda" + "|test_deform_conv_with_mask_bias_cuda" + "|test_image_decoder_decode_tiff_rgb_cuda" + "|test_image_decoder_decode_jpeg2k_rgb_cpu" + "|test_image_decoder_decode_jpeg_rgb_cuda" + "|test_image_decoder_decode_jpeg_grayscale_cuda" + "|test_qlinearmatmul_3D_uint8_float32_cuda" + "|test_image_decoder_decode_png_rgb_cpu" + "|test_image_decoder_decode_png_rgb_cuda" + "|test_image_decoder_decode_bmp_rgb_cpu" + "|test_qlinearmatmul_3D_uint8_float16_cpu" + "|test_deform_conv_with_multiple_offset_groups_cuda" + "|test_image_decoder_decode_webp_rgb_cpu" + "|test_basic_deform_conv_with_padding_cpu" + "|test_qlinearmatmul_2D_uint8_float16_cuda" + "|test_image_decoder_decode_webp_rgb_cuda" + "|test_basic_deform_conv_with_padding_cuda" + "|test_image_decoder_decode_pnm_rgb_cpu" + "|test_qlinearmatmul_3D_int8_float32_cpu" + "|test_image_decoder_decode_jpeg_bgr_cpu" + "|test_qlinearmatmul_2D_int8_float16_cpu" + "|test_image_decoder_decode_pnm_rgb_cuda" + "|test_deform_conv_with_multiple_offset_groups_cpu" + "|test_qlinearmatmul_2D_uint8_float32_cpu" + "|test_image_decoder_decode_tiff_rgb_cpu" + "|test_globalmaxpool_cpu" + "|test_globalmaxpool_precomputed_cpu" + "|test_instancenorm_example_cpu" + "|test_instancenorm_epsilon_cpu" ")" ) diff --git a/onnx/version_converter/convert.h b/onnx/version_converter/convert.h index 189f63d64ac..2d4410d565e 100644 --- a/onnx/version_converter/convert.h +++ b/onnx/version_converter/convert.h @@ -671,6 +671,113 @@ class DefaultVersionConverter : public BaseVersionConverter { registerAdapter(std::make_unique("Squeeze", OpSetID(21), OpSetID(20), ir10_types_not_in_ir4)); 
registerAdapter(std::make_unique<TypeRestriction>("Transpose", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); registerAdapter(std::make_unique<TypeRestriction>("Unsqueeze", OpSetID(21), OpSetID(20), ir10_types_not_in_ir4)); + + /******** 21 -> 22 ********/ + registerAdapter(std::make_unique<CompatibleAdapter>("EyeLike", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("RandomUniform", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("RandomNormal", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("RandomUniformLike", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("RandomNormalLike", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Multinomial", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Bernoulli", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("ThresholdedRelu", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Selu", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Elu", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Mish", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("HardSigmoid", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("HardSwish", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Softsign", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Softplus", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Sin", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Cos", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Tan", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Asin", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Acos", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Atan", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Sinh", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Cosh", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Asinh", 
OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Acosh", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Atanh", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Round", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Det", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("NegativeLogLikelihoodLoss", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("AveragePool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("MaxPool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("MaxUnpool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("LpPool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("MaxRoiPool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Conv", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("ConvTranspose", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("DeformConv", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("GlobalAveragePool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("GlobalMaxPool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("GlobalLpPool", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("InstanceNormalization", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("LpNormalization", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("Dropout", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("RoiAlign", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("RNN", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("GRU", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("LSTM", OpSetID(21), OpSetID(22))); + registerAdapter(std::make_unique<CompatibleAdapter>("GridSample", OpSetID(21), OpSetID(22))); + + /******** 22 -> 21 ********/ + const std::vector<TensorProto_DataType> bfloat16_not_allowed = {TensorProto_DataType_BFLOAT16}; + 
registerAdapter(std::make_unique<TypeRestriction>("EyeLike", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("AveragePool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("MaxPool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("RandomUniform", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("RandomNormal", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + std::make_unique<TypeRestriction>("RandomNormalLike", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + std::make_unique<TypeRestriction>("RandomUniformLike", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Multinomial", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Bernoulli", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + std::make_unique<TypeRestriction>("ThresholdedRelu", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Selu", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Elu", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Mish", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("HardSigmoid", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("HardSwish", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Softsign", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Softplus", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Sin", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Cos", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Tan", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + 
registerAdapter(std::make_unique<TypeRestriction>("Asin", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Acos", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Atan", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Sinh", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Cosh", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Asinh", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Acosh", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Atanh", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Round", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Det", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + std::make_unique<TypeRestriction>("NegativeLogLikelihoodLoss", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("MaxUnpool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("LpPool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("MaxRoiPool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Conv", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("ConvTranspose", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("DeformConv", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + std::make_unique<TypeRestriction>("GlobalAveragePool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("GlobalLpPool", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + std::make_unique<TypeRestriction>("InstanceNormalization", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter( + 
std::make_unique<TypeRestriction>("LpNormalization", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("Dropout", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("RoiAlign", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("RNN", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("GRU", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("LSTM", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); + registerAdapter(std::make_unique<TypeRestriction>("GridSample", OpSetID(22), OpSetID(21), bfloat16_not_allowed)); } ModelProto convert_version(const ModelProto& mp_in, const OpSetID& initial_version, const OpSetID& target_version)