From d2ac75796cd9e2cdc697946c10539691a092910d Mon Sep 17 00:00:00 2001 From: galagam Date: Mon, 8 Jan 2024 15:51:32 +0200 Subject: [PATCH] Add INT4, UINT4 types (#5811) ### Description - Add INT4 and UINT4 quantized data types - Support for packing and unpacking int4x2->byte - Implementation of Operators: Cast, CastLike, DequantizeLinear, QuantizeLinear - Type support for non-compute operators Constant, ConstantOfShape, Identity, Reshape, Shape, Size, If, Loop, Scan, Flatten, Pad, Squeeze, Unsqueeze, Transpose. ### Motivation and Context See details in issue #5776 --------- Signed-off-by: Gal Hubara Agam Signed-off-by: galagam --- docs/Changelog.md | 1269 +++++++++++++++-- docs/IR.md | 4 +- docs/Operators.md | 363 +++-- docs/TestCoverage.md | 206 ++- docs/docsgen/source/technical/index.md | 1 + docs/docsgen/source/technical/int4.md | 55 + onnx/backend/test/case/node/cast.py | 73 +- .../test/case/node/dequantizelinear.py | 44 + onnx/backend/test/case/node/quantizelinear.py | 72 +- .../test_cast_BFLOAT16_to_FLOAT/model.onnx | Bin 138 -> 138 bytes .../node/test_cast_DOUBLE_to_FLOAT/model.onnx | Bin 136 -> 136 bytes .../test_cast_DOUBLE_to_FLOAT16/model.onnx | Bin 138 -> 138 bytes .../test_cast_FLOAT16_to_DOUBLE/model.onnx | Bin 138 -> 138 bytes .../test_cast_FLOAT16_to_FLOAT/model.onnx | Bin 137 -> 137 bytes .../model.onnx | Bin 144 -> 144 bytes .../model.onnx | Bin 146 -> 146 bytes .../model.onnx | Bin 142 -> 142 bytes .../model.onnx | Bin 146 -> 146 bytes .../node/test_cast_FLOAT16_to_INT4/model.onnx | Bin 0 -> 136 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 83 bytes .../test_data_set_0/output_0.pb | 1 + .../test_cast_FLOAT16_to_UINT4/model.onnx | Bin 0 -> 137 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 83 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 28 bytes .../model.onnx | Bin 144 -> 144 bytes .../model.onnx | Bin 146 -> 146 bytes .../model.onnx | Bin 142 -> 142 bytes .../model.onnx | Bin 144 -> 144 bytes .../model.onnx | Bin 144 -> 144 bytes 
.../model.onnx | Bin 146 -> 146 bytes .../test_cast_FLOAT8E5M2_to_FLOAT/model.onnx | Bin 140 -> 140 bytes .../model.onnx | Bin 142 -> 142 bytes .../test_cast_FLOAT_to_BFLOAT16/model.onnx | Bin 138 -> 138 bytes .../node/test_cast_FLOAT_to_DOUBLE/model.onnx | Bin 136 -> 136 bytes .../test_cast_FLOAT_to_FLOAT16/model.onnx | Bin 137 -> 137 bytes .../model.onnx | Bin 142 -> 142 bytes .../model.onnx | Bin 144 -> 144 bytes .../test_cast_FLOAT_to_FLOAT8E5M2/model.onnx | Bin 140 -> 140 bytes .../model.onnx | Bin 144 -> 144 bytes .../node/test_cast_FLOAT_to_INT4/model.onnx | Bin 0 -> 134 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 111 bytes .../test_data_set_0/output_0.pb | 1 + .../node/test_cast_FLOAT_to_STRING/model.onnx | Bin 136 -> 136 bytes .../node/test_cast_FLOAT_to_UINT4/model.onnx | Bin 0 -> 135 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 111 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 28 bytes .../node/test_cast_INT4_to_FLOAT/model.onnx | Bin 0 -> 134 bytes .../test_data_set_0/input_0.pb | 1 + .../test_data_set_0/output_0.pb | Bin 0 -> 111 bytes .../node/test_cast_INT4_to_FLOAT16/model.onnx | Bin 0 -> 136 bytes .../test_data_set_0/input_0.pb | 1 + .../test_data_set_0/output_0.pb | Bin 0 -> 83 bytes .../node/test_cast_INT4_to_INT8/model.onnx | Bin 0 -> 133 bytes .../test_data_set_0/input_0.pb | 1 + .../test_data_set_0/output_0.pb | Bin 0 -> 117 bytes .../node/test_cast_STRING_to_FLOAT/model.onnx | Bin 136 -> 136 bytes .../node/test_cast_UINT4_to_FLOAT/model.onnx | Bin 0 -> 135 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 28 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 111 bytes .../test_cast_UINT4_to_FLOAT16/model.onnx | Bin 0 -> 137 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 28 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 65 bytes .../node/test_cast_UINT4_to_UINT8/model.onnx | Bin 0 -> 135 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 28 bytes .../test_data_set_0/output_0.pb | Bin 0 -> 36 bytes .../model.onnx | Bin 174 -> 174 bytes 
.../model.onnx | Bin 176 -> 176 bytes .../model.onnx | Bin 172 -> 172 bytes .../model.onnx | Bin 176 -> 176 bytes .../model.onnx | Bin 172 -> 172 bytes .../model.onnx | Bin 174 -> 174 bytes .../model.onnx | Bin 170 -> 170 bytes .../model.onnx | Bin 174 -> 174 bytes .../model.onnx | Bin 162 -> 162 bytes .../model.onnx | Bin 191 -> 191 bytes .../test_castlike_DOUBLE_to_FLOAT/model.onnx | Bin 160 -> 160 bytes .../model.onnx | Bin 162 -> 162 bytes .../model.onnx | Bin 191 -> 191 bytes .../model.onnx | Bin 189 -> 189 bytes .../model.onnx | Bin 162 -> 162 bytes .../model.onnx | Bin 191 -> 191 bytes .../test_castlike_FLOAT16_to_FLOAT/model.onnx | Bin 161 -> 161 bytes .../model.onnx | Bin 190 -> 190 bytes .../model.onnx | Bin 168 -> 168 bytes .../model.onnx | Bin 197 -> 197 bytes .../model.onnx | Bin 166 -> 166 bytes .../model.onnx | Bin 195 -> 195 bytes .../model.onnx | Bin 168 -> 168 bytes .../model.onnx | Bin 197 -> 197 bytes .../model.onnx | Bin 164 -> 164 bytes .../model.onnx | Bin 193 -> 193 bytes .../model.onnx | Bin 162 -> 162 bytes .../model.onnx | Bin 191 -> 191 bytes .../test_castlike_FLOAT_to_DOUBLE/model.onnx | Bin 160 -> 160 bytes .../model.onnx | Bin 189 -> 189 bytes .../test_castlike_FLOAT_to_FLOAT16/model.onnx | Bin 161 -> 161 bytes .../model.onnx | Bin 190 -> 190 bytes .../model.onnx | Bin 166 -> 166 bytes .../model.onnx | Bin 168 -> 168 bytes .../model.onnx | Bin 197 -> 197 bytes .../model.onnx | Bin 195 -> 195 bytes .../model.onnx | Bin 164 -> 164 bytes .../model.onnx | Bin 168 -> 168 bytes .../model.onnx | Bin 197 -> 197 bytes .../model.onnx | Bin 193 -> 193 bytes .../test_castlike_FLOAT_to_STRING/model.onnx | Bin 160 -> 160 bytes .../model.onnx | Bin 189 -> 189 bytes .../test_castlike_STRING_to_FLOAT/model.onnx | Bin 160 -> 160 bytes .../model.onnx | Bin 189 -> 189 bytes .../test/data/node/test_constant/model.onnx | Bin 224 -> 224 bytes .../data/node/test_constant_pad/model.onnx | Bin 186 -> 186 bytes .../node/test_constant_pad_axes/model.onnx | Bin 
217 -> 217 bytes .../model.onnx | Bin 226 -> 226 bytes .../model.onnx | Bin 156 -> 156 bytes .../model.onnx | Bin 148 -> 148 bytes .../test_constantofshape_int_zeros/model.onnx | Bin 147 -> 147 bytes .../test_dequantizelinear_int4/model.onnx | Bin 0 -> 196 bytes .../test_data_set_0/input_0.pb | 1 + .../test_data_set_0/input_1.pb | Bin 0 -> 17 bytes .../test_data_set_0/input_2.pb | 2 + .../test_data_set_0/output_0.pb | Bin 0 -> 29 bytes .../test_dequantizelinear_uint4/model.onnx | Bin 0 -> 197 bytes .../test_data_set_0/input_0.pb | 1 + .../test_data_set_0/input_1.pb | Bin 0 -> 17 bytes .../test_data_set_0/input_2.pb | 2 + .../test_data_set_0/output_0.pb | Bin 0 -> 29 bytes .../test/data/node/test_edge_pad/model.onnx | Bin 154 -> 154 bytes .../data/node/test_flatten_axis0/model.onnx | Bin 124 -> 124 bytes .../data/node/test_flatten_axis1/model.onnx | Bin 124 -> 124 bytes .../data/node/test_flatten_axis2/model.onnx | Bin 124 -> 124 bytes .../data/node/test_flatten_axis3/model.onnx | Bin 124 -> 124 bytes .../node/test_flatten_default_axis/model.onnx | Bin 118 -> 118 bytes .../test_flatten_negative_axis1/model.onnx | Bin 142 -> 142 bytes .../test_flatten_negative_axis2/model.onnx | Bin 142 -> 142 bytes .../test_flatten_negative_axis3/model.onnx | Bin 142 -> 142 bytes .../test_flatten_negative_axis4/model.onnx | Bin 142 -> 142 bytes .../test/data/node/test_identity/model.onnx | Bin 115 -> 115 bytes .../node/test_identity_sequence/model.onnx | Bin 96 -> 96 bytes .../test_data_set_0/output_0.pb | Bin 2516 -> 2516 bytes .../node/test_mvn/test_data_set_0/output_0.pb | 2 +- .../test_data_set_0/output_0.pb | 2 +- .../test_data_set_0/output_0.pb | 2 +- .../test_data_set_0/output_0.pb | Bin 23 -> 14 bytes .../test_data_set_0/output_0.pb | Bin 23 -> 14 bytes .../node/test_quantizelinear_int4/model.onnx | Bin 0 -> 204 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 59 bytes .../test_data_set_0/input_1.pb | Bin 0 -> 27 bytes .../test_data_set_0/input_2.pb | 2 + 
.../test_data_set_0/output_0.pb | 1 + .../node/test_quantizelinear_uint4/model.onnx | Bin 0 -> 205 bytes .../test_data_set_0/input_0.pb | Bin 0 -> 59 bytes .../test_data_set_0/input_1.pb | Bin 0 -> 27 bytes .../test_data_set_0/input_2.pb | 2 + .../test_data_set_0/output_0.pb | Bin 0 -> 18 bytes .../data/node/test_reflect_pad/model.onnx | Bin 160 -> 160 bytes .../model.onnx | Bin 192 -> 192 bytes .../test_reshape_extended_dims/model.onnx | Bin 172 -> 172 bytes .../node/test_reshape_negative_dim/model.onnx | Bin 167 -> 167 bytes .../model.onnx | Bin 181 -> 181 bytes .../data/node/test_reshape_one_dim/model.onnx | Bin 154 -> 154 bytes .../node/test_reshape_reduced_dims/model.onnx | Bin 163 -> 163 bytes .../model.onnx | Bin 173 -> 173 bytes .../model.onnx | Bin 174 -> 174 bytes .../model.onnx | Bin 180 -> 180 bytes .../node/test_reshape_zero_dim/model.onnx | Bin 167 -> 167 bytes .../test/data/node/test_shape/model.onnx | Bin 93 -> 93 bytes .../data/node/test_shape_clip_end/model.onnx | Bin 114 -> 114 bytes .../node/test_shape_clip_start/model.onnx | Bin 127 -> 127 bytes .../data/node/test_shape_end_1/model.onnx | Bin 111 -> 111 bytes .../node/test_shape_end_negative_1/model.onnx | Bin 129 -> 129 bytes .../data/node/test_shape_example/model.onnx | Bin 97 -> 97 bytes .../data/node/test_shape_start_1/model.onnx | Bin 115 -> 115 bytes .../node/test_shape_start_1_end_2/model.onnx | Bin 133 -> 133 bytes .../model.onnx | Bin 151 -> 151 bytes .../test_shape_start_negative_1/model.onnx | Bin 133 -> 133 bytes .../test/data/node/test_size/model.onnx | Bin 87 -> 87 bytes .../data/node/test_size_example/model.onnx | Bin 91 -> 91 bytes .../test/data/node/test_squeeze/model.onnx | Bin 135 -> 135 bytes .../test_squeeze_negative_axes/model.onnx | Bin 149 -> 149 bytes .../model.onnx | Bin 170 -> 170 bytes .../model.onnx | Bin 170 -> 170 bytes .../model.onnx | Bin 170 -> 170 bytes .../model.onnx | Bin 170 -> 170 bytes .../model.onnx | Bin 170 -> 170 bytes .../model.onnx | Bin 170 -> 170 
bytes .../node/test_transpose_default/model.onnx | Bin 141 -> 141 bytes .../node/test_unsqueeze_axis_0/model.onnx | Bin 146 -> 146 bytes .../node/test_unsqueeze_axis_1/model.onnx | Bin 146 -> 146 bytes .../node/test_unsqueeze_axis_2/model.onnx | Bin 146 -> 146 bytes .../test_unsqueeze_negative_axes/model.onnx | Bin 162 -> 162 bytes .../node/test_unsqueeze_three_axes/model.onnx | Bin 159 -> 159 bytes .../node/test_unsqueeze_two_axes/model.onnx | Bin 153 -> 153 bytes .../test_unsqueeze_unsorted_axes/model.onnx | Bin 162 -> 162 bytes .../test/data/node/test_wrap_pad/model.onnx | Bin 154 -> 154 bytes onnx/checker.cc | 2 + onnx/defs/controlflow/defs.cc | 32 +- onnx/defs/controlflow/old.cc | 177 +++ onnx/defs/data_type_utils.cc | 2 + onnx/defs/generator/defs.cc | 27 +- onnx/defs/generator/old.cc | 115 ++ onnx/defs/nn/defs.cc | 11 +- onnx/defs/nn/old.cc | 46 + onnx/defs/operator_sets.h | 36 +- onnx/defs/quantization/defs.cc | 11 +- onnx/defs/schema.h | 79 +- onnx/defs/tensor/defs.cc | 139 +- onnx/defs/tensor/old.cc | 920 +++++++++++- onnx/defs/tensor/utils.cc | 9 +- onnx/defs/tensor/utils.h | 6 +- onnx/helper.py | 57 +- onnx/mapping.py | 9 + onnx/numpy_helper.py | 51 +- onnx/onnx-ml.proto | 4 + onnx/onnx-ml.proto3 | 4 + onnx/onnx.in.proto | 4 + onnx/onnx.proto | 4 + onnx/onnx.proto3 | 4 + onnx/reference/custom_element_types.py | 2 + onnx/reference/op_run.py | 27 +- onnx/reference/ops/op_cast.py | 22 + onnx/reference/ops/op_cast_like.py | 6 + onnx/reference/ops/op_constant.py | 4 + onnx/reference/ops/op_dequantize_linear.py | 33 +- onnx/reference/ops/op_quantize_linear.py | 43 +- onnx/subbyte.py | 72 + onnx/test/helper_test.py | 47 + onnx/test/reference_evaluator_test.py | 120 +- onnx/test/test_backend_onnxruntime.py | 29 + onnx/test/test_backend_reference.py | 6 + onnx/version_converter/convert.h | 57 +- 230 files changed, 3936 insertions(+), 390 deletions(-) create mode 100644 docs/docsgen/source/technical/int4.md create mode 100644 
onnx/backend/test/data/node/test_cast_FLOAT16_to_INT4/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT16_to_INT4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT16_to_INT4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT16_to_UINT4/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT16_to_UINT4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT16_to_UINT4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_FLOAT16/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_FLOAT16/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_FLOAT16/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_INT8/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/output_0.pb create mode 100644 
onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/model.onnx create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_int4/model.onnx create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/input_1.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/input_2.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_uint4/model.onnx create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_1.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_2.pb create mode 100644 onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_int4/model.onnx create mode 100644 onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_0.pb 
create mode 100644 onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_1.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_2.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/output_0.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_uint4/model.onnx create mode 100644 onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_0.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_1.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_2.pb create mode 100644 onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/output_0.pb create mode 100644 onnx/subbyte.py diff --git a/docs/Changelog.md b/docs/Changelog.md index f10906355d1..64fcdc101fa 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -24534,15 +24534,1080 @@ This version of the operator has been available since version 20 of the default ## Version 21 of the default ONNX operator set +### **Cast-21** + + The operator casts the elements of a given input tensor to a data type + specified by the 'to' argument and returns an output tensor of the same size in + the converted type. The 'to' argument must be one of the data types specified + in the 'DataType' enum field in the TensorProto message. + + Casting from string tensor in plain (e.g., "3.14" and "1000") and scientific numeric representations + (e.g., "1e-5" and "1E8") to float types is supported. For example, converting string "100.5" to an integer may + yield result 100. There are some string literals reserved for special floating-point values; + "+INF" (and "INF"), "-INF", and "NaN" are positive infinity, negative infinity, and not-a-number, respectively. + Any string which can exactly match "+INF" in a case-insensitive way would be mapped to positive infinite. 
Similarly, + this case-insensitive rule is applied to "INF" and "NaN". When casting from numeric tensors + to string tensors, plain floating-point representation (such as "314.15926") would be used. + Converting non-numerical-literal string such as "Hello World!" is an undefined behavior. Cases + of converting string representing floating-point arithmetic value, such as "2.718", to INT is an undefined behavior. + + Conversion from a numerical type to any numerical type is always allowed. + User must be aware of precision loss and value change caused by range difference between two types. + For example, a 64-bit float 3.1415926459 may be round to a 32-bit float 3.141592. Similarly, converting + an integer 36 to Boolean may produce 1 because we truncate bits which can't be stored in the targeted type. + + In more detail, the conversion among numerical types should follow these rules + if the destination type is not a float 8 type. + + * Casting from floating point to: + * floating point: +/- infinity if OOR (out of range). + * fixed point: undefined if OOR. + * bool: +/- 0.0 to False; all else to True. + * Casting from fixed point to: + * floating point: +/- infinity if OOR. (+ infinity in the case of uint) + * fixed point: when OOR, discard higher bits and reinterpret (with respect to two's complement representation for + signed types). For example, 200 (int16) -> -56 (int8). + * bool: zero to False; nonzero to True. + * Casting from bool to: + * floating point: `{1.0, 0.0}`. + * fixed point: `{1, 0}`. + * bool: no change. + + Float 8 type were introduced to speed up the training of + deep models. By default the conversion of a float *x* obeys + to the following rules. `[x]` means the value rounded to + the target mantissa width. 
+ + | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | + |------|----|----|----|----| + | 0 | 0 | 0 | 0 | 0 | + |-0 | -0 | 0 | -0 | 0 | + | NaN | NaN | NaN | NaN | NaN | + | +/- Inf | +/- FLT_MAX | NaN | FLT_MAX | NaN | + | [x] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | + | [x] < -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | + | else | RNE | RNE | RNE | RNE | + + The behavior changes if the parameter 'saturate' is set to False. + The rules then become: + + | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | + |------|----|----|----|----| + | 0 | 0 | 0 | 0 | 0 | + |-0 | -0 | 0 | -0 | 0 | + | NaN | NaN | NaN | NaN | NaN | + | +/- Inf | NaN | NaN | +/- Inf | NaN | + | [x] > FLT_MAX | NaN | NaN | Inf | NaN | + | [x] < -FLT_MAX | NaN | NaN | -Inf | NaN | + | else | RNE | RNE | RNE | RNE | + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
saturate : int (default is 1)
+
The parameter defines how the conversion behaves if an input value is out of range of the destination type. It only applies for float 8 conversion (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). It is true by default. All cases are fully described in two tables inserted in the operator description.
+
to : int (required)
+
The data type to which the elements of the input tensor are cast. Strictly must be one of the types from DataType enum in TensorProto
+
+ +#### Inputs + +
+
input (differentiable) : T1
+
Input tensor to be cast.
+
+ +#### Outputs + +
+
output (differentiable) : T2
+
Output tensor with the same shape as input with type specified by the 'to' argument
+
+ +#### Type Constraints + +
+
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input types. Casting from complex is not supported.
+
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain output types. Casting to complex is not supported.
+
+ +### **CastLike-21** + + The operator casts the elements of a given input tensor (the first input) to + the same data type as the elements of the second input tensor. + See documentation of the Cast operator for further details. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
saturate : int (default is 1)
+
The parameter defines how the conversion behaves if an input value is out of range of the destination type. It only applies for float 8 conversion (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). It is true by default. Please refer to operator Cast description for further details.
+
+ +#### Inputs + +
+
input (differentiable) : T1
+
Input tensor to be cast.
+
target_type (non-differentiable) : T2
+
The (first) input tensor will be cast to produce a tensor of the same type as this (second input) tensor.
+
+ +#### Outputs + +
+
output (differentiable) : T2
+
Output tensor produced by casting the first input tensor to have the same type as the second input tensor.
+
+ +#### Type Constraints + +
+
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input types. Casting from complex is not supported.
+
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain output types. Casting to complex is not supported.
+
+ +### **Constant-21** + + This operator produces a constant tensor. Exactly one of the provided attributes, either value, sparse_value, + or value_* must be specified. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
sparse_value : sparse_tensor
+
The value for the elements of the output tensor in sparse format.
+
value : tensor
+
The value for the elements of the output tensor.
+
value_float : float
+
The value for the sole element for the scalar, float32, output tensor.
+
value_floats : list of floats
+
The values for the elements for the 1D, float32, output tensor.
+
value_int : int
+
The value for the sole element for the scalar, int64, output tensor.
+
value_ints : list of ints
+
The values for the elements for the 1D, int64, output tensor.
+
value_string : string
+
The value for the sole element for the scalar, UTF-8 string, output tensor.
+
value_strings : list of strings
+
The values for the elements for the 1D, UTF-8 string, output tensor.
+
+ +#### Inputs + + +#### Outputs + +
+
output : T
+
Output tensor containing the same value of the provided tensor.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types.
+
+ +### **ConstantOfShape-21** + + Generate a tensor with given value and shape. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
value : tensor
+
(Optional) The value of the output elements.Should be a one-element tensor. If not specified, it defaults to a tensor of value 0 and datatype float32
+
+ +#### Inputs + +
+
input : T1
+
1D tensor. The shape of the expected output tensor. If empty tensor is given, the output would be a scalar. All values must be >= 0.
+
+ +#### Outputs + +
+
output : T2
+
Output tensor of shape specified by 'input'.If attribute 'value' is specified, the value and datatype of the output tensor is taken from 'value'.If attribute 'value' is not specified, the value in the output defaults to 0, and the datatype defaults to float32.
+
+ +#### Type Constraints + +
+
T1 : tensor(int64)
+
Constrain input types.
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain output types to be numerics.
+
+ ### **DequantizeLinear-21** - The linear dequantization operator. It consumes a quantized tensor, a scale, and a zero point to compute the full precision tensor. - The dequantization formula is `y = (x - x_zero_point) * x_scale`. `x_scale` and `x_zero_point` must have same shape, and can be either a scalar - for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization. - `x_zero_point` and `x` must have same type. `x` and `y` must have same shape. In the case of dequantizing int32, - there's no zero point (zero point is supposed to be 0). - `zero-point` is usually not used in the case of float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz quantization, - but the dequantization formula remains the same for consistency and 'x_scale' still determines the output type. + The linear dequantization operator. It consumes a quantized tensor, a scale, and a zero point to compute the full precision tensor. + The dequantization formula is `y = (x - x_zero_point) * x_scale`. `x_scale` and `x_zero_point` must have same shape, and can be either a scalar + for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization. + `x_zero_point` and `x` must have same type. `x` and `y` must have same shape. In the case of dequantizing int32, + there's no zero point (zero point is supposed to be 0). + `zero-point` is usually not used in the case of float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz quantization, + but the dequantization formula remains the same for consistency and 'x_scale' still determines the output type. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
axis : int (default is 1)
+
(Optional) The axis of the dequantizing dimension of the input tensor. Ignored for per-tensor quantization. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+
+ +#### Inputs (2 - 3) + +
+
x : T1
+
N-D quantized input tensor to be de-quantized.
+
x_scale : T2
+
Scale for input 'x'. It can be a scalar, which means a per-tensor/layer dequantization, or a 1-D tensor for per-axis dequantization.
+
x_zero_point (optional) : T1
+
Zero point for input 'x'. Shape must match x_scale. It's optional. Zero point is 0 when it's not specified.
+
+ +#### Outputs + +
+
y : T2
+
N-D full precision output tensor. It has same shape as input 'x'.
+
+ +#### Type Constraints + +
+
T1 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int32), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
The type of the inputs 'x_zero_point' and 'x'.
+
T2 : tensor(float), tensor(float16), tensor(bfloat16)
+
'x_scale' determines the output type.
+
+ +### **Flatten-21** + + Flattens the input tensor into a 2D matrix. If input tensor has shape + (d_0, d_1, ... d_n) then the output will have shape + (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn). + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
axis : int (default is 1)
+
Indicate up to which input dimensions (exclusive) should be flattened to the outer dimension of the output. The value for axis must be in the range [-r, r], where r is the rank of the input tensor. Negative value means counting dimensions from the back. When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input tensor is (d_0, d_1, ... d_n).
+
+ +#### Inputs + +
+
input (differentiable) : T
+
A tensor of rank >= axis.
+
+ +#### Outputs + +
+
output (differentiable) : T
+
A 2D tensor with the contents of the input tensor, with input dimensions up to axis flattened to the outer dimension of the output and remaining input dimensions flattened into the inner dimension of the output.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output to all tensor types up to IRv10.
+
+ +### **Identity-21** + + Identity operator + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Inputs + +
+
input (differentiable) : V
+
Input tensor
+
+ +#### Outputs + +
+
output (differentiable) : V
+
Tensor to copy input into.
+
+ +#### Type Constraints + +
+
V : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4), seq(tensor(uint8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(int8)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(float16)), seq(tensor(float)), seq(tensor(double)), seq(tensor(string)), seq(tensor(bool)), seq(tensor(complex64)), seq(tensor(complex128)), optional(seq(tensor(uint8))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(int8))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(float16))), optional(seq(tensor(float))), optional(seq(tensor(double))), optional(seq(tensor(string))), optional(seq(tensor(bool))), optional(seq(tensor(complex64))), optional(seq(tensor(complex128))), optional(tensor(uint8)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(int8)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(float16)), optional(tensor(float)), optional(tensor(double)), optional(tensor(string)), optional(tensor(bool)), optional(tensor(complex64)), optional(tensor(complex128))
+
Constrain input and output types to all tensor, sequence, and optional types.
+
+ +### **If-21** + + If conditional + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
else_branch : graph (required)
+
Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must match the number of outputs in the then_branch.
+
then_branch : graph (required)
+
Graph to run if condition is true. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must match the number of outputs in the else_branch.
+
+ +#### Inputs + +
+
cond : B
+
Condition for the if. The tensor must contain a single element.
+
+ +#### Outputs (1 - ∞) + +
+
outputs (variadic, heterogeneous) : V
+
Values that are live-out to the enclosing scope. The return values in the `then_branch` and `else_branch` must be of the same data type. The `then_branch` and `else_branch` may produce tensors with the same element type and different shapes. If corresponding outputs from the then-branch and the else-branch have static shapes S1 and S2, then the shape of the corresponding output variable of the if-node (if present) must be compatible with both S1 and S2 as it represents the union of both possible shapes. For example, if in a model file, the first output of `then_branch` is typed float tensor with shape [2] and the first output of `else_branch` is another float tensor with shape [3], If's first output should have (a) no shape set, or (b) a shape of rank 1 with neither `dim_value` nor `dim_param` set, or (c) a shape of rank 1 with a unique `dim_param`. In contrast, the first output cannot have the shape [2] since [2] and [3] are not compatible.
+
+ +#### Type Constraints + +
+
V : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4), seq(tensor(uint8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(int8)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(bfloat16)), seq(tensor(float16)), seq(tensor(float)), seq(tensor(double)), seq(tensor(string)), seq(tensor(bool)), seq(tensor(complex64)), seq(tensor(complex128)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(uint4)), seq(tensor(int4)), optional(seq(tensor(uint8))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(int8))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(bfloat16))), optional(seq(tensor(float16))), optional(seq(tensor(float))), optional(seq(tensor(double))), optional(seq(tensor(string))), optional(seq(tensor(bool))), optional(seq(tensor(complex64))), optional(seq(tensor(complex128))), optional(tensor(uint8)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(int8)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(bfloat16)), optional(tensor(float16)), optional(tensor(float)), optional(tensor(double)), optional(tensor(string)), optional(tensor(bool)), optional(tensor(complex64)), optional(tensor(complex128)), optional(tensor(float8e4m3fn)), optional(tensor(float8e4m3fnuz)), optional(tensor(float8e5m2)), optional(tensor(float8e5m2fnuz)), optional(tensor(uint4)), optional(tensor(int4))
+
All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv10.
+
B : tensor(bool)
+
Only bool
+
+ +### **Loop-21** + + Generic Looping construct. This loop has multiple termination conditions: + + 1) Trip count. Iteration count specified at runtime. Set by + specifying the input M. Optional. Set to empty string to omit. + Note that a static trip count (specified at graph construction time) can be + specified by passing in a constant node for input M. + 2) Loop termination condition. This is an input to the op that determines + whether to run the first iteration and also a loop-carried dependency for + the body graph. The body graph must yield a value for the condition variable, + whether this input is provided or not. + + This table summarizes the operating modes of this operator with equivalent + C-style code: + + Operator inputs defined as (max_trip_count, condition_var). + + * input ("", ""): + for (int i=0; ; ++i) { + cond = ... // Note this value is ignored, but is required in the body + } + + * input ("", cond) // Note this is analogous to a while loop + bool cond = ...; + for (int i=0; cond; ++i) { + cond = ...; + } + + * input ("", 1) // Note this is analogous to a do-while loop + bool cond = true + for (int i=0; cond; ++i) { + cond = ...; + } + + * input (trip_count, "") // Note this is analogous to a for loop + int trip_count = ... 
+ for (int i=0; i < trip_count; ++i) { + cond = ...; // ignored + } + + * input (trip_count, cond) + int trip_count = ...; + bool cond = ...; + for (int i=0; i < trip_count && cond; ++i) { + cond = ...; + } + + + *Sample usage - cond as well as trip count* + + graph predict-net { + %a = Constant[value = ]() + %b = Constant[value = ]() + %keepgoing = Constant[value = ]() + %max_trip_count = Constant[value = ]() + %keepgoing_out, %b_out, %user_defined_vals = Loop[body = ](%max_trip_count, %keepgoing, %b) + return + } + + graph body-net ( + %i[INT32, scalar] // iteration number + %keepgoing_in[BOOL, scalar] // incoming loop-termination-condition; not used + %b_in[INT32, scalar] // incoming value of loop-carried-dependency b + ) { + %my_local = Add(%a, %b_in) + %b_out = Sub(%a, %b_in) // outgoing value of loop-carried-dependency b + %keepgoing_out = Greater(%my_local, %b_out) // outgoing loop-termination-condition + %user_defined_val = Add(%b_in, %b_in) // scan-output value to be accumulated + return %keepgoing_out, %b_out, %user_defined_val + } + + *Sample equivalent C code* + + { + /* User-defined code (enclosing scope) */ + int a = 3, b = 6; + bool keepgoing = true; // Analogous to input cond + /* End user-defined code */ + + /* Implicitly-defined code */ + const int max_trip_count = 10; // Analogous to input M + int user_defined_vals[]; // Imagine this is resizable + /* End implicitly-defined code */ + /* initialize loop-carried variables and scan-output variables */ + bool keepgoing_out = keepgoing + int b_out = b + + for (int i=0; i < max_trip_count && keepgoing_out; ++i) { + /* Implicitly-defined code: bind actual parameter values + to formal parameter variables of loop-body */ + bool keepgoing_in = keepgoing_out; + bool b_in = b_out; + + /* User-defined code (loop body) */ + int my_local = a + b_in; // Reading value "a" from the enclosing scope is fine + b_out = a - b_in; + keepgoing_out = my_local > b_out; + user_defined_val = b_in + b_in; // b_in and b_out 
are different variables + /* End user-defined code */ + + /* Implicitly defined-code */ + user_defined_vals[i] = user_defined_val // accumulate scan-output values + } + // int t = my_local; // Can't do this. my_local is not accessible here. + + // The values below are bound to the output variables of the loop and therefore accessible + // b_out; user_defined_vals; keepgoing_out; + } + + There are several things of note in this code snippet: + + 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can + be referenced in the inputs of the loop. + 2) Any values computed in the loop body that needs to be used in a subsequent + iteration or after the loop are modelled using a pair of variables in the loop-body, + consisting of an input variable (eg., b_in) and an output variable (eg., b_out). + These are referred to as loop-carried dependences. The loop operation node + supplies the input value of the input variable for the first iteration, and + returns the output value of the output variable produced by the final + iteration. + 3) Scan_output variables are used to implicitly concatenate values computed across + all the iterations. In the above example, the value of user_defined_val computed + over all iterations are concatenated and returned as the value of user_defined_vals + after the loop. + 4) Values created in the body cannot be accessed in the enclosing scope, + except using the mechanism described above. + + Note that the semantics of this op support "diagonal" or "wavefront" execution. + (See Step 3 here for an example: + https://devblogs.nvidia.com/optimizing-recurrent-neural-networks-cudnn-5/). + Frontends should emit multi-layer RNNs as a series of While operators (with + time being the inner looping dimension), with each successive layer consuming + the scan_outputs from the previous layer, possibly going through several + point-wise operators (e.g. dropout, residual connections, linear layer). 
+ + The input/output of subgraph (produced by loop node) matching is based on order instead of name. The implementation will figure out the names based on this order. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
body : graph (required)
+
The graph run each iteration. It has 2+N inputs: (iteration_num, condition, loop carried dependencies...). It has 1+N+K outputs: (condition, loop carried dependencies..., scan_outputs...). Each scan_output is created by concatenating the value of the specified output value at the end of each iteration of the loop. It is an error if the dimensions or data type of these scan_outputs change across loop iterations.
+
+ +#### Inputs (2 - ∞) + +
+
M (optional) : I
+
A maximum trip-count for the loop specified at runtime. Optional. Pass empty string to skip.
+
cond (optional) : B
+
A boolean termination condition. Optional. Pass empty string to skip.
+
v_initial (variadic, heterogeneous) : V
+
The initial values of any loop-carried dependencies (values that change across loop iterations)
+
+ +#### Outputs (1 - ∞) + +
+
v_final_and_scan_outputs (variadic, heterogeneous) : V
+
Final N loop carried dependency values then K scan_outputs. Scan outputs must be Tensors.
+
+ +#### Type Constraints + +
+
V : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4), seq(tensor(uint8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(int8)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(bfloat16)), seq(tensor(float16)), seq(tensor(float)), seq(tensor(double)), seq(tensor(string)), seq(tensor(bool)), seq(tensor(complex64)), seq(tensor(complex128)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(uint4)), seq(tensor(int4)), optional(seq(tensor(uint8))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(int8))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(bfloat16))), optional(seq(tensor(float16))), optional(seq(tensor(float))), optional(seq(tensor(double))), optional(seq(tensor(string))), optional(seq(tensor(bool))), optional(seq(tensor(complex64))), optional(seq(tensor(complex128))), optional(tensor(uint8)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(int8)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(bfloat16)), optional(tensor(float16)), optional(tensor(float)), optional(tensor(double)), optional(tensor(string)), optional(tensor(bool)), optional(tensor(complex64)), optional(tensor(complex128)), optional(tensor(float8e4m3fn)), optional(tensor(float8e4m3fnuz)), optional(tensor(float8e5m2)), optional(tensor(float8e5m2fnuz)), optional(tensor(uint4)), optional(tensor(int4))
+
All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv10.
+
I : tensor(int64)
+
tensor of int64, which should be a scalar.
+
B : tensor(bool)
+
tensor of bool, which should be a scalar.
+
+ +### **Pad-21** + + Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`, + a padded tensor (`output`) is generated. + + The three supported `modes` are (similar to corresponding modes supported by `numpy.pad`): + + 1) `constant`(default) - pads with a given constant value as specified by `constant_value` (which defaults to 0, empty string, or False) + + 2) `reflect` - pads with the reflection of the vector mirrored on the first and last values of the vector along each axis + + 3) `edge` - pads with the edge values of array + + 4) `wrap` - wrap-around padding as if the data tensor forms a torus + + + Example 1 (`constant` mode): + + Insert 0 pads to the beginning of the second dimension. + + ``` + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [0, 2, 0, 0] + + mode = 'constant' + + constant_value = 0.0 + + output = [ + [0.0, 0.0, 1.0, 1.2], + [0.0, 0.0, 2.3, 3.4], + [0.0, 0.0, 4.5, 5.7], + ] + ``` + + Example 2 (`reflect` mode): + + ``` + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [0, 2, 0, 0] + + mode = 'reflect' + + output = [ + [1.0, 1.2, 1.0, 1.2], + [2.3, 3.4, 2.3, 3.4], + [4.5, 5.7, 4.5, 5.7], + ] + ``` + + Example 3 (`edge` mode): + + ``` + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [0, 2, 0, 0] + + mode = 'edge' + + output = [ + [1.0, 1.0, 1.0, 1.2], + [2.3, 2.3, 2.3, 3.4], + [4.5, 4.5, 4.5, 5.7], + ] + ``` + + Example 4 (`wrap` mode): + + ``` + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [2, 1, 1, 1] + + mode = 'wrap' + + output = [ + [3.4, 2.3, 3.4, 2.3], + [5.7, 4.5, 5.7, 4.5], + [1.2, 1.0, 1.2, 1.0], + [3.4, 2.3, 3.4, 2.3], + [5.7, 4.5, 5.7, 4.5], + [1.2, 1.0, 1.2, 1.0], + ] + ``` + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
mode : string (default is constant)
+
Supported modes: `constant`(default), `reflect`, `edge`, `wrap`
+
+ +#### Inputs (2 - 4) + +
+
data (differentiable) : T
+
Input tensor.
+
pads (non-differentiable) : tensor(int64)
+
Tensor of integers indicating the number of padding elements to add or remove (if negative) at the beginning and end of each axis. For 2D input tensor, it is the number of pixels. `pads` should be a 1D tensor of shape [2 * num_axes] where `num_axes` refers to the number of elements in the `axes` input or the input rank if `axes` are not provided explicitly. `pads` format should be: [x1_begin, x2_begin, ..., x1_end, x2_end,...], where xi_begin is the number of pad values added at the beginning of axis `axes[i]` and xi_end, the number of pad values added at the end of axis `axes[i]`.
+
constant_value (optional, non-differentiable) : T
+
(Optional) A scalar value to be used if the mode chosen is `constant` (by default it is 0, empty string or False).
+
axes (optional, non-differentiable) : Tind
+
1-D tensor of axes that `pads` apply to. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data). Behavior is undefined if an axis is repeated. If not provided, all axes are assumed (`[0, 1, ..., input_rank-1]`).
+
+ +#### Outputs + +
+
output (differentiable) : T
+
Tensor after padding.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types up to IRv10.
+
Tind : tensor(int32), tensor(int64)
+
Constrain indices to integer types
+
+ +### **QLinearMatMul-21** + + Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html. + It consumes two quantized input tensors, their scales and zero points, scale and zero point of output, + and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point). + For (x / y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details. + Scale and zero point must have same shape. They must be either scalar (per tensor) or N-D tensor + (per row for 'a' and per column for 'b'). Scalar refers to per tensor quantization whereas N-D refers to per row + or per column quantization. If the input is 2D of shape [M, K] then zero point and scale tensor may be + an M element vector [v_1, v_2, ..., v_M] for per row quantization and K element vector of shape [v_1, v_2, ..., v_K] + for per column quantization. If the input is N-D tensor with shape [D1, D2, M, K] then zero point and scale tensor may + have shape [D1, D2, M, 1] for per row quantization and shape [D1, D2, 1, K] for per column quantization. + Production must never overflow, and accumulation may overflow if and only if in 32 bits. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Inputs + +
+
a (non-differentiable) : T1
+
N-dimensional quantized matrix a
+
a_scale (non-differentiable) : TS
+
scale of quantized input a
+
a_zero_point (non-differentiable) : T1
+
zero point of quantized input a
+
b (non-differentiable) : T2
+
N-dimensional quantized matrix b
+
b_scale (non-differentiable) : TS
+
scale of quantized input b
+
b_zero_point (non-differentiable) : T2
+
zero point of quantized input b
+
y_scale (non-differentiable) : TS
+
scale of quantized output y
+
y_zero_point (non-differentiable) : T3
+
zero point of quantized output y
+
+ +#### Outputs + +
+
y (non-differentiable) : T3
+
Quantized matrix multiply results from a * b
+
+ +#### Type Constraints + +
+
TS : tensor(float), tensor(float16), tensor(bfloat16)
+
Constrain scales.
+
T1 : tensor(int8), tensor(uint8), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
The type of input a and its zeropoint.
+
T2 : tensor(int8), tensor(uint8), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
The type of input b and its zeropoint.
+
T3 : tensor(int8), tensor(uint8), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
The type of the output and its zeropoint.
+
+ +### **QuantizeLinear-21** + + The linear quantization operator. It consumes a high precision tensor, a scale, and a zero point to compute the low precision / quantized tensor. + The scale factor and zero point must have same shape, and can be either a scalar for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization. + The quantization formula is `y = saturate ((x / y_scale) + y_zero_point)`. + For saturation, it saturates according to: + uint8: [0, 255], int8: [-128, 127], uint16: [0, 65535], int16: [-32768, 32767], uint4: [0, 15], int4: [-8, 7] + For (x / y_scale), it's rounding to the nearest even. Refer to https://en.wikipedia.org/wiki/Rounding for details. + 'y_zero_point' and 'y' must have same type. + 'y_zero_point' is usually not used for quantization to float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + but the quantization formula remains the same for consistency and + the type of the attribute 'y_zero_point' still determines the quantization type. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
axis : int (default is 1)
+
(Optional) The axis of the quantization dimension of the input tensor. Ignored for per-tensor quantization. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+
saturate : int (default is 1)
+
The parameter defines how the conversion behaves if an input value is out of range of the destination type. It only applies for float 8 quantization (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). It is true by default. All cases are fully described in two tables inserted in the operator description.
+
+ +#### Inputs (2 - 3) + +
+
x : T1
+
N-D full precision Input tensor to be quantized.
+
y_scale : T1
+
Scale for doing quantization to get 'y'. It can be a scalar, which means per-tensor/layer quantization, or a 1-D Tensor for per-axis quantization.
+
y_zero_point (optional) : T2
+
Zero point for doing quantization to get 'y'. Shape must match y_scale. Default is uint8 with zero point of 0 if it's not specified.
+
+ +#### Outputs + +
+
y : T2
+
N-D quantized output tensor. It has same shape as input 'x'.
+
+ +#### Type Constraints + +
+
T1 : tensor(float), tensor(float16), tensor(bfloat16), tensor(int32)
+
The type of the input 'x'.
+
T2 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
The type of the input 'y_zero_point' and the output 'y'.
+
+ +### **Reshape-21** + + Reshape the input tensor similar to numpy.reshape. + First input is the data tensor, second input is a shape tensor which specifies the output shape. It outputs the reshaped tensor. + At most one dimension of the new shape can be -1. In this case, the value is + inferred from the size of the tensor and the remaining dimensions. A dimension + could also be 0, in which case the actual dimension value is unchanged (i.e. taken + from the input tensor). If 'allowzero' is set, and the new shape includes 0, the + dimension will be set explicitly to zero (i.e. not taken from input tensor). + Shape (second input) could be an empty shape, which means converting to a scalar. + The input tensor's shape and the output tensor's shape are required to have the same number of elements. + + If the attribute 'allowzero' is set, it is invalid for the specified shape to + contain both a zero value and -1, as the value of the dimension corresponding + to -1 cannot be determined uniquely. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
allowzero : int (default is 0)
+
(Optional) By default, when any value in the 'shape' input is equal to zero the corresponding dimension value is copied from the input tensor dynamically. allowzero=1 indicates that if any value in the 'shape' input is set to zero, the zero value is honored, similar to NumPy.
+
+ +#### Inputs + +
+
data (differentiable) : T
+
An input tensor.
+
shape (non-differentiable) : tensor(int64)
+
Specified shape for output.
+
+ +#### Outputs + +
+
reshaped (differentiable) : T
+
Reshaped data.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types.
+
+ +### **Scan-21** + + Scan can be used to iterate over one or more scan_input tensors, + constructing zero or more scan_output tensors. It combines ideas from general recurrences, + functional programming constructs such as scan, fold, map, and zip, and is intended to enable + generalizations of RNN-like constructs for sequence-to-sequence processing. + Other tensors (referred to as state_variables here) can be used to carry a state + when iterating from one element to another (similar to hidden-state in RNNs, also referred + to as loop-carried dependences in the context of loops). + Many common usages involve a single scan_input tensor (where functionality + similar to scan, fold and map can be obtained). When more than one scan_input is used, + a behavior similar to zip is obtained. + + The attribute body must be a graph, specifying the computation to be performed in + every iteration. It takes as input the current values of the state_variables and + the current iterated element of the scan_inputs. It must return the (updated) values + of the state_variables and zero or more scan_output_element tensors. The values of the + scan_output_element tensors are concatenated over all the iterations to produce the + scan_output values of the scan construct (similar to the concatenated intermediate + hidden-state values of RNN-like constructs). All the output tensors (state_variables as + well as scan_output_element tensors) are required to have the same shape in each iteration + of the loop (a restriction imposed to enable efficient memory allocation). + + Note that the iterated element passed to the body subgraph does not have a sequence + axis. It will have a rank one less than the rank of the corresponding scan_input. + + The scan operation returns the final values of the state_variables as well as the + scan_outputs. + + The optional attribute scan_input_directions specifies the direction (forward or backward) + for each scan input. 
If this attribute is omitted, all sequences are scanned in the forward + direction. A bidirectional scan may be performed by specifying the same tensor input twice + in the scan_inputs, once with a forward direction, and once with a backward direction. + + The scan_output of the operation is produced by concatenating the scan_output_element + values produced by the body in each iteration. The optional attribute scan_output_directions + specifies the direction in which scan_output is constructed (by appending or prepending the + scan_output_element to scan_output in each iteration) for each scan_output. If this attribute + is omitted, the scan_output_element is appended to the scan_output in each iteration. + + The optional attribute scan_input_axes specifies the axis to be scanned for each scan_input. + If omitted, every scan_input will be scanned in axis 0. For example, if axis 0 is the + batch axis and axis 1 is the time axis (to be scanned), specify an axis value of 1. + Note that scanning a non-zero axis may be less efficient than scanning axis zero. + + The optional attribute scan_output_axes specifies the axis along which the scan_outputs + are accumulated for each scan_output. For example, if axis 1 is the time axis (to be + scanned) for both inputs and outputs, specify a scan_input axis and scan_output axis + value of 1. + + Note that because of the ONNX restriction that only the last parameter of an operator can + be variadic, the initial-states and scan-inputs are listed together as one input parameter. + Similarly, the final-states and scan-outputs are listed together as one output parameter. + The attribute num_scan_inputs indicates the number M of scan-inputs. 
+ + The behavior of + + Scan < + num_scan_inputs = m, + body = loop-body, + scan_input_axes = [axis_1, ..., axis_m] + > (init_1, ..., init_n, scan_1, ..., scan_m) + + is equivalent to the following pseudo-code: + + // scan_i.shape[axis_i] denotes the (max) sequence-length of scan_i + // scan_i.shape[axis_i] is required to be equal to scan_j.shape[axis_j] for all i,j. + sequence_length = scan_1.shape[axis_1]; + + // initialize state-variables + st_1 = init_1; ... st_n = init_n; + // initialize scan-output variables: [] denotes an empty tensor + scan_out_1 = []; ...; scan_out_k = []; + // identify number of iterations: + + // execute loop + for (int t = 0; t < sequence_length; ++t) { + // generate the scan-input elements: the notation T[t] indicates the sub-tensor + // of rank one less than T obtained by indexing T at position t along axis k. + si_1 = scan_1[t]; + ... ; + si_m = scan_m[t]; + // execute loop-body + st_1, ..., st_n, so_1, ..., so_k = loop-body(st_1, ..., st_n, si_1, ..., si_m) + // accumulate the scan-output elements + scan_out_1 = Concat(scan_out_1, so_1); ... ; scan_out_k = Concat(scan_out_k, so_k); + } + + return st_1, ..., st_n, scan_out_1, ..., scan_out_k; + + *Sample usage: Encoding RNN using a Scan* + + The following example shows how a simple RNN over an input tensor %X, with weight tensor %Wi, + recurrence weight tensor %Ri, bias tensors %Wbi and %Rbi, and initial hidden-state %H_0 can + be encoded as a ScanLoop. Note that the loop-body is a nested graph, and it directly computes + %Wi, %Ri, %Wbi, and %Rbi (typically constants or initializers in the body graph). If these + values are computed in the outer graph, they need to be passed in as extra state_variables. + + graph rnn-encoding { + %H_0 = ... + %X = ... + %Y_h, %Y = Scan[body = , num_scan_inputs=1](%H_0, %X) + return %Y, %Y_h + } + + graph rnn-cell-1 ( + %H_tminus1[FLOAT, tensor] + %X_t[FLOAT, tensor] + ) { + %Wi = ... + %Ri = ... + %Wbi = ... + %Rbi = ... 
+ %t1 = X_t * (Wi^T) + %t2 = H_tminus1*(Ri^T) + %t3 = Add(%t1, %t2) + %t4 = Add(%t3, %Wbi) + %t5 = Add(%t4, %Rbi) + %Ht = Tanh(%t5) + %Accumulate = Identity(%Ht) + return %Ht, %Accumulate + } + + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Attributes + +
+
body : graph (required)
+
The graph run each iteration. It has N+M inputs: (loop state variables..., scan_input_elts...). It has N+K outputs: (loop state variables..., scan_output_elts...). Each scan_output is created by concatenating the value of the specified scan_output_elt value at the end of each iteration of the loop. It is an error if the dimensions of these values change across loop iterations.
+
num_scan_inputs : int (required)
+
An attribute specifying the number of scan_inputs M.
+
scan_input_axes : list of ints
+
An optional list of M flags. The i-th element of the list specifies the axis to be scanned (the sequence axis) for the i-th scan_input. If omitted, 0 will be used as the scan axis for every scan_input. Negative value for an axis means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+
scan_input_directions : list of ints
+
An optional list of M flags. The i-th element of the list specifies the direction to be scanned for the i-th scan_input tensor: 0 indicates forward direction and 1 indicates reverse direction. If omitted, all scan_input tensors will be scanned in the forward direction.
+
scan_output_axes : list of ints
+
An optional list of K flags. The i-th element of the list specifies the axis for the i-th scan_output. The scan outputs are accumulated along the specified axis. If omitted, 0 will be used as the scan axis for every scan_output. Negative value for an axis means counting dimensions from the back. Accepted range is [-r, r-1].
+
scan_output_directions : list of ints
+
An optional list of K flags, one for each scan_output. The i-th element of the list specifies whether the i-th scan_output should be constructed by appending or prepending a new value in each iteration: 0 indicates appending and 1 indicates prepending. If omitted, all scan_output tensors will be produced by appending a value in each iteration.
+
+ +#### Inputs (1 - ∞) + +
+
initial_state_and_scan_inputs (variadic, heterogeneous) : V
+
Initial values of the loop's N state variables followed by M scan_inputs
+
+ +#### Outputs (1 - ∞) + +
+
final_state_and_scan_outputs (variadic, heterogeneous) : V
+
Final values of the loop's N state variables followed by K scan_outputs
+
+ +#### Type Constraints + +
+
V : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
All Tensor types up to IRv10.
+
+ +### **Shape-21** + + Takes a tensor as input and outputs an 1D int64 tensor containing the shape of the input tensor. + Optional attributes start and end can be used to compute a slice of the input tensor's shape. + If start axis is omitted, the slice starts from axis 0. + The end axis, if specified, is exclusive (and the returned value will not include the size of that axis). + If the end axis is omitted, the axes upto the last one will be included. + Negative axes indicate counting back from the last axis. + Note that axes will be clamped to the range [0, r-1], where r is the + rank of the input tensor if they are out-of-range (after adding r in the case of + negative axis). Thus, specifying any end value > r is equivalent to specifying an end + value of r, and specifying any start value < -r is equivalent to specifying a start + value of 0. + + Examples: + + ``` + Input tensor with shape: [2, 3, 4] + No attributes specified. + Output: [2, 3, 4] + ``` + + ``` + Input tensor with shape: [2, 3, 4] + start: -1 + Output: [4] + ``` + + ``` + Input tensor with shape: [2, 3, 4] + end: -1 + Output: [2, 3] + ``` + + ``` + Input tensor with shape: [2, 3, 4] + start: 1 + end: 2 + Output: [3] + ``` #### Version @@ -24551,50 +25616,38 @@ This version of the operator has been available since version 21 of the default #### Attributes
-
axis : int (default is 1)
-
(Optional) The axis of the dequantizing dimension of the input tensor. Ignored for per-tensor quantization. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
+
end : int
+
(Optional) Ending axis for slicing the shape. Negative value means counting dimensions from the back. If omitted, sizes of all axes upto (including) the last one will be included.
+
start : int (default is 0)
+
(Optional) Starting axis for slicing the shape. Default value is 0.Negative value means counting dimensions from the back.
-#### Inputs (2 - 3) +#### Inputs
-
x : T1
-
N-D quantized input tensor to be de-quantized.
-
x_scale : T2
-
Scale for input 'x'. It can be a scalar, which means a per-tensor/layer dequantization, or a 1-D tensor for per-axis dequantization.
-
x_zero_point (optional) : T1
-
Zero point for input 'x'. Shape must match x_scale. It's optional. Zero point is 0 when it's not specified.
+
data (non-differentiable) : T
+
An input tensor.
#### Outputs
-
y : T2
-
N-D full precision output tensor. It has same shape as input 'x'.
+
shape (non-differentiable) : T1
+
Shape of the input tensor
#### Type Constraints
-
T1 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int32), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
The type of the inputs 'x_zero_point' and 'x'.
-
T2 : tensor(float), tensor(float16), tensor(bfloat16)
-
'x_scale' determines the output type.
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Input tensor can be of arbitrary type.
+
T1 : tensor(int64)
+
Constrain output to int64 tensor.
-### **QLinearMatMul-21** +### **Size-21** - Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html. - It consumes two quantized input tensors, their scales and zero points, scale and zero point of output, - and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point). - For (x / y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details. - Scale and zero point must have same shape. They must be either scalar (per tensor) or N-D tensor - (per row for 'a' and per column for 'b'). Scalar refers to per tensor quantization whereas N-D refers to per row - or per column quantization. If the input is 2D of shape [M, K] then zero point and scale tensor may be - an M element vector [v_1, v_2, ..., v_M] for per row quantization and K element vector of shape [v_1, v_2, ..., v_K] - for per column quantization. If the input is N-D tensor with shape [D1, D2, M, K] then zero point and scale tensor may - have shape [D1, D2, M, 1] for per row quantization and shape [D1, D2, 1, K] for per column quantization. - Production must never overflow, and accumulation may overflow if and only if in 32 bits. + Takes a tensor as input and outputs a int64 scalar that equals to the total number of elements of the input tensor. #### Version @@ -24603,55 +25656,65 @@ This version of the operator has been available since version 21 of the default #### Inputs
-
a (non-differentiable) : T1
-
N-dimensional quantized matrix a
-
a_scale (non-differentiable) : TS
-
scale of quantized input a
-
a_zero_point (non-differentiable) : T1
-
zero point of quantized input a
-
b (non-differentiable) : T2
-
N-dimensional quantized matrix b
-
b_scale (non-differentiable) : TS
-
scale of quantized input b
-
b_zero_point (non-differentiable) : T2
-
zero point of quantized input b
-
y_scale (non-differentiable) : TS
-
scale of quantized output y
-
y_zero_point (non-differentiable) : T3
-
zero point of quantized output y
+
data (non-differentiable) : T
+
An input tensor.
#### Outputs
-
y (non-differentiable) : T3
-
Quantized matrix multiply results from a * b
+
size (non-differentiable) : T1
+
Total number of elements of the input tensor
#### Type Constraints
-
TS : tensor(float), tensor(float16), tensor(bfloat16)
-
Constrain scales.
-
T1 : tensor(int8), tensor(uint8), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
The type of input a and its zeropoint.
-
T2 : tensor(int8), tensor(uint8), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
The type of input b and its zeropoint.
-
T3 : tensor(int8), tensor(uint8), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
The type of the output and its zeropoint.
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Input tensor can be of arbitrary type.
+
T1 : tensor(int64)
+
Constrain output to int64 tensor, which should be a scalar though.
-### **QuantizeLinear-21** +### **Squeeze-21** - The linear quantization operator. It consumes a high precision tensor, a scale, and a zero point to compute the low precision / quantized tensor. - The scale factor and zero point must have same shape, and can be either a scalar for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization. - The quantization formula is `y = saturate ((x / y_scale) + y_zero_point)`. - For saturation, it saturates to [0, 255] if it's uint8, [-128, 127] if it's int8, [0, 65535] if it's uint16, or [-32768, 32767] if it's int16. - For (x / y_scale), it's rounding to the nearest even. Refer to https://en.wikipedia.org/wiki/Rounding for details. - 'y_zero_point' and 'y' must have same type. - 'y_zero_point' is usually not used for quantization to float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz, - but the quantization formula remains the same for consistency and - the type of the attribute 'y_zero_point' still determines the quantization type. + Remove single-dimensional entries from the shape of a tensor. + Takes an input `axes` with a list of axes to squeeze. + If `axes` is not provided, all the single dimensions will be removed from + the shape. If an axis is selected with shape entry not equal to one, an error is raised. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Inputs (1 - 2) + +
+
data (differentiable) : T
+
Tensors with at least max(dims) dimensions.
+
axes (optional, non-differentiable) : tensor(int64)
+
List of integers indicating the dimensions to squeeze. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data).
+
+ +#### Outputs + +
+
squeezed (differentiable) : T
+
Reshaped tensor with same data as input.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types up to IRv10.
+
+ +### **Transpose-21** + + Transpose the input tensor similar to numpy.transpose. For example, when + perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape + will be (2, 1, 3). #### Version @@ -24660,37 +25723,69 @@ This version of the operator has been available since version 21 of the default #### Attributes
-
axis : int (default is 1)
-
(Optional) The axis of the quantization dimension of the input tensor. Ignored for per-tensor quantization. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).
-
saturate : int (default is 1)
-
The parameter defines how the conversion behaves if an input value is out of range of the destination type. It only applies for float 8 quantization (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). It is true by default. All cases are fully described in two tables inserted in the operator description.
+
perm : list of ints
+
A list of integers. By default, reverse the dimensions, otherwise permute the axes according to the values given.
-#### Inputs (2 - 3) +#### Inputs
-
x : T1
-
N-D full precision Input tensor to be quantized.
-
y_scale : T1
-
Scale for doing quantization to get 'y'. It can be a scalar, which means per-tensor/layer quantization, or a 1-D Tensor for per-axis quantization.
-
y_zero_point (optional) : T2
-
Zero point for doing quantization to get 'y'. Shape must match y_scale. Default is uint8 with zero point of 0 if it's not specified.
+
data (differentiable) : T
+
An input tensor.
#### Outputs
-
y : T2
-
N-D quantized output tensor. It has same shape as input 'x'.
+
transposed (differentiable) : T
+
Transposed output.
#### Type Constraints
-
T1 : tensor(float), tensor(float16), tensor(bfloat16), tensor(int32)
-
The type of the input 'x'.
-
T2 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
-
The type of the input 'y_zero_point' and the output 'y'.
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types.
+
+ +### **Unsqueeze-21** + + Insert single-dimensional entries to the shape of an input tensor (`data`). + Takes one required input `axes` - which contains a list of dimension indices and this operator will insert a dimension of value `1` into the corresponding index of the output tensor (`expanded`). + + For example, given an input tensor (`data`) of shape [3, 4, 5], then + Unsqueeze(data, axes=[0, 4]) outputs a tensor (`expanded`) containing same data as `data` but with shape [1, 3, 4, 5, 1]. + + The input `axes` should not contain any duplicate entries. It is an error if it contains duplicates. + The rank of the output tensor (`output_rank`) is the rank of the input tensor (`data`) plus the number of values in `axes`. + Each value in `axes` should be within the (inclusive) range [-output_rank , output_rank - 1]. + The order of values in `axes` does not matter and can come in any order. + +#### Version + +This version of the operator has been available since version 21 of the default ONNX operator set. + +#### Inputs + +
+
data (differentiable) : T
+
Original tensor
+
axes (non-differentiable) : tensor(int64)
+
List of integers indicating the dimensions to be inserted. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(expanded).
+
+ +#### Outputs + +
+
expanded (differentiable) : T
+
Reshaped tensor with same data as input.
+
+ +#### Type Constraints + +
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types up to IRv10.
# ai.onnx.preview.training diff --git a/docs/IR.md b/docs/IR.md index 73072815927..3255f86549e 100644 --- a/docs/IR.md +++ b/docs/IR.md @@ -411,8 +411,8 @@ It is common to represent a tensor as a nested list. This generally works fine, |Group|Types|Description| |---|---|---| Floating Point Types|float16, float32, float64, bfloat16, float8e4m3fn, float8e5m2, float8e4m3fnuz, float8e5m2fnuz|Values adhering to the IEEE 754-2008 standard representation of floating-point data or defined in papers [FP8 Formats for Deep Learning](https://arxiv.org/abs/2209.05433) and [8-bit Numerical Formats for Deep Neural Networks](https://arxiv.org/abs/2206.02915) -Signed Integer Types|int8, int16, int32, int64|Signed integers are supported for 8-64 bit widths. -Unsigned Integer Types|uint8, uint16, uint32, uint64|Unsigned integers are supported for 8-64 bit widths. +Signed Integer Types|int4, int8, int16, int32, int64|Signed integers are supported for 4-64 bit widths. +Unsigned Integer Types|uint4, uint8, uint16, uint32, uint64|Unsigned integers are supported for 4-64 bit widths. Complex Types|complex64, complex128|A complex number with either 32- or 64-bit real and imaginary parts. Other|string|Strings represent textual data. All strings are encoded using UTF-8. Other|bool|Boolean values represent data with only two values, typically true and false. 
diff --git a/docs/Operators.md b/docs/Operators.md index 927d859c652..2910bc361bf 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -29,14 +29,14 @@ For an operator input/output's differentiability, it can be differentiable, |BitwiseNot|18| |BitwiseOr|18| |BitwiseXor|18| -|Cast|19, 13, 9, 6, 1| +|Cast|21, 19, 13, 9, 6, 1| |Ceil|13, 6, 1| |Col2Im|18| |Compress|11, 9| |Concat|13, 11, 4, 1| |ConcatFromSequence|11| -|Constant|19, 13, 12, 11, 9, 1| -|ConstantOfShape|20, 9| +|Constant|21, 19, 13, 12, 11, 9, 1| +|ConstantOfShape|21, 20, 9| |Conv|11, 1| |ConvInteger|10| |ConvTranspose|11, 1| @@ -56,7 +56,7 @@ For an operator input/output's differentiability, it can be differentiable, |Exp|13, 6, 1| |Expand|13, 8| |EyeLike|9| -|Flatten|13, 11, 9, 1| +|Flatten|21, 13, 11, 9, 1| |Floor|13, 6, 1| |GRU|14, 7, 3, 1| |Gather|13, 11, 1| @@ -69,8 +69,8 @@ For an operator input/output's differentiability, it can be differentiable, |Greater|13, 9, 7, 1| |GridSample|20, 16| |Hardmax|13, 11, 1| -|Identity|19, 16, 14, 13, 1| -|If|19, 16, 13, 11, 1| +|Identity|21, 19, 16, 14, 13, 1| +|If|21, 19, 16, 13, 11, 1| |ImageDecoder|20| |InstanceNormalization|6, 1| |IsInf|20, 10| @@ -79,7 +79,7 @@ For an operator input/output's differentiability, it can be differentiable, |LSTM|14, 7, 1| |Less|13, 9, 7, 1| |Log|13, 6, 1| -|Loop|19, 16, 13, 11, 1| +|Loop|21, 19, 16, 13, 11, 1| |LpNormalization|1| |LpPool|18, 11, 2, 1| |MatMul|13, 9, 1| @@ -103,7 +103,7 @@ For an operator input/output's differentiability, it can be differentiable, |OptionalGetElement|18, 15| |OptionalHasElement|18, 15| |Or|7, 1| -|Pad|19, 18, 13, 11, 2, 1| +|Pad|21, 19, 18, 13, 11, 2, 1| |Pow|15, 13, 12, 7, 1| |QLinearConv|10| |QLinearMatMul|21, 10| @@ -120,13 +120,13 @@ For an operator input/output's differentiability, it can be differentiable, |ReduceProd|18, 13, 11, 1| |ReduceSum|13, 11, 1| |RegexFullMatch|20| -|Reshape|19, 14, 13, 5, 1| +|Reshape|21, 19, 14, 13, 5, 1| |Resize|19, 18, 13, 11, 10| |ReverseSequence|10| 
|RoiAlign|16, 10| |Round|11| |STFT|17| -|Scan|19, 16, 11, 9, 8| +|Scan|21, 19, 16, 11, 9, 8| |Scatter (deprecated)|11, 9| |ScatterElements|18, 16, 13, 11| |ScatterND|18, 16, 13, 11| @@ -136,18 +136,18 @@ For an operator input/output's differentiability, it can be differentiable, |SequenceErase|11| |SequenceInsert|11| |SequenceLength|11| -|Shape|19, 15, 13, 1| +|Shape|21, 19, 15, 13, 1| |Sigmoid|13, 6, 1| |Sign|13, 9| |Sin|7| |Sinh|9| -|Size|19, 13, 1| +|Size|21, 19, 13, 1| |Slice|13, 11, 10, 1| |SpaceToDepth|13, 1| |Split|18, 13, 11, 2, 1| |SplitToSequence|11| |Sqrt|13, 6, 1| -|Squeeze|13, 11, 1| +|Squeeze|21, 13, 11, 1| |StringConcat|20| |StringNormalizer|10| |StringSplit|20| @@ -158,10 +158,10 @@ For an operator input/output's differentiability, it can be differentiable, |TfIdfVectorizer|9| |Tile|13, 6, 1| |TopK|11, 10, 1| -|Transpose|13, 1| +|Transpose|21, 13, 1| |Trilu|14| |Unique|11| -|Unsqueeze|13, 11, 1| +|Unsqueeze|21, 13, 11, 1| |Upsample (deprecated)|10, 9, 7| |Where|16, 9| |Xor|7, 1| @@ -169,7 +169,7 @@ For an operator input/output's differentiability, it can be differentiable, |AffineGrid|20|20| |Bernoulli|15|15| |BlackmanWindow|17|17| -|CastLike|19, 15|19| +|CastLike|21, 19, 15|21| |Celu|12|12| |CenterCropPad|18|18| |Clip|13, 12, 11, 6, 1|13| @@ -3375,9 +3375,9 @@ expect(node, inputs=[size], outputs=[y], name="test_blackmanwindow_symmetric") #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator: 1, 6, 9, 13 +Other versions of this operator: 1, 6, 9, 13, 19 #### Attributes @@ -3405,9 +3405,9 @@ Other versions of this operator: 1, 15 +Other versions of this operator: 15, 19 #### Attributes @@ -3803,9 +3873,9 @@ Other versions of this operator: 15 #### Type Constraints
-
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T1 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain input types. Casting from complex is not supported.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(string), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain output types. Casting to complex is not supported.
@@ -5288,9 +5358,9 @@ This version of the operator has been available since version 11 of the default #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator: 1, 9, 11, 12, 13 +Other versions of this operator: 1, 9, 11, 12, 13, 19 #### Attributes @@ -5326,7 +5396,7 @@ Other versions of this operator: 1, -
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain input and output types to all tensor types.
@@ -5362,9 +5432,9 @@ expect(node, inputs=[], outputs=[values], name="test_constant") #### Version -This version of the operator has been available since version 20 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator:
9 +Other versions of this operator: 9, 20 #### Attributes @@ -5392,7 +5462,7 @@ Other versions of this operator: 9
T1 : tensor(int64)
Constrain input types.
-
T2 : tensor(float16), tensor(float), tensor(double), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(bool), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T2 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain output types to be numerics.
@@ -7306,7 +7376,7 @@ Other versions of this operator: 10 -
T1 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int32), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T1 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(int32), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
The type of the inputs 'x_zero_point' and 'x'.
T2 : tensor(float), tensor(float16), tensor(bfloat16)
'x_scale' determines the output type.
@@ -7515,6 +7585,34 @@ expect( +
+int4 + +```python +node = onnx.helper.make_node( + "DequantizeLinear", + inputs=["x", "x_scale", "x_zero_point"], + outputs=["y"], + axis=0, +) + +# scalar zero point and scale +x = make_tensor("x", TensorProto.INT4, [5], [0, 1, 7, -4, -8]) +x_scale = np.float32(2) +x_zero_point = make_tensor("zero_point", TensorProto.INT4, (1,), [1]) +y = np.array([-2, 0, 12, -10, -18], dtype=np.float32) + +expect( + node, + inputs=[x, x_scale, x_zero_point], + outputs=[y], + name="test_dequantizelinear_int4", +) +``` + +
+ +
uint16 @@ -7541,6 +7639,34 @@ expect(
+
+uint4 + +```python +node = onnx.helper.make_node( + "DequantizeLinear", + inputs=["x", "x_scale", "x_zero_point"], + outputs=["y"], + axis=0, +) + +# scalar zero point and scale +x = make_tensor("x", TensorProto.UINT4, [5], [0, 1, 7, 10, 15]) +x_scale = np.float32(2) +x_zero_point = make_tensor("zero_point", TensorProto.UINT4, (1,), [1]) +y = np.array([-2, 0, 12, 18, 28], dtype=np.float32) + +expect( + node, + inputs=[x, x_scale, x_zero_point], + outputs=[y], + name="test_dequantizelinear_uint4", +) +``` + +
+ + ###
**Det** Det calculates determinant of a square matrix or batches of square matrices. @@ -8833,9 +8959,9 @@ expect(node, inputs=[x], outputs=[y], name="test_eyelike_without_dtype") #### Version -This version of the operator has been available since version 13 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator: 1, 9, 11 +Other versions of this operator: 1, 9, 11, 13 #### Attributes @@ -8861,8 +8987,8 @@ Other versions of this operator: 1, -
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
-
Constrain input and output to all tensor types.
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output to all tensor types up to IRv10.
@@ -11798,9 +11924,9 @@ expect(node, inputs=[x], outputs=[y], name="test_hardmax_default_axis") #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator:
1, 13, 14, 16 +Other versions of this operator: 1, 13, 14, 16, 19 #### Inputs @@ -11819,7 +11945,7 @@ Other versions of this operator: 1, -
V : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), seq(tensor(uint8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(int8)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(float16)), seq(tensor(float)), seq(tensor(double)), seq(tensor(string)), seq(tensor(bool)), seq(tensor(complex64)), seq(tensor(complex128)), optional(seq(tensor(uint8))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(int8))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(float16))), optional(seq(tensor(float))), optional(seq(tensor(double))), optional(seq(tensor(string))), optional(seq(tensor(bool))), optional(seq(tensor(complex64))), optional(seq(tensor(complex128))), optional(tensor(uint8)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(int8)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(float16)), optional(tensor(float)), optional(tensor(double)), optional(tensor(string)), optional(tensor(bool)), optional(tensor(complex64)), optional(tensor(complex128))
+
V : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4), seq(tensor(uint8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(int8)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(float16)), seq(tensor(float)), seq(tensor(double)), seq(tensor(string)), seq(tensor(bool)), seq(tensor(complex64)), seq(tensor(complex128)), optional(seq(tensor(uint8))), optional(seq(tensor(uint16))), optional(seq(tensor(uint32))), optional(seq(tensor(uint64))), optional(seq(tensor(int8))), optional(seq(tensor(int16))), optional(seq(tensor(int32))), optional(seq(tensor(int64))), optional(seq(tensor(float16))), optional(seq(tensor(float))), optional(seq(tensor(double))), optional(seq(tensor(string))), optional(seq(tensor(bool))), optional(seq(tensor(complex64))), optional(seq(tensor(complex128))), optional(tensor(uint8)), optional(tensor(uint16)), optional(tensor(uint32)), optional(tensor(uint64)), optional(tensor(int8)), optional(tensor(int16)), optional(tensor(int32)), optional(tensor(int64)), optional(tensor(float16)), optional(tensor(float)), optional(tensor(double)), optional(tensor(string)), optional(tensor(bool)), optional(tensor(complex64)), optional(tensor(complex128))
Constrain input and output types to all tensor, sequence, and optional types.
@@ -11931,9 +12057,9 @@ expect(node, inputs=[data], outputs=[data], name="test_identity_sequence") #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator:
1, 11, 13, 16 +Other versions of this operator: 1, 11, 13, 16, 19 #### Attributes @@ -11961,8 +12087,8 @@ Other versions of this operator: 1, 1, 11, 13, 16 +Other versions of this operator: 1, 11, 13, 16, 19 #### Attributes @@ -14004,8 +14130,8 @@ Other versions of this operator: 1, 1, 2, 11, 13, 18 +Other versions of this operator: 1, 2, 11, 13, 18, 19 #### Attributes @@ -19345,8 +19471,8 @@ Other versions of this operator: 1, 10
T1 : tensor(float), tensor(float16), tensor(bfloat16), tensor(int32)
The type of the input 'x'.
-
T2 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T2 : tensor(int8), tensor(uint8), tensor(int16), tensor(uint16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
The type of the input 'y_zero_point' and the output 'y'.
@@ -20082,9 +20209,7 @@ node = onnx.helper.make_node( x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32) y_scale = np.float32(2) y_zero_point = make_tensor("zero_point", TensorProto.FLOAT8E4M3FN, [1], [0]) -y = make_tensor( - "zero_point", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96] -) +y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96]) expect( node, @@ -20110,9 +20235,7 @@ node = onnx.helper.make_node( x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32) y_scale = np.float32(2) y_zero_point = make_tensor("zero_point", TensorProto.FLOAT8E5M2, [1], [0.0]) -y = make_tensor( - "zero_point", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96] -) +y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96]) expect( node, @@ -20189,6 +20312,44 @@ expect( +
+int4 + +```python +node = onnx.helper.make_node( + "QuantizeLinear", + inputs=["x", "y_scale", "y_zero_point"], + outputs=["y"], + axis=0, +) + +x = np.array( + [ + [0.0, 2.5, 4.8, 8.6], + [-30, -20, 6, 9], + [12, 15, 16, 40], + ] +).astype(np.float32) + +y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32) +y_zero_point = make_tensor( + "zero_point", TensorProto.INT4, y_scale.shape, np.ones_like(y_scale) +) +y = make_tensor( + "y", TensorProto.INT4, x.shape, [1, 2, 3, 5, -8, -6, 3, 4, 4, 5, 5, 7] +) + +expect( + node, + inputs=[x, y_scale, y_zero_point], + outputs=[y], + name="test_quantizelinear_int4", +) +``` + +
+ +
quantizelinear @@ -20271,6 +20432,44 @@ expect(
+
+uint4 + +```python +node = onnx.helper.make_node( + "QuantizeLinear", + inputs=["x", "y_scale", "y_zero_point"], + outputs=["y"], + axis=0, +) + +x = np.array( + [ + [0.0, 2.5, 4.8, 8.6], + [-30, -20, 6, 9], + [12, 15, 16, 40], + ] +).astype(np.float32) + +y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32) +y_zero_point = make_tensor( + "zero_point", TensorProto.UINT4, y_scale.shape, np.ones_like(y_scale) +) +y = make_tensor( + "y", TensorProto.UINT4, x.shape, [1, 2, 3, 5, -1, -1, 3, 4, 4, 5, 5, 11] +) + +expect( + node, + inputs=[x, y_scale, y_zero_point], + outputs=[y], + name="test_quantizelinear_uint4", +) +``` + +
+ + ### **RNN** Computes an one-layer simple RNN. This operator is usually supported @@ -23762,9 +23961,9 @@ expect(node, inputs=[x], outputs=[y], name="test_relu") #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator: 1, 5, 13, 14 +Other versions of this operator: 1, 5, 13, 14, 19 #### Attributes @@ -23792,7 +23991,7 @@ Other versions of this operator: 1, -
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain input and output types to all tensor types.
@@ -26810,9 +27009,9 @@ expect( #### Version -This version of the operator has been available since version 19 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator:
8, 9, 11, 16 +Other versions of this operator: 8, 9, 11, 16, 19 #### Attributes @@ -26848,8 +27047,8 @@ Other versions of this operator: 8, 1, 13, 15 +Other versions of this operator: 1, 13, 15, 19 #### Attributes @@ -28550,7 +28749,7 @@ Other versions of this operator: 1, 1, 13 +Other versions of this operator: 1, 13, 19 #### Inputs @@ -28924,7 +29123,7 @@ Other versions of this operator: 1, 1, 11 +Other versions of this operator: 1, 11, 13 #### Inputs (1 - 2) @@ -31700,8 +31899,8 @@ Other versions of this operator: 1, -
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
-
Constrain input and output types to all tensor types.
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types up to IRv10.
@@ -33247,9 +33446,9 @@ expect( #### Version -This version of the operator has been available since version 13 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator:
1 +Other versions of this operator: 1, 13 #### Attributes @@ -33275,7 +33474,7 @@ Other versions of this operator: 1 #### Type Constraints
-
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
Constrain input and output types to all tensor types.
@@ -34279,9 +34478,9 @@ expect( #### Version -This version of the operator has been available since version 13 of the default ONNX operator set. +This version of the operator has been available since version 21 of the default ONNX operator set. -Other versions of this operator: 1, 11 +Other versions of this operator: 1, 11, 13 #### Inputs @@ -34302,8 +34501,8 @@ Other versions of this operator: 1, -
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
-
Constrain input and output types to all tensor types.
+
T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(bfloat16), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(uint4), tensor(int4)
+
Constrain input and output types to all tensor types up to IRv10.
diff --git a/docs/TestCoverage.md b/docs/TestCoverage.md index 862d3272ba0..d17302e6cf5 100644 --- a/docs/TestCoverage.md +++ b/docs/TestCoverage.md @@ -2330,10 +2330,27 @@ test_cases = [ ("FLOAT8E5M2", "FLOAT16"), ("FLOAT8E5M2FNUZ", "FLOAT"), ("FLOAT8E5M2FNUZ", "FLOAT16"), + ("FLOAT", "UINT4"), + ("FLOAT16", "UINT4"), + ("FLOAT", "INT4"), + ("FLOAT16", "INT4"), + ("UINT4", "FLOAT"), + ("UINT4", "FLOAT16"), + ("UINT4", "UINT8"), + ("INT4", "FLOAT"), + ("INT4", "FLOAT16"), + ("INT4", "INT8"), ] vect_float32_to_float8e4m3 = np.vectorize(float32_to_float8e4m3) vect_float32_to_float8e5m2 = np.vectorize(float32_to_float8e5m2) +vect_float32_to_uint4 = np.vectorize( + lambda x: subbyte.float32_to_4bit_unpacked(x, signed=False) +) +vect_float32_to_int4 = np.vectorize( + lambda x: subbyte.float32_to_4bit_unpacked(x, signed=True) +) + f8_types = ("FLOAT8E4M3FN", "FLOAT8E4M3FNUZ", "FLOAT8E5M2", "FLOAT8E5M2FNUZ") for from_type, to_type in test_cases: @@ -2486,6 +2503,59 @@ for from_type, to_type in test_cases: "x", getattr(TensorProto, to_type), [3, 5], expected.tolist() ) output = expected_tensor + elif from_type in ("UINT4", "INT4") or to_type in ("UINT4", "INT4"): + np_fp32 = np.arange(-9, 16).astype(np.float32) + input_shape = (5, 5) + if from_type == "FLOAT": + input_values = np_fp32 + input = make_tensor( + "x", TensorProto.FLOAT, input_shape, input_values.tolist() + ) + elif from_type == "FLOAT16": + input_values = np_fp32.astype(np.float16) + input = make_tensor( + "x", TensorProto.FLOAT16, input_shape, input_values.tolist() + ) + elif from_type == "UINT4": + input_values = vect_float32_to_uint4(np_fp32) + input = make_tensor( + "x", TensorProto.UINT4, input_shape, input_values.tolist() + ) + elif from_type == "INT4": + input_values = vect_float32_to_int4(np_fp32) + input = make_tensor( + "x", TensorProto.INT4, input_shape, input_values.tolist() + ) + else: + raise ValueError( + "Conversion from {from_type} to {to_type} is not tested." 
+ ) + if to_type == "UINT4": + expected = vect_float32_to_uint4(input_values).astype(custom.uint4) + elif to_type == "INT4": + expected = vect_float32_to_int4(input_values).astype(custom.int4) + elif to_type == "FLOAT16": + expected = input_values.astype(np.float16) + elif to_type == "FLOAT": + expected = input_values + elif to_type == "UINT8": + expected = input_values.astype(np.uint8) + elif to_type == "INT8": + expected = input_values.astype(np.int8) + else: + raise ValueError( + "Conversion from {from_type} to {to_type} is not tested." + ) + expected_tensor = make_tensor( + "y", getattr(TensorProto, to_type), input_shape, expected.tolist() + ) + output = expected_tensor + input_type_proto = onnx.helper.make_tensor_type_proto( + getattr(TensorProto, from_type), input_shape + ) + output_type_proto = onnx.helper.make_tensor_type_proto( + getattr(TensorProto, to_type), input_shape + ) elif from_type != "STRING": input = np.random.random_sample(shape).astype( @@ -5105,7 +5175,7 @@ expect(node, inputs=[x], outputs=[y], name="test_depthtospace_example") ### DequantizeLinear -There are 8 test cases, listed as following: +There are 10 test cases, listed as following:
axis @@ -5291,6 +5361,32 @@ expect( ) ``` +
+
+int4 + +```python +node = onnx.helper.make_node( + "DequantizeLinear", + inputs=["x", "x_scale", "x_zero_point"], + outputs=["y"], + axis=0, +) + +# scalar zero point and scale +x = make_tensor("x", TensorProto.INT4, [5], [0, 1, 7, -4, -8]) +x_scale = np.float32(2) +x_zero_point = make_tensor("zero_point", TensorProto.INT4, (1,), [1]) +y = np.array([-2, 0, 12, -10, -18], dtype=np.float32) + +expect( + node, + inputs=[x, x_scale, x_zero_point], + outputs=[y], + name="test_dequantizelinear_int4", +) +``` +
uint16 @@ -5315,6 +5411,32 @@ expect( ) ``` +
+
+uint4 + +```python +node = onnx.helper.make_node( + "DequantizeLinear", + inputs=["x", "x_scale", "x_zero_point"], + outputs=["y"], + axis=0, +) + +# scalar zero point and scale +x = make_tensor("x", TensorProto.UINT4, [5], [0, 1, 7, 10, 15]) +x_scale = np.float32(2) +x_zero_point = make_tensor("zero_point", TensorProto.UINT4, (1,), [1]) +y = np.array([-2, 0, 12, 18, 28], dtype=np.float32) + +expect( + node, + inputs=[x, x_scale, x_zero_point], + outputs=[y], + name="test_dequantizelinear_uint4", +) +``` +
@@ -13546,7 +13668,7 @@ for quant_type_name in ["uint8", "int8"]: ### QuantizeLinear -There are 6 test cases, listed as following: +There are 8 test cases, listed as following:
axis @@ -13595,9 +13717,7 @@ node = onnx.helper.make_node( x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32) y_scale = np.float32(2) y_zero_point = make_tensor("zero_point", TensorProto.FLOAT8E4M3FN, [1], [0]) -y = make_tensor( - "zero_point", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96] -) +y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96]) expect( node, @@ -13621,9 +13741,7 @@ node = onnx.helper.make_node( x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32) y_scale = np.float32(2) y_zero_point = make_tensor("zero_point", TensorProto.FLOAT8E5M2, [1], [0.0]) -y = make_tensor( - "zero_point", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96] -) +y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96]) expect( node, @@ -13695,6 +13813,42 @@ expect( ) ``` +
+
+int4 + +```python +node = onnx.helper.make_node( + "QuantizeLinear", + inputs=["x", "y_scale", "y_zero_point"], + outputs=["y"], + axis=0, +) + +x = np.array( + [ + [0.0, 2.5, 4.8, 8.6], + [-30, -20, 6, 9], + [12, 15, 16, 40], + ] +).astype(np.float32) + +y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32) +y_zero_point = make_tensor( + "zero_point", TensorProto.INT4, y_scale.shape, np.ones_like(y_scale) +) +y = make_tensor( + "y", TensorProto.INT4, x.shape, [1, 2, 3, 5, -8, -6, 3, 4, 4, 5, 5, 7] +) + +expect( + node, + inputs=[x, y_scale, y_zero_point], + outputs=[y], + name="test_quantizelinear_int4", +) +``` +
quantizelinear @@ -13773,6 +13927,42 @@ expect( ) ``` +
+
+uint4 + +```python +node = onnx.helper.make_node( + "QuantizeLinear", + inputs=["x", "y_scale", "y_zero_point"], + outputs=["y"], + axis=0, +) + +x = np.array( + [ + [0.0, 2.5, 4.8, 8.6], + [-30, -20, 6, 9], + [12, 15, 16, 40], + ] +).astype(np.float32) + +y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32) +y_zero_point = make_tensor( + "zero_point", TensorProto.UINT4, y_scale.shape, np.ones_like(y_scale) +) +y = make_tensor( + "y", TensorProto.UINT4, x.shape, [1, 2, 3, 5, -1, -1, 3, 4, 4, 5, 5, 11] +) + +expect( + node, + inputs=[x, y_scale, y_zero_point], + outputs=[y], + name="test_quantizelinear_uint4", +) +``` +
diff --git a/docs/docsgen/source/technical/index.md b/docs/docsgen/source/technical/index.md index 5981cc5cebe..20d16834675 100644 --- a/docs/docsgen/source/technical/index.md +++ b/docs/docsgen/source/technical/index.md @@ -15,4 +15,5 @@ deeper than the code documentation. :maxdepth: 2 float8 +int4 ``` diff --git a/docs/docsgen/source/technical/int4.md b/docs/docsgen/source/technical/int4.md new file mode 100644 index 00000000000..0f78ebaebcc --- /dev/null +++ b/docs/docsgen/source/technical/int4.md @@ -0,0 +1,55 @@ + + +(onnx-detail-int4)= + +# 4 bit integer types + +## Papers + +Several papers have been published in 2023 to introduce 4 bit integers and their usage in LLMs. Although their range is +limited, with careful selection of scaling parameters, good accuracy is obtained when used for compression of weights +(weight-only quantization), and in some cases for quantization of activations as well. + +[AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978) +Activation-aware Weight Quantization (AWQ) focuses on the quantization of weights in LLMs by considering the +observation that not all weights are equally important. The method aims to protect salient weights based on the +activation, rather than relying on backpropagation or reconstruction techniques. By searching for the optimal +per-channel scaling that preserves the crucial weights, AWQ aims to minimize quantization errors. + +[GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https://arxiv.org/abs/2210.17323) +GPTQ proposes a one-shot weight quantization method based on approximate second-order information. GPTQ achieves +significant compression gains, reducing the bit-width to 3 or 4 bits per weight with negligible accuracy degradation +compared to the uncompressed baseline. 
+ +[Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases](https://arxiv.org/abs/2301.12017) +This paper discusses quantization of both weights and activations to 4 bit (W4A4). Results indicate that W4A4 +quantization leads to little to no accuracy degradation for encoder-only and encoder-decoder models but results in +a significant accuracy drop for decoder-only models. To realize the performance gains using W4A4, the study introduces +a highly optimized end-to-end W4A4 encoder inference pipeline that supports various quantization strategies. + +As a result, two new types were introduced in `onnx==1.17.0` supporting a limited set of operators to enable compression using +4 bit data-types: +- `UINT4`: 4 bit unsigned integer, values in range [0, 15] +- `INT4`: 4 bit signed integer, using two's complement represntation. Values in range [-8, 7]. + +## Cast + +Cast from 4 bit to any higher precision type is exact. +Cast to a 4 bit type is done by rounding to the nearest-integer (with ties to even) +nearest-even integer and truncating. + +## Packing and Unpacking + +All 4 bit types are stored as 2x4bit in a single byte. +The first element is stored in the 4 LSB and the second element is stored in the 4 MSB. +i.e. for elements x, y, that are consecutive elements in the array: +```{eval-rst} +pack(x,y): y << 4 | x & 0x0F +unpack(z): x = z & 0x0F, y = z >> 4 +``` +In case the total number of elements is odd, padding of 4 bits will be appended. +The storage size of a 4 bit tensor of size `N` is `ceil(N/2)`. 
diff --git a/onnx/backend/test/case/node/cast.py b/onnx/backend/test/case/node/cast.py index 0db98c27d42..995c1be158a 100644 --- a/onnx/backend/test/case/node/cast.py +++ b/onnx/backend/test/case/node/cast.py @@ -7,7 +7,8 @@ import numpy as np import onnx -from onnx import TensorProto, helper +import onnx.reference.custom_element_types as custom +from onnx import TensorProto, helper, subbyte from onnx.backend.test.case.base import Base from onnx.backend.test.case.node import expect from onnx.helper import ( @@ -50,10 +51,27 @@ def export() -> None: ("FLOAT8E5M2", "FLOAT16"), ("FLOAT8E5M2FNUZ", "FLOAT"), ("FLOAT8E5M2FNUZ", "FLOAT16"), + ("FLOAT", "UINT4"), + ("FLOAT16", "UINT4"), + ("FLOAT", "INT4"), + ("FLOAT16", "INT4"), + ("UINT4", "FLOAT"), + ("UINT4", "FLOAT16"), + ("UINT4", "UINT8"), + ("INT4", "FLOAT"), + ("INT4", "FLOAT16"), + ("INT4", "INT8"), ] vect_float32_to_float8e4m3 = np.vectorize(float32_to_float8e4m3) vect_float32_to_float8e5m2 = np.vectorize(float32_to_float8e5m2) + vect_float32_to_uint4 = np.vectorize( + lambda x: subbyte.float32_to_4bit_unpacked(x, signed=False) + ) + vect_float32_to_int4 = np.vectorize( + lambda x: subbyte.float32_to_4bit_unpacked(x, signed=True) + ) + f8_types = ("FLOAT8E4M3FN", "FLOAT8E4M3FNUZ", "FLOAT8E5M2", "FLOAT8E5M2FNUZ") for from_type, to_type in test_cases: @@ -206,6 +224,59 @@ def export() -> None: "x", getattr(TensorProto, to_type), [3, 5], expected.tolist() ) output = expected_tensor + elif from_type in ("UINT4", "INT4") or to_type in ("UINT4", "INT4"): + np_fp32 = np.arange(-9, 16).astype(np.float32) + input_shape = (5, 5) + if from_type == "FLOAT": + input_values = np_fp32 + input = make_tensor( + "x", TensorProto.FLOAT, input_shape, input_values.tolist() + ) + elif from_type == "FLOAT16": + input_values = np_fp32.astype(np.float16) + input = make_tensor( + "x", TensorProto.FLOAT16, input_shape, input_values.tolist() + ) + elif from_type == "UINT4": + input_values = vect_float32_to_uint4(np_fp32) + input = 
make_tensor( + "x", TensorProto.UINT4, input_shape, input_values.tolist() + ) + elif from_type == "INT4": + input_values = vect_float32_to_int4(np_fp32) + input = make_tensor( + "x", TensorProto.INT4, input_shape, input_values.tolist() + ) + else: + raise ValueError( + "Conversion from {from_type} to {to_type} is not tested." + ) + if to_type == "UINT4": + expected = vect_float32_to_uint4(input_values).astype(custom.uint4) + elif to_type == "INT4": + expected = vect_float32_to_int4(input_values).astype(custom.int4) + elif to_type == "FLOAT16": + expected = input_values.astype(np.float16) + elif to_type == "FLOAT": + expected = input_values + elif to_type == "UINT8": + expected = input_values.astype(np.uint8) + elif to_type == "INT8": + expected = input_values.astype(np.int8) + else: + raise ValueError( + "Conversion from {from_type} to {to_type} is not tested." + ) + expected_tensor = make_tensor( + "y", getattr(TensorProto, to_type), input_shape, expected.tolist() + ) + output = expected_tensor + input_type_proto = onnx.helper.make_tensor_type_proto( + getattr(TensorProto, from_type), input_shape + ) + output_type_proto = onnx.helper.make_tensor_type_proto( + getattr(TensorProto, to_type), input_shape + ) elif from_type != "STRING": input = np.random.random_sample(shape).astype( diff --git a/onnx/backend/test/case/node/dequantizelinear.py b/onnx/backend/test/case/node/dequantizelinear.py index b19c9b3ab39..00f894dda2b 100644 --- a/onnx/backend/test/case/node/dequantizelinear.py +++ b/onnx/backend/test/case/node/dequantizelinear.py @@ -189,3 +189,47 @@ def export_int16() -> None: outputs=[y], name="test_dequantizelinear_int16", ) + + @staticmethod + def export_uint4() -> None: + node = onnx.helper.make_node( + "DequantizeLinear", + inputs=["x", "x_scale", "x_zero_point"], + outputs=["y"], + axis=0, + ) + + # scalar zero point and scale + x = make_tensor("x", TensorProto.UINT4, [5], [0, 1, 7, 10, 15]) + x_scale = np.float32(2) + x_zero_point = 
make_tensor("zero_point", TensorProto.UINT4, (1,), [1]) + y = np.array([-2, 0, 12, 18, 28], dtype=np.float32) + + expect( + node, + inputs=[x, x_scale, x_zero_point], + outputs=[y], + name="test_dequantizelinear_uint4", + ) + + @staticmethod + def export_int4() -> None: + node = onnx.helper.make_node( + "DequantizeLinear", + inputs=["x", "x_scale", "x_zero_point"], + outputs=["y"], + axis=0, + ) + + # scalar zero point and scale + x = make_tensor("x", TensorProto.INT4, [5], [0, 1, 7, -4, -8]) + x_scale = np.float32(2) + x_zero_point = make_tensor("zero_point", TensorProto.INT4, (1,), [1]) + y = np.array([-2, 0, 12, -10, -18], dtype=np.float32) + + expect( + node, + inputs=[x, x_scale, x_zero_point], + outputs=[y], + name="test_dequantizelinear_int4", + ) diff --git a/onnx/backend/test/case/node/quantizelinear.py b/onnx/backend/test/case/node/quantizelinear.py index 30a652ca06d..481e316066e 100644 --- a/onnx/backend/test/case/node/quantizelinear.py +++ b/onnx/backend/test/case/node/quantizelinear.py @@ -74,9 +74,7 @@ def export_e4m3fn() -> None: x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32) y_scale = np.float32(2) y_zero_point = make_tensor("zero_point", TensorProto.FLOAT8E4M3FN, [1], [0]) - y = make_tensor( - "zero_point", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96] - ) + y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96]) expect( node, @@ -96,9 +94,7 @@ def export_e5m2() -> None: x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32) y_scale = np.float32(2) y_zero_point = make_tensor("zero_point", TensorProto.FLOAT8E5M2, [1], [0.0]) - y = make_tensor( - "zero_point", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96] - ) + y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96]) expect( node, @@ -214,3 +210,67 @@ def export_int16() -> None: outputs=[y], name="test_quantizelinear_int16", ) + + @staticmethod + def export_uint4() -> None: + node = onnx.helper.make_node( + "QuantizeLinear", 
+ inputs=["x", "y_scale", "y_zero_point"], + outputs=["y"], + axis=0, + ) + + x = np.array( + [ + [0.0, 2.5, 4.8, 8.6], + [-30, -20, 6, 9], + [12, 15, 16, 40], + ] + ).astype(np.float32) + + y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32) + y_zero_point = make_tensor( + "zero_point", TensorProto.UINT4, y_scale.shape, np.ones_like(y_scale) + ) + y = make_tensor( + "y", TensorProto.UINT4, x.shape, [1, 2, 3, 5, -1, -1, 3, 4, 4, 5, 5, 11] + ) + + expect( + node, + inputs=[x, y_scale, y_zero_point], + outputs=[y], + name="test_quantizelinear_uint4", + ) + + @staticmethod + def export_int4() -> None: + node = onnx.helper.make_node( + "QuantizeLinear", + inputs=["x", "y_scale", "y_zero_point"], + outputs=["y"], + axis=0, + ) + + x = np.array( + [ + [0.0, 2.5, 4.8, 8.6], + [-30, -20, 6, 9], + [12, 15, 16, 40], + ] + ).astype(np.float32) + + y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32) + y_zero_point = make_tensor( + "zero_point", TensorProto.INT4, y_scale.shape, np.ones_like(y_scale) + ) + y = make_tensor( + "y", TensorProto.INT4, x.shape, [1, 2, 3, 5, -8, -6, 3, 4, 4, 5, 5, 7] + ) + + expect( + node, + inputs=[x, y_scale, y_zero_point], + outputs=[y], + name="test_quantizelinear_int4", + ) diff --git a/onnx/backend/test/data/node/test_cast_BFLOAT16_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_BFLOAT16_to_FLOAT/model.onnx index da617fe26f4a2c0ebb020fbc2caf457f16b851c7..cbc42505cbf20bfa6d275888ba22937e1f1224d6 100644 GIT binary patch delta 8 PcmeBT>|&hI$tVf{3z7mx delta 8 PcmeBT>|&hI$tVl}3y=at diff --git a/onnx/backend/test/data/node/test_cast_DOUBLE_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_DOUBLE_to_FLOAT/model.onnx index 90ac59d914964be06dc071c822778a6b5e0a37f0..ebc2159c76de8314767921279da2cefef16bdabe 100644 GIT binary patch delta 8 PcmeBR>|mVG&L|213vdEJ delta 8 PcmeBR>|mVG&L|833vL2F diff --git a/onnx/backend/test/data/node/test_cast_DOUBLE_to_FLOAT16/model.onnx 
b/onnx/backend/test/data/node/test_cast_DOUBLE_to_FLOAT16/model.onnx index f0b4065b0707c08a14430de4e5a3775ea95ed1db..d01a5686d26ea8a25cd596c489b35e0769d89cae 100644 GIT binary patch delta 8 PcmeBT>|&hI$tVf{3z7mx delta 8 PcmeBT>|&hI$tVl}3y=at diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_DOUBLE/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT16_to_DOUBLE/model.onnx index b5a760ece7fec8a404c43d4570037dffb621dd7b..da8ad2bd517f0959c6a7973270b101ad8ca39883 100644 GIT binary patch delta 8 PcmeBT>|&hI$tVf{3z7mx delta 8 PcmeBT>|&hI$tVl}3y=at diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT/model.onnx index 810e0670f6880c674769309b4c839d6f4f9e6033..ec60832d58aca5ac974857ba0a74e80ab45eda11 100644 GIT binary patch delta 8 PcmeBV>|~tK!6*s<3xNVd delta 8 PcmeBV>|~tK!6*y>3x5JZ diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E4M3FN/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E4M3FN/model.onnx index 1d9af3921942cf5f5d4e9ef6db0314e852db1e0d..e317066bb7cf556e56a2566a66e8abdc62ee4b41 100644 GIT binary patch delta 8 PcmbQhIDv6OKcgrB3-|(9 delta 8 PcmbQhIDv6OKcg@J3-$t5 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E4M3FNUZ/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E4M3FNUZ/model.onnx index 38833c10233c38f54b6610989aedaec04ea468be..2d37afd2f2521a70a2645184be8ee2d22e83f972 100644 GIT binary patch delta 8 PcmbQlIEit>L`G2n3>pGn delta 8 PcmbQlIEit>L`Go%3>X4j diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E5M2/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E5M2/model.onnx index 260eb119d502a7413cd7f6900c358435363ea608..53c976ff2dc9317807f5bf372c98bc7a7b26c740 100644 GIT binary patch delta 8 PcmeBU>|>nJ%P0x}3)TWs delta 8 PcmeBU>|>nJ%P0&03)BKo diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E5M2FNUZ/model.onnx 
b/onnx/backend/test/data/node/test_cast_FLOAT16_to_FLOAT8E5M2FNUZ/model.onnx index 431483be3ce444572f8326fdd2944e382f374eae..3d845b15f21c86203cfb19ec5007bae4e3731162 100644 GIT binary patch delta 8 PcmbQlIEit>L`G2n3>pGn delta 8 PcmbQlIEit>L`Go%3>X4j diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_INT4/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT16_to_INT4/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..931c37c45c76c4619ff60e1ea0dd5d3b36ba2c22 GIT binary patch literal 136 zcmd-kD&v!ZqVaA%{*EDrX7MHY@t%GmCQ;%r&3s%u99%*iTudCSAe07OC8B_LLw!(&)^q?E6)kPt%R_o|l83}Fb2qYtl Mka1$+Vh|7o0Pf%)KL7v# literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT16_to_UINT4/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_FLOAT16_to_UINT4/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..e0cf2a6f6dde448ba110abc9756930c27ad8ab33 GIT binary patch literal 83 zcmV~$I|_g>07OC8B_LLw!(&)^q?E6)kPt%R_o|l83}FbX?0Y&H3cE**Ary1We@;fnB0st^y1p5F0 literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FNUZ_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FNUZ_to_FLOAT/model.onnx index 80019e9506dbe2132c65a19c98584650f1064759..19d1192514e39005a1dfbd475d7cf84e79b84e4b 100644 GIT binary patch delta 8 PcmbQhIDv6OKcgrB3-|(9 delta 8 PcmbQhIDv6OKcg@J3-$t5 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FNUZ_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FNUZ_to_FLOAT16/model.onnx index 3293a43f146219029f7a0b37d7d1bffce9018a0e..407b517d479d1245abe66e2aa0bee4f84d54ef05 100644 GIT binary patch delta 8 PcmbQlIEit>L`G2n3>pGn delta 8 PcmbQlIEit>L`Go%3>X4j diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FN_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FN_to_FLOAT/model.onnx index 
bd95aee4346a3e0e436ff92f5f4b5a9bcf5deef3..65c54fc5ecbb2ce80ef87b1037e9ac4d927bcc56 100644 GIT binary patch delta 8 PcmeBU>|>nJ%P0x}3)TWs delta 8 PcmeBU>|>nJ%P0&03)BKo diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FN_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E4M3FN_to_FLOAT16/model.onnx index 83ab3d1eaddce0220959a126eb80e8e57b00ccac..d83e1afce24c45f4b12bec71f42f7b7cbd248803 100644 GIT binary patch delta 8 PcmbQhIDv6OKcgrB3-|(9 delta 8 PcmbQhIDv6OKcg@J3-$t5 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E5M2FNUZ_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E5M2FNUZ_to_FLOAT/model.onnx index fb708334c8cb0c61317e734d8e7eabddd288dc3b..f5abb9278c5cf0f9f151bf1d2458ea3e0984ddec 100644 GIT binary patch delta 8 PcmbQhIDv6OKcgrB3-|(9 delta 8 PcmbQhIDv6OKcg@J3-$t5 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E5M2FNUZ_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E5M2FNUZ_to_FLOAT16/model.onnx index caba512d6fc0c1147384a17782871b246cd2d078..9d8d23681a6997788de59adbf83521bd8a37501c 100644 GIT binary patch delta 8 PcmbQlIEit>L`G2n3>pGn delta 8 PcmbQlIEit>L`Go%3>X4j diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E5M2_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E5M2_to_FLOAT/model.onnx index 9d87dc9a74e8a62c01f98defeaec0f330fe34e73..8aa06428088240cca384c591c329774610ffd667 100644 GIT binary patch delta 8 PcmeBS>|vbH%_s^03$y}E delta 8 PcmeBS>|vbH%_s~23$g-A diff --git a/onnx/backend/test/data/node/test_cast_FLOAT8E5M2_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT8E5M2_to_FLOAT16/model.onnx index dcc2bf5ad736e02737acef9974dd55b9db4a552e..e766a07b3f5790f848ec0050951cf50fcc525619 100644 GIT binary patch delta 8 PcmeBU>|>nJ%P0x}3)TWs delta 8 PcmeBU>|>nJ%P0&03)BKo diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_BFLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_BFLOAT16/model.onnx index 
d86063317c3040666b38288c936fd33201fd38a0..1f52f938335dbc40e00da6a07cde5319c71aaa0b 100644 GIT binary patch delta 8 PcmeBT>|&hI$tVf{3z7mx delta 8 PcmeBT>|&hI$tVl}3y=at diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_DOUBLE/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_DOUBLE/model.onnx index 051319ee68a0c40f811f94f901f5143c4f065b3e..cb1c648ab32d5c41ce26029e6d171e2dde1befb8 100644 GIT binary patch delta 8 PcmeBR>|mVG&L|213vdEJ delta 8 PcmeBR>|mVG&L|833vL2F diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT16/model.onnx index e9b3ed56bcc26539f1a046da499c5f6ac4ead5df..a94aa02f20d3a513dfde0d61f2268619930397c6 100644 GIT binary patch delta 8 PcmeBV>|~tK!6*s<3xNVd delta 8 PcmeBV>|~tK!6*y>3x5JZ diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E4M3FN/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E4M3FN/model.onnx index 29341e21b83005df3952e736a853a0ea68d01d31..6be50cdab294913892632cb59109ed1c8306c8df 100644 GIT binary patch delta 8 PcmeBU>|>nJ%P0x}3)TWs delta 8 PcmeBU>|>nJ%P0&03)BKo diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E4M3FNUZ/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E4M3FNUZ/model.onnx index 5e31b781301a335e3b45f4ead8a743647810e434..a3666d9e26c1351a51f390c73ed200993a10983b 100644 GIT binary patch delta 8 PcmbQhIDv6OKcgrB3-|(9 delta 8 PcmbQhIDv6OKcg@J3-$t5 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E5M2/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E5M2/model.onnx index ad53b0951741c5f394d1efd1f0653c6aba80b044..f9c1edff4560dbd0a7b4f9e6bab381aedb256d00 100644 GIT binary patch delta 8 PcmeBS>|vbH%_s^03$y}E delta 8 PcmeBS>|vbH%_s~23$g-A diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E5M2FNUZ/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_FLOAT8E5M2FNUZ/model.onnx index 
015b15fc7ad7e22dc4d909642db68ffbbf32e986..adf6aabae8ce69a206bba6acc1fb295bf6f98133 100644 GIT binary patch delta 8 PcmbQhIDv6OKcgrB3-|(9 delta 8 PcmbQhIDv6OKcg@J3-$t5 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..6330b76dbd92fc879a3c5cfe07cecd4ce77fa4e1 GIT binary patch literal 134 zcmdCHe85ejz4N;xMgzTs$0%LL6L79IPOmB*6tWOb8?+hLCY$ J;bIUF1ps}89(Di# literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_FLOAT_to_INT4/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..7b2bd50e2f2279e285775cb4aef4053c27a1e48f GIT binary patch literal 111 zcmWm6F%f_;429v>p?4iCfkOcfNKt?U1_UrNMoOI!oKy-wx%VY+O35YHG@nv^C1HdV e=x`YH_lL{)$}$2+M_@pWKbWw|mVG&L|213vdEJ delta 8 PcmeBR>|mVG&L|833vL2F diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/model.onnx b/onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..ccb65b05f6c5cd3290ddf8778fc1f32f9e3514c0 GIT binary patch literal 135 zcmd-jD&v!ZqVaA%{*EE>CHe88o_--FQQ|PYd|W&nj6xh-OdPBroFu^oHBAU4BZ`o5 KV&P&C5Cs6JM;^-n literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_FLOAT_to_UINT4/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..7b2bd50e2f2279e285775cb4aef4053c27a1e48f GIT binary patch literal 111 zcmWm6F%f_;429v>p?4iCfkOcfNKt?U1_UrNMoOI!oKy-wx%VY+O35YHG@nv^C1HdV e=x`YH_lL{)$}$2+M_@pWKbWwX?0Y&H3cE**Ary1We@;fnB0st^y1p5F0 literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/model.onnx new file mode 100644 
index 0000000000000000000000000000000000000000..7da60f7685fc43c851c953aeee1034ac930c9cfe GIT binary patch literal 134 zcmd&q& J;bIUF1ps)V9$^3g literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/input_0.pb new file mode 100644 index 00000000000..e78a0f1d352 --- /dev/null +++ b/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/input_0.pb @@ -0,0 +1 @@ +*1ˆÿÿÿÿÿÿÿÿ©ÿÿÿÿÿÿÿÿËÿÿÿÿÿÿÿÿíÿÿÿÿÿÿÿÿ!CewwwwBx \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT/test_data_set_0/output_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..4ce54a941a4cd14644649e653e5b68e2620cd3d5 GIT binary patch literal 111 zcmd;JVt4nWKRV($miK+s?hBpiSUh#P=-0T3Sm L;s*q=6JsR+MhzM6 literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_cast_INT4_to_FLOAT16/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..4e6e478294d3af131de098e30e3a0248b1b4a5e7 GIT binary patch literal 136 zcmd-kD&v!ZqVb-7Atv!9`SET({*EDrW>MlW&3s%u9AZKoTudCSAeXJ M7#sRi_@% literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_INT8/model.onnx b/onnx/backend/test/data/node/test_cast_INT4_to_INT8/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..9e9b11c7e1daae391e2e5a709030f4c77bf1da6f GIT binary patch literal 133 zcmdkBf&xOo)SviGvk{lO(vHW(k30m=Q8gEL;o% Fq5w~89uNQk literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/input_0.pb new file mode 100644 index 00000000000..e78a0f1d352 --- /dev/null +++ 
b/onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/input_0.pb @@ -0,0 +1 @@ +*1ˆÿÿÿÿÿÿÿÿ©ÿÿÿÿÿÿÿÿËÿÿÿÿÿÿÿÿíÿÿÿÿÿÿÿÿ!CewwwwBx \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_cast_INT4_to_INT8/test_data_set_0/output_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..f156e06836d64ba1cff6a9bc56d0189de7ca62c9 GIT binary patch literal 117 zcmd;J|mVG&L|213vdEJ delta 8 PcmeBR>|mVG&L|833vL2F diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..f00870c34ba82e949f8131b2221d869463254a9b GIT binary patch literal 135 zcmd-jD&v!ZqVb`gejz6DCHe7gKK_m&QQ|PYd|W&nqCy;8OdPBroFu^oHBAU4!-$Y^ KV&P&C5Cs6EaUQn- literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..9953aa8dbbbf80edfac021acd0369bf99a172808 GIT binary patch literal 28 hcmd;JX?0Y&H3cE**Ary1We@;fnB001yv1o{8~ literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT/test_data_set_0/output_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..153e831cc6ca8d26c93aa157aa7c37236d3f97eb GIT binary patch literal 111 zcmd;J2qeRW Mka1$+Vh|7o0Pdt8Gynhq literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..9953aa8dbbbf80edfac021acd0369bf99a172808 GIT binary patch literal 28 
hcmd;JX?0Y&H3cE**Ary1We@;fnB001yv1o{8~ literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_cast_UINT4_to_FLOAT16/test_data_set_0/output_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..6f4a59e58e63944dfb5cf64b038895a2d0ff554b GIT binary patch literal 65 zcmX}ZyA6OK06;-sK|-vY!vPq;skD2Ag)xNq0bSS5yBJ~^`Z0AsXT=pa+%aRp15dp0 L#)=Iee3@82zpfD@ literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/model.onnx b/onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..046a7ba7174340115324d33c0431372ec3c36b47 GIT binary patch literal 135 zcmd-jD&v!ZqVb`gejz6DCHY{6MU*&9FCP~Vho}$-7ZV372q#H!K}{0^$uJ>goLIOR G1VjO(ksiAM literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..9953aa8dbbbf80edfac021acd0369bf99a172808 GIT binary patch literal 28 hcmd;JX?0Y&H3cE**Ary1We@;fnB001yv1o{8~ literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_cast_UINT4_to_UINT8/test_data_set_0/output_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..e040f478727c562bd15b82525c6a83faa0c00abc GIT binary patch literal 36 jcmd;J+ diff --git a/onnx/backend/test/data/node/test_castlike_DOUBLE_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_castlike_DOUBLE_to_FLOAT16/model.onnx index 6e813d486e92c95577bdd5a689047c81ca1736ab..439b5c0ab4c4ca7e777c62e614cd1a4d171f4c20 100644 GIT binary patch delta 8 PcmZ3)xQKDWLPk*l4JrbT delta 8 PcmZ3)xQKDWLPlW#4JZPP diff --git a/onnx/backend/test/data/node/test_castlike_DOUBLE_to_FLOAT16_expanded/model.onnx 
b/onnx/backend/test/data/node/test_castlike_DOUBLE_to_FLOAT16_expanded/model.onnx index 3dd10cf93d2cfa7f452e65348ac1e57dfea45b1c..eb617419b1f9ad88ec3f792c8e17d765c5bc4858 100644 GIT binary patch delta 8 PcmdnbxSw&tK1NXh4;KRE delta 8 PcmdnbxSw&tK1N{x4;2FA diff --git a/onnx/backend/test/data/node/test_castlike_DOUBLE_to_FLOAT_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_DOUBLE_to_FLOAT_expanded/model.onnx index 61436f85d70e9dcec13e3851ecd9339bf7db0b87..6c4b45e396875bf3a677b36b391765b5e143b1cf 100644 GIT binary patch delta 8 PcmdnXxR-Il9!60B4)p@x delta 8 PcmdnXxR-Il9!6mR4)X%t diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT16_to_DOUBLE/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT16_to_DOUBLE/model.onnx index 8abef62c50f8a5008776b77ba1c26d6b5e45ef12..dbf14bacb6e18c61507810be17acb9a9216b1697 100644 GIT binary patch delta 8 PcmZ3)xQKDWLPk*l4JrbT delta 8 PcmZ3)xQKDWLPlW#4JZPP diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT16_to_DOUBLE_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT16_to_DOUBLE_expanded/model.onnx index 278c739ba4528430d6041bbb7588639dee6ef24f..a60e73ea84a396a617262f199afd3f8733550529 100644 GIT binary patch delta 8 PcmdnbxSw&tK1NXh4;KRE delta 8 PcmdnbxSw&tK1N{x4;2FA diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT16_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT16_to_FLOAT/model.onnx index e85e0a13b96a07722198f13a2bbf00c4eff66216..8090e551f3f121db7a1674f584f63bfe51c97023 100644 GIT binary patch delta 8 PcmZ3;xR7ze0!C2)4H*K9 delta 8 PcmZ3;xR7ze0!Co~4Hp85 diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT16_to_FLOAT_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT16_to_FLOAT_expanded/model.onnx index 4983a0f201813c95390255c0a89f675ad484715e..43140a71f5622aad77a001b232ebe774d4f55682 100644 GIT binary patch delta 8 PcmdnTxQ}tdUPe&>4+a9_ delta 8 PcmdnTxQ}tdUPfU64+H|> diff --git 
a/onnx/backend/test/data/node/test_castlike_FLOAT8E4M3FNUZ_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT8E4M3FNUZ_to_FLOAT/model.onnx index 309c75d815339c46d66a6a024b153491f9377c52..5e63156d3ace52dfd86d74674070f967fe19db06 100644 GIT binary patch delta 8 PcmZ3%xPo!Qaz;@A4Uht$ delta 8 PcmZ3%xPo!QazLO delta 8 PcmZ3+xQubaQbu6_4Qv9K diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT8E4M3FN_to_FLOAT_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT8E4M3FN_to_FLOAT_expanded/model.onnx index 07052374573c3d0c54ee4fb588e8482d06c5c25c..a7c88c75a18d5d1d6f12a166933ff7e89bee0316 100644 GIT binary patch delta 8 PcmX@ic$jg*Ax2RE4_gB9 delta 8 PcmX@ic$jg*Ax2>U4_N~5 diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT8E5M2FNUZ_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT8E5M2FNUZ_to_FLOAT/model.onnx index 7b052928e81c4cfd1751dd7bd0f98f5709b5d62b..c6de364d2aae91dd4e3876e245d310491fbaf640 100644 GIT binary patch delta 8 PcmZ3%xPo!Qaz;@A4Uht$ delta 8 PcmZ3%xPo!Qaztno diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_BFLOAT16/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_BFLOAT16/model.onnx index d36b64fa8b302acbb798c6537bf734770297ec19..e9c27a459e3f0af6cc17f2d58268cb798f6c8054 100644 GIT binary patch delta 8 PcmZ3)xQKDWLPk*l4JrbT delta 8 PcmZ3)xQKDWLPlW#4JZPP diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_BFLOAT16_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_BFLOAT16_expanded/model.onnx index c70c1b83050cb11b58cd15b5874cf2dd30c95f26..a07f3ffb96a53fc7c688bebf9136a0c2a6798d51 100644 GIT binary patch delta 8 PcmdnbxSw&tK1NXh4;KRE delta 8 PcmdnbxSw&tK1N{x4;2FA diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_DOUBLE/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_DOUBLE/model.onnx index 6c213998b03ece28357f183132283645b625f40c..7b3bbbc2a3dc2f4f568b4521091b358cca1514fa 100644 GIT binary patch delta 8 
PcmZ3$xPWoOd`3|K4G02= delta 8 PcmZ3$xPWoOd`4ja4F&>+ diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_DOUBLE_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_DOUBLE_expanded/model.onnx index 9d194c8eb6b849b9327bf1e3ef4f913a38e99b3a..c58a49647be4878c2f8e1fb52734368b8bc512f7 100644 GIT binary patch delta 8 PcmdnXxR-Il9!60B4)p@x delta 8 PcmdnXxR-Il9!6mR4)X%t diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT16/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT16/model.onnx index 0d9347d15b3073ab8d59bd9fa6d47e1aff701dae..05a7a8a41db8bc2d10d639be593011a3b6671eda 100644 GIT binary patch delta 8 PcmZ3;xR7ze0!C2)4H*K9 delta 8 PcmZ3;xR7ze0!Co~4Hp85 diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT16_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT16_expanded/model.onnx index 3b2ec70c5b85f8a8a8170740d261471f6a65bf1b..2c56955d272ac3864716cac1ae183da21e15345c 100644 GIT binary patch delta 8 PcmdnTxQ}tdUPe&>4+a9_ delta 8 PcmdnTxQ}tdUPfU64+H|> diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E4M3FN/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E4M3FN/model.onnx index ec923ff7db1042be90f6d6ec9c101e50387037f9..e52850950382b778833f592d8e5c860784a8e2c3 100644 GIT binary patch delta 8 PcmZ3+xQubaQbth#4Q>LO delta 8 PcmZ3+xQubaQbu6_4Qv9K diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E4M3FNUZ/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E4M3FNUZ/model.onnx index 6a90db451cc669fe144154b96af1e7c8bb5bc0a2..df179a8b18306287dd0df954131c8ea2d58939eb 100644 GIT binary patch delta 8 PcmZ3%xPo!Qaz;@A4Uht$ delta 8 PcmZ3%xPo!QazU4_N~5 diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E5M2/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E5M2/model.onnx index aef2c4b2cbb3d36940492bc1fd682320c6420995..5e7121c64fdd005ae645f1496b27987ad21633c1 
100644 GIT binary patch delta 8 PcmZ3&xP)=SVn$H_4NL-* delta 8 PcmZ3&xP)=SVn$&A4N3x% diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E5M2FNUZ/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_FLOAT8E5M2FNUZ/model.onnx index e801dcd18d6d79cb4527deb110210ea1627bbd42..00f4adb1527c52938024e2625d7a27032162f380 100644 GIT binary patch delta 8 PcmZ3%xPo!Qaz;@A4Uht$ delta 8 PcmZ3%xPo!Qaztno diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_STRING/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_STRING/model.onnx index 9045e819d4bb46d0875cae2f5650bde95011e2e6..fa0c56365da8a30079d5a6cae5244a9e94fa6fca 100644 GIT binary patch delta 8 PcmZ3$xPWoOd`3|K4G02= delta 8 PcmZ3$xPWoOd`4ja4F&>+ diff --git a/onnx/backend/test/data/node/test_castlike_FLOAT_to_STRING_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_FLOAT_to_STRING_expanded/model.onnx index 592e3101602288a94f0b9ef28091cd982064f78f..331ae5208fa925b35a1b5e3b66b29a1332a90a57 100644 GIT binary patch delta 8 PcmdnXxR-Il9!60B4)p@x delta 8 PcmdnXxR-Il9!6mR4)X%t diff --git a/onnx/backend/test/data/node/test_castlike_STRING_to_FLOAT/model.onnx b/onnx/backend/test/data/node/test_castlike_STRING_to_FLOAT/model.onnx index 4445d4a87193a2472664dfaca557a67f9e8d406c..e53fbaaf92915ff0a3c27465e6f2954f06153281 100644 GIT binary patch delta 8 PcmZ3$xPWoOd`3|K4G02= delta 8 PcmZ3$xPWoOd`4ja4F&>+ diff --git a/onnx/backend/test/data/node/test_castlike_STRING_to_FLOAT_expanded/model.onnx b/onnx/backend/test/data/node/test_castlike_STRING_to_FLOAT_expanded/model.onnx index 0b24bd52ae5bfacebee2a0653103adcffc8a3d52..4bd7014da2053ada46be14bca7911bb13e6fd171 100644 GIT binary patch delta 8 PcmdnXxR-Il9!60B4)p@x delta 8 PcmdnXxR-Il9!6mR4)X%t diff --git a/onnx/backend/test/data/node/test_constant/model.onnx b/onnx/backend/test/data/node/test_constant/model.onnx index e9ac1b35ab7113f63693fcceaf33a47b8dea9c4c..a869ce3ba5f55bf4e292ec1f6fd9f5b72010a973 100644 GIT binary patch 
delta 8 PcmaFB_<(W3eMV6L5l90_ delta 8 PcmaFB_<(W3eMVsb5k><> diff --git a/onnx/backend/test/data/node/test_constant_pad/model.onnx b/onnx/backend/test/data/node/test_constant_pad/model.onnx index 899c2e6d0224989f1af6f307e42e8701b3265c6a..4ad1daf82b4bc3f246ff0ff6611c3e121fb580dd 100644 GIT binary patch delta 8 PcmdnRxQlVZPDW7x4#EP~ delta 8 PcmdnRxQlVZPDWt>4!{D` diff --git a/onnx/backend/test/data/node/test_constant_pad_axes/model.onnx b/onnx/backend/test/data/node/test_constant_pad_axes/model.onnx index 6f6683d1e13ce1a1c70fe752773cc7c10b65789e..3e4b97f4c4a72dbedb96ead83c04cd1ccfea30fd 100644 GIT binary patch delta 8 Pcmcb~c$0C$4MtG_5YYoO delta 8 Pcmcb~c$0C$4Mt%A5YGcK diff --git a/onnx/backend/test/data/node/test_constant_pad_negative_axes/model.onnx b/onnx/backend/test/data/node/test_constant_pad_negative_axes/model.onnx index e3d9a882274a5a04f68e9a2f0e82e16aceb40ec4..752530441e14f920d4f6ce3a91080120089e209d 100644 GIT binary patch delta 8 PcmaFF_=s`BLq<^m5o!ZY delta 8 PcmaFF_=s`BLq=f$5oiNU diff --git a/onnx/backend/test/data/node/test_constantofshape_float_ones/model.onnx b/onnx/backend/test/data/node/test_constantofshape_float_ones/model.onnx index cd8251f84efeeecb5d7d76b2a57a13f4c550f58e..8118b8f978dde49029c78d213bcc3860fe86aa7b 100644 GIT binary patch delta 8 PcmbQkIEQhXVt5npmX8&Bc;fky$Lkuz-b*P_s!CKhR_$ yE-nrZF(DQ%CJxppLAX6aY+NiHj6w`iQfM|Jlrts)^;d%RGeYz`v2Za6hynn#&MJw$N&HU literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/input_2.pb new file mode 100644 index 00000000000..27697b35887 --- /dev/null +++ b/onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/input_2.pb @@ -0,0 +1,2 @@ +*B +zero_point \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_dequantizelinear_int4/test_data_set_0/output_0.pb new 
file mode 100644 index 0000000000000000000000000000000000000000..51f86b3a2f6ae12bf09152bf2383f4a3f546c46f GIT binary patch literal 29 hcmd;J6<~B?tn?CLU|=`^#6aNS$iSd*kbz;sK>#$f1gro6 literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_dequantizelinear_uint4/model.onnx b/onnx/backend/test/data/node/test_dequantizelinear_uint4/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..a03ffb289b2d346b5fa7260aa83bd32acfe648e4 GIT binary patch literal 197 zcmdXVt5npmX8&Bc;fky$Lkuz-cmDNwgb6hF{t zAucWs4pAW%E+!7vC_%VILTp?t9E?H?QBr7DB9t>G0rgjc^)o{BJF##v2#5jz0i-bJ literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_0.pb new file mode 100644 index 00000000000..1b66879459b --- /dev/null +++ b/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_0.pb @@ -0,0 +1 @@ +*§Bx \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..d0d648004296cc360837f2e18c296aba1350b928 GIT binary patch literal 17 YcmWe&bYicFFHTO(N%dl3U|?_n03>Jw$N&HU literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_2.pb new file mode 100644 index 00000000000..8876e08c6db --- /dev/null +++ b/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/input_2.pb @@ -0,0 +1,2 @@ +*B +zero_point \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_dequantizelinear_uint4/test_data_set_0/output_0.pb new file mode 100644 index 
0000000000000000000000000000000000000000..6b70d35771e663f9bd91f030fff6ba22e5001246 GIT binary patch literal 29 gcmd;J6<~B?tn?CLU|=`^#6aNS$iOfGh#xot05tOiYybcN literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_edge_pad/model.onnx b/onnx/backend/test/data/node/test_edge_pad/model.onnx index a0b34c1b53584da7f3aae6291ce7d3a61c2d8e93..334bd2c7ecd1306807e879343f948fdbe5ea1941 100644 GIT binary patch delta 8 PcmbQmIE!(@Oh!=v459*d delta 8 PcmbQmIE!(@Oh#b<44?vZ diff --git a/onnx/backend/test/data/node/test_flatten_axis0/model.onnx b/onnx/backend/test/data/node/test_flatten_axis0/model.onnx index 33946692d4923399c85b96ab2c5c85288b9aba17..0bdd26ed87f7f98f4df148d325f092f51533ded7 100644 GIT binary patch delta 10 Rcmb=aVdCJN$W+NF3IGlp0xkdm delta 10 Rcmb=aVd7w)$W+P53jhuW0we$c diff --git a/onnx/backend/test/data/node/test_flatten_axis1/model.onnx b/onnx/backend/test/data/node/test_flatten_axis1/model.onnx index a983a32152c5d17012d1edb079bb3cf52c63e02c..540edccb5d26cd8742c752ebb61bb476eec83b1f 100644 GIT binary patch delta 10 Rcmb=aVdCJN$W+NF3IGlp0xkdm delta 10 Rcmb=aVd7w)$W+P53jhuW0we$c diff --git a/onnx/backend/test/data/node/test_flatten_axis2/model.onnx b/onnx/backend/test/data/node/test_flatten_axis2/model.onnx index bef332e6f685fd9378733b783e3faecd22b6396c..5ae79a3ef7ec687c4b95de72082ce7703517e01f 100644 GIT binary patch delta 10 Rcmb=aVdCJN$W+NF3IGlp0xkdm delta 10 Rcmb=aVd7w)$W+P53jhuW0we$c diff --git a/onnx/backend/test/data/node/test_flatten_axis3/model.onnx b/onnx/backend/test/data/node/test_flatten_axis3/model.onnx index 355fda6ade8536b8cd606b050c41d809cfdf0c36..5d6205389edadbd570cbd39cef649350e04c615e 100644 GIT binary patch delta 10 Rcmb=aVdCJN$W+NF3IGlp0xkdm delta 10 Rcmb=aVd7w)$W+P53jhuW0we$c diff --git a/onnx/backend/test/data/node/test_flatten_default_axis/model.onnx b/onnx/backend/test/data/node/test_flatten_default_axis/model.onnx index 87104aac186798ba8e0de506deb8053304204386..09db5c1a3b6ff82c102ada9d62962e563e2e4db5 100644 GIT binary patch 
delta 10 RcmXRbW8&bP$W+WI3IGk40vrGU delta 10 RcmXRbW8z?+$W+Y83jhs+0ulfK diff --git a/onnx/backend/test/data/node/test_flatten_negative_axis1/model.onnx b/onnx/backend/test/data/node/test_flatten_negative_axis1/model.onnx index 07589929f6f29a3db56aef8380449d797ef696ec..943e4133df7d8cdc165d2b54e8de63c75fc468c9 100644 GIT binary patch delta 12 TcmeBU>||||||||0Un3IGVC0sjC1 delta 6 NcmXReo>0Un3;+nC0sQ~~ diff --git a/onnx/backend/test/data/node/test_identity_sequence/model.onnx b/onnx/backend/test/data/node/test_identity_sequence/model.onnx index fa0203390719c94312fe30832ec59cedc83028ae..e05b372ad9b38cdb597a4366427628aabfb92132 100644 GIT binary patch delta 6 NcmYdDm=Mn>3IGS10mc9T delta 6 NcmYdDm=Mn>3;+k10mJ|R diff --git a/onnx/backend/test/data/node/test_lrn_default/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_lrn_default/test_data_set_0/output_0.pb index 3a300d378c3a0baea580276e3ef301483a9b8bf7..99db03951491bb67e7da0e49a8a671794042b67b 100644 GIT binary patch delta 39 vcmca2d_{PI3uE0z*UyZMdYdhotQi@9ZZ2hxXJ%xZe3k7sBiH7Q?8;046%q{B delta 39 vcmca2d_{PI3uDbj*UyZMx|=PTtQi@9Y%XPvXJ%xbe3k7sBgf{A?8;046pakf diff --git a/onnx/backend/test/data/node/test_mvn/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_mvn/test_data_set_0/output_0.pb index 406697d1a79..b8725733af0 100644 --- a/onnx/backend/test/data/node/test_mvn/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_mvn/test_data_set_0/output_0.pb @@ -1 +1 @@ -BYJlëd­?á;©>:ÅÅ¿r÷š¿d¿û™>DüÂ>6nQ?çÐ[?:“¿#rc½vyH¿G3U??  
¿½·,?ÿUD?§Pi?à»Ò¿ñ¤o¾²úÍ?ÛÛ>×2¥?§Ï—?ûðm¿­º“=msþÛÔã¿ \ No newline at end of file +BYJlëd­?á;©>:ÅÅ¿r÷š¿d¿ú™>DüÂ>6nQ?çÐ[?:“¿#rc½vyH¿E3U?>  ¿¼·,?ÿUD?§Pi?à»Ò¿ñ¤o¾²úÍ?ÛÛ>Ö2¥?¦Ï—?úðm¿­º“=msþÛÔã¿ \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_mvn_expanded/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_mvn_expanded/test_data_set_0/output_0.pb index 406697d1a79..b8725733af0 100644 --- a/onnx/backend/test/data/node/test_mvn_expanded/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_mvn_expanded/test_data_set_0/output_0.pb @@ -1 +1 @@ -BYJlëd­?á;©>:ÅÅ¿r÷š¿d¿û™>DüÂ>6nQ?çÐ[?:“¿#rc½vyH¿G3U??  ¿½·,?ÿUD?§Pi?à»Ò¿ñ¤o¾²úÍ?ÛÛ>×2¥?§Ï—?ûðm¿­º“=msþÛÔã¿ \ No newline at end of file +BYJlëd­?á;©>:ÅÅ¿r÷š¿d¿ú™>DüÂ>6nQ?çÐ[?:“¿#rc½vyH¿E3U?>  ¿¼·,?ÿUD?§Pi?à»Ò¿ñ¤o¾²úÍ?ÛÛ>Ö2¥?¦Ï—?úðm¿­º“=msþÛÔã¿ \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_mvn_expanded_ver18/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_mvn_expanded_ver18/test_data_set_0/output_0.pb index 406697d1a79..b8725733af0 100644 --- a/onnx/backend/test/data/node/test_mvn_expanded_ver18/test_data_set_0/output_0.pb +++ b/onnx/backend/test/data/node/test_mvn_expanded_ver18/test_data_set_0/output_0.pb @@ -1 +1 @@ -BYJlëd­?á;©>:ÅÅ¿r÷š¿d¿û™>DüÂ>6nQ?çÐ[?:“¿#rc½vyH¿G3U??  
¿½·,?ÿUD?§Pi?à»Ò¿ñ¤o¾²úÍ?ÛÛ>×2¥?§Ï—?ûðm¿­º“=msþÛÔã¿ \ No newline at end of file +BYJlëd­?á;©>:ÅÅ¿r÷š¿d¿ú™>DüÂ>6nQ?çÐ[?:“¿#rc½vyH¿E3U?>  ¿¼·,?ÿUD?§Pi?à»Ò¿ñ¤o¾²úÍ?ÛÛ>Ö2¥?¦Ï—?úðm¿­º“=msþÛÔã¿ \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_quantizelinear_e4m3fn/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_quantizelinear_e4m3fn/test_data_set_0/output_0.pb index f78f6a608bf434d22b007d2183d390b2ddfac759..9bcfde7d4dfdaf8fb4beaa1af4781b11d5a158c3 100644 GIT binary patch literal 14 Vcmd;J6%f>7WiYU)%W-0?1ON;Y0&D;P literal 23 ecmd;J6%f>7WiYU)%W>kWN-fHdFUZf#D**sCA_heO diff --git a/onnx/backend/test/data/node/test_quantizelinear_e5m2/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_quantizelinear_e5m2/test_data_set_0/output_0.pb index 7a01c47dd3d70d1bc178102a86bd5b6fd6acb8e8..a77a10c7a2fce3ea059434dc2d93bd033724c4fa 100644 GIT binary patch literal 14 Vcmd;J6%f{9Ww5ZR3Ugwt1ON;S0$>0D literal 23 ecmd;J6%f{9Ww5ZR3UlJBN-fHdFUZf#D**sBum(8* diff --git a/onnx/backend/test/data/node/test_quantizelinear_int4/model.onnx b/onnx/backend/test/data/node/test_quantizelinear_int4/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..975f53e233f64ebc6413bb2bbe133af5d82d8264 GIT binary patch literal 204 zcmdIsRWxNhA_v8g^NKz6aba~ BFIxZr literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_0.pb b/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..f39a3799ad004741f09ad7a8f8813db1f1c674b7 GIT binary patch literal 59 zcmd;J=3o(EbYiUVGGG7$1&3KPX96*&BLl;SgA5D{fcSs|1A_pN?*POFK->Vt3QhoA C2MhlI literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_1.pb new file mode 100644 index 
0000000000000000000000000000000000000000..c4cda36fa824d2e68cf94638c3e1f7ae30adc205 GIT binary patch literal 27 fcmd;J7GQK@uZ%BFPRvR5;$dK5Z~!78Zg2nqNKFMC literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_2.pb new file mode 100644 index 00000000000..9fe0fc5b846 --- /dev/null +++ b/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/input_2.pb @@ -0,0 +1,2 @@ +*B +zero_point \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/output_0.pb new file mode 100644 index 00000000000..56601395022 --- /dev/null +++ b/onnx/backend/test/data/node/test_quantizelinear_int4/test_data_set_0/output_0.pb @@ -0,0 +1 @@ +*!S¨ÿÿÿÿÿÿÿÿCTuBy \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_quantizelinear_uint4/model.onnx b/onnx/backend/test/data/node/test_quantizelinear_uint4/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..c8ca55934fe54ff7e747f8a4db41f4e5d5e6d921 GIT binary patch literal 205 zcmdAr3Ak4rU-`i4uidB*ewV0hD3^NkvJaSqqXAg~%lV&8Y;NBZ@G`iG_Vt3QhoA C2MhlI literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_1.pb b/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_1.pb new file mode 100644 index 0000000000000000000000000000000000000000..c4cda36fa824d2e68cf94638c3e1f7ae30adc205 GIT binary patch literal 27 fcmd;J7GQK@uZ%BFPRvR5;$dK5Z~!78Zg2nqNKFMC literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_2.pb b/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_2.pb new file mode 100644 index 00000000000..412ac47a2c0 --- /dev/null +++ 
b/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/input_2.pb @@ -0,0 +1,2 @@ +*B +zero_point \ No newline at end of file diff --git a/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/output_0.pb b/onnx/backend/test/data/node/test_quantizelinear_uint4/test_data_set_0/output_0.pb new file mode 100644 index 0000000000000000000000000000000000000000..b257471d216e65f4eadcad49a5c2a572b0abfb8e GIT binary patch literal 18 Zcmd;J=3o&J)nZo+W^fMK%IL&c2>=q80_y+( literal 0 HcmV?d00001 diff --git a/onnx/backend/test/data/node/test_reflect_pad/model.onnx b/onnx/backend/test/data/node/test_reflect_pad/model.onnx index d81c9c098b4c7a00cbb233b56a5f62b4a53abd4c..cdbf9843499b4f0752df5b5e54302480642634e3 100644 GIT binary patch delta 8 PcmZ3$xPWoOd`3|K4G02= delta 8 PcmZ3$xPWoOd`4ja4F&>+ diff --git a/onnx/backend/test/data/node/test_reshape_allowzero_reordered/model.onnx b/onnx/backend/test/data/node/test_reshape_allowzero_reordered/model.onnx index 16684fbcf9c53d6c848b6bcb0a0e394f51d1a9ad..04b5ea84d36d35ec19c210e7ff96654718f580bb 100644 GIT binary patch delta 8 PcmX@Wcz|)jenwFM4=4iY delta 8 PcmX@Wcz|)jenw#c4<-WU diff --git a/onnx/backend/test/data/node/test_reshape_extended_dims/model.onnx b/onnx/backend/test/data/node/test_reshape_extended_dims/model.onnx index 30b5a991178978a5e2ce1a773e9f15e6810bf2be..17130b23bf9e833582a6a4f8dc53deecdb0ec12c 100644 GIT binary patch delta 8 PcmZ3(xQ21UYDQ524b%dx delta 8 PcmZ3(xQ21UYDQrI4blRt diff --git a/onnx/backend/test/data/node/test_reshape_negative_dim/model.onnx b/onnx/backend/test/data/node/test_reshape_negative_dim/model.onnx index 74249bc77208d611a544e7bffad2533b927d3e9e..a1ec81fdb504f73f9e96c0badf17c8de51628928 100644 GIT binary patch delta 8 PcmZ3^xSVmqGDcAV4Sxci delta 8 PcmZ3^xSVmqGDcwl4SfQe diff --git a/onnx/backend/test/data/node/test_reshape_negative_extended_dims/model.onnx b/onnx/backend/test/data/node/test_reshape_negative_extended_dims/model.onnx index 
2d56b193e6abe44fff952577dc2cbca27d644e89..a5e66ec970462cff0a797bcb4c1cf2e909009131 100644 GIT binary patch delta 8 PcmdnWxRr6j7DiD34s8O* delta 8 PcmdnWxRr6j7DizJ4r>C% diff --git a/onnx/backend/test/data/node/test_reshape_one_dim/model.onnx b/onnx/backend/test/data/node/test_reshape_one_dim/model.onnx index 01e96bbfce6c9cf3f93b85825b730956a75ebfe2..d960b3df340dab71ec231084d3e7cf61bba493af 100644 GIT binary patch delta 8 PcmbQmIE!(@Oh!=v459*d delta 8 PcmbQmIE!(@Oh#b<44?vZ diff --git a/onnx/backend/test/data/node/test_reshape_reduced_dims/model.onnx b/onnx/backend/test/data/node/test_reshape_reduced_dims/model.onnx index 8bcb0d029c0a956e2fda7a8b0dc63503e6d47fda..8075221d3cc4d75ef7648ef48fa64045f25678f4 100644 GIT binary patch delta 8 PcmZ3?xR`OmB1TaF4Lbsn delta 8 PcmZ3?xR`OmB1T~V4LJgj diff --git a/onnx/backend/test/data/node/test_reshape_reordered_all_dims/model.onnx b/onnx/backend/test/data/node/test_reshape_reordered_all_dims/model.onnx index b9a38104afd8eff0fa3acffa46a0ed949b9f3388..c85c55c871d178ff724d8a40152bdd7b298cc20f 100644 GIT binary patch delta 8 PcmZ3>xR!Ck8b(n74dnu_ delta 8 PcmZ3>xR!Ck8b)CN4dVi> diff --git a/onnx/backend/test/data/node/test_reshape_reordered_last_dims/model.onnx b/onnx/backend/test/data/node/test_reshape_reordered_last_dims/model.onnx index dc0ff1900292dbfb780a29f989402f35bfb8bf3b..20cfbab2211af91417baf075f937fc738baeeb99 100644 GIT binary patch delta 8 PcmZ3-xQ=ncT1HU-4fX=E delta 8 PcmZ3-xQ=ncT1H_24fF!A diff --git a/onnx/backend/test/data/node/test_reshape_zero_and_negative_dim/model.onnx b/onnx/backend/test/data/node/test_reshape_zero_and_negative_dim/model.onnx index e116649dca87284a35528925a9a42a23f1de85a1..a2a87aa771241fb851d2070fcb54902814b4d120 100644 GIT binary patch delta 8 PcmdnOxP@`TW=2r}4qO7n delta 8 PcmdnOxP@`TW=3HE4q5`j diff --git a/onnx/backend/test/data/node/test_reshape_zero_dim/model.onnx b/onnx/backend/test/data/node/test_reshape_zero_dim/model.onnx index 
31813cf82d7912e19d666ee3e2daed5b3d0d8b4b..624398f5bd8ed691edce38ba6037c596bc388d63 100644 GIT binary patch delta 8 PcmZ3^xSVmqGDcAV4Sxci delta 8 PcmZ3^xSVmqGDcwl4SfQe diff --git a/onnx/backend/test/data/node/test_shape/model.onnx b/onnx/backend/test/data/node/test_shape/model.onnx index c84685d71a8d195331875e0cedbc49e9195347bb..887ac6d0874efef0964b894451bd04705191c3b7 100644 GIT binary patch delta 6 Ncma!zoe;w)3IGRi0lfeK delta 6 Ncma!zoe;w)3;+ji0lNSI diff --git a/onnx/backend/test/data/node/test_shape_clip_end/model.onnx b/onnx/backend/test/data/node/test_shape_clip_end/model.onnx index 4d415219403c653cf7c4f1990a0abd9f38463e92..68b3111d90b16e2c120ef89fd27d166ec41aefe9 100644 GIT binary patch delta 6 NcmXRano!6n3IGU}0sH^} delta 6 NcmXRano!6n3;+m}0r~&{ diff --git a/onnx/backend/test/data/node/test_shape_clip_start/model.onnx b/onnx/backend/test/data/node/test_shape_clip_start/model.onnx index 9d2934b6f5450c372462323a4bf433e04696aa25..68f55c4a3b9835b02d8924db2885c7fa8f7aae48 100644 GIT binary patch delta 6 Ncmb=gpHRmr3IGXA0wVwb delta 6 Ncmb=gpHRmr3;+pA0wDkZ diff --git a/onnx/backend/test/data/node/test_shape_end_1/model.onnx b/onnx/backend/test/data/node/test_shape_end_1/model.onnx index 1a5828f5534ddc31cfd7a4d6642e3cc3438d5533..51ea945c526ae057807c599815ec8cb3a79dca47 100644 GIT binary patch delta 6 Ncmd1LpOD8W3IGUf0rLO= delta 6 Ncmd1LpOD8W3;+mf0r3C; diff --git a/onnx/backend/test/data/node/test_shape_end_negative_1/model.onnx b/onnx/backend/test/data/node/test_shape_end_negative_1/model.onnx index 36f445c8a107b36cfdcbaf298602cc2bdf3fda57..bc03f04e1a41821ee1f17fd5a2ef39fdc5b7dc41 100644 GIT binary patch delta 8 PcmZo0Un3IGVC0sjC1 delta 6 NcmXReo>0Un3;+nC0sQ~~ diff --git a/onnx/backend/test/data/node/test_shape_start_1_end_2/model.onnx b/onnx/backend/test/data/node/test_shape_start_1_end_2/model.onnx index 1bdfa713ef450c09763ada0eec8c7f1a29aabd98..c3769fefc16af1c8935282d35d25e3362785b756 100644 GIT binary patch delta 8 PcmZo=Y-OC#!YB#=3q1li delta 8 
PcmZo=Y-OC#!YB*?3p)Ze diff --git a/onnx/backend/test/data/node/test_shape_start_1_end_negative_1/model.onnx b/onnx/backend/test/data/node/test_shape_start_1_end_negative_1/model.onnx index 196b6fe0b098265d07d1b29a2d4ddaece40cfe44..b65b12fccc9d985e2afcc50837a74f0bd05fd3c6 100644 GIT binary patch delta 8 PcmbQvIGu6AG)7SX3~vH$ delta 8 PcmbQvIGu6AG)7?n3~d5y diff --git a/onnx/backend/test/data/node/test_shape_start_negative_1/model.onnx b/onnx/backend/test/data/node/test_shape_start_negative_1/model.onnx index 9b92180609413b266b40959a700760de56b4d7a9..9da989aea7bc3d0b7160a502cf01c7e86580753d 100644 GIT binary patch delta 8 PcmZo=Y-OC#!YB#=3q1li delta 8 PcmZo=Y-OC#!YB*?3p)Ze diff --git a/onnx/backend/test/data/node/test_size/model.onnx b/onnx/backend/test/data/node/test_size/model.onnx index 215968302d2e504c8c61457b247c0b32fd276961..710a1b07a8dbb83e78e461c73527d8944f7d92d7 100644 GIT binary patch delta 6 NcmWF!pAg0<3IGQj0jmH2 delta 6 NcmWF!pAg0<3;+ij0jU50 diff --git a/onnx/backend/test/data/node/test_size_example/model.onnx b/onnx/backend/test/data/node/test_size_example/model.onnx index 1cff6f9befeceeef46e3e5699ca200daf66db920..76be00a1f7d89e2087cb58928aea6968e81edf22 100644 GIT binary patch delta 6 Ncma!!o)E<-3IGRG0k;4E delta 6 Ncma!!o)E<-3;+jG0kr@C diff --git a/onnx/backend/test/data/node/test_squeeze/model.onnx b/onnx/backend/test/data/node/test_squeeze/model.onnx index 690e8bb38da70d491c69cdadbbffe0999f78f686..24b292cda8e98b76dfeb4f46172c17d5182a9b3f 100644 GIT binary patch delta 12 TcmZo?Y-eQR;GD?R!YB#=5V!(S delta 12 TcmZo?Y-eQRV4uj;!pI8%5TpV} diff --git a/onnx/backend/test/data/node/test_squeeze_negative_axes/model.onnx b/onnx/backend/test/data/node/test_squeeze_negative_axes/model.onnx index 08c0c320b2d249dae85e2d2e5c0a992675f77064..68ddeceb2ba7ff4a998bc276ee8db31cc0e3229b 100644 GIT binary patch delta 12 TcmbQrIF*r!gL5L&Bt}sH5*7k` delta 12 TcmbQrIF*r!gMA{?Bt~8U5&{Bo diff --git a/onnx/backend/test/data/node/test_transpose_all_permutations_0/model.onnx 
b/onnx/backend/test/data/node/test_transpose_all_permutations_0/model.onnx index 5b917e3bb2398061cf3cb499ec1bd27265f9aadc..ee1996e2aaf999372b6fcfde18f26b0c8157770a 100644 GIT binary patch delta 12 TcmZ3*xQda9gL5L&az;@A6e9w< delta 12 TcmZ3*xQda9gMA{?az}6!);GD?R#V85@5l#YQ delta 12 TcmeBW>}6!)V4uj;#mEZ)5jp}{ diff --git a/onnx/backend/test/data/node/test_unsqueeze_axis_0/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_axis_0/model.onnx index fce8843340a6a2edbc415a817d662c2e0b9f32d9..491dfed9787e66d47b598ca9927cfc4298adc336 100644 GIT binary patch delta 12 TcmbQlIEj&ogL5KNKcgrB5z7K{ delta 12 TcmbQlIEj&ogMA`XKO-*y5w`+p diff --git a/onnx/backend/test/data/node/test_unsqueeze_axis_1/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_axis_1/model.onnx index 5812558243d5e0d437248a59697c00e7a35c8702..c6a7f5f5767c338001fe7f74a46906415abffe55 100644 GIT binary patch delta 12 TcmbQlIEj&ogL5KNKcgrB5z7K{ delta 12 TcmbQlIEj&ogMA`XKO-*y5w`+p diff --git a/onnx/backend/test/data/node/test_unsqueeze_axis_2/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_axis_2/model.onnx index 33d20cf4c72422585e9495a541c8140d533b4297..d48805b577e46a698b0bb1a5bbd8cf450dfa601b 100644 GIT binary patch delta 12 TcmbQlIEj&ogL5KNKcgrB5z7K{ delta 12 TcmbQlIEj&ogMA`XKO-*y5w`+p diff --git a/onnx/backend/test/data/node/test_unsqueeze_negative_axes/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_negative_axes/model.onnx index b09f253a6ae276c300e7a5772731eee373e924c4..e120c4c017bf83e3df2eb916dbc1a1245c0c30b0 100644 GIT binary patch delta 12 TcmZ3)xQLO7gL5L&d`3|K6I%kJ delta 12 TcmZ3)xQLO7gMA{?d`4aX6GsA= diff --git a/onnx/backend/test/data/node/test_unsqueeze_three_axes/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_three_axes/model.onnx index b2c00927df9a22807ecb438d195d483863c3e148..0e61ba73dbda0755852d1f966b6f97fc0095d4d3 100644 GIT binary patch delta 12 TcmbQwIG>S;gL5L&97a(96A%KK delta 12 TcmbQwIG>S;gMA{?97bLM68r*> diff --git 
a/onnx/backend/test/data/node/test_unsqueeze_two_axes/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_two_axes/model.onnx index 7d975a149119dd789bd5a4293aee130b6a51201d..c669733120cc3bc23af9abc96d572c9b301c6a79 100644 GIT binary patch delta 12 TcmbQqIFpfygL5L&G)7SX5_$rM delta 12 TcmbQqIFpfygMA{?G)7(k5@rH@ diff --git a/onnx/backend/test/data/node/test_unsqueeze_unsorted_axes/model.onnx b/onnx/backend/test/data/node/test_unsqueeze_unsorted_axes/model.onnx index 65623f875c0e486dffb9354db2ffe9cf87ad3088..0643b22f20e4916b4a2c3833b115efb24be652ed 100644 GIT binary patch delta 12 TcmZ3)xQLO7gL5L&d`3|K6I%kJ delta 12 TcmZ3)xQLO7gMA{?d`4aX6GsA= diff --git a/onnx/backend/test/data/node/test_wrap_pad/model.onnx b/onnx/backend/test/data/node/test_wrap_pad/model.onnx index d666209ad2637f489be567fec2a41987d958ff21..84829b22648d723dddd5434e3aa52d673dad50ab 100644 GIT binary patch delta 8 PcmbQmIE!(@Oh!=v459*d delta 8 PcmbQmIE!(@Oh#b<44?vZ diff --git a/onnx/checker.cc b/onnx/checker.cc index d39b291fb97..e2736e365a1 100644 --- a/onnx/checker.cc +++ b/onnx/checker.cc @@ -265,6 +265,8 @@ void check_tensor(const TensorProto& tensor, const CheckerContext& ctx) { case TensorProto::FLOAT8E4M3FNUZ: case TensorProto::FLOAT8E5M2: case TensorProto::FLOAT8E5M2FNUZ: + case TensorProto::UINT4: + case TensorProto::INT4: check_field(int32_data); break; diff --git a/onnx/defs/controlflow/defs.cc b/onnx/defs/controlflow/defs.cc index f6bd6d64c7a..4477331a886 100644 --- a/onnx/defs/controlflow/defs.cc +++ b/onnx/defs/controlflow/defs.cc @@ -12,10 +12,10 @@ namespace ONNX_NAMESPACE { using SupportType = OpSchema::SupportType; -static std::vector control_flow_types_ir9() { - auto t = OpSchema::all_tensor_types_ir9(); - auto s = OpSchema::all_tensor_sequence_types_ir9(); - auto o = OpSchema::all_optional_types_ir9(); +static std::vector control_flow_types_ir10() { + auto t = OpSchema::all_tensor_types_ir10(); + auto s = OpSchema::all_tensor_sequence_types_ir10(); + auto o = 
OpSchema::all_optional_types_ir10(); t.insert(t.end(), s.begin(), s.end()); t.insert(t.end(), o.begin(), o.end()); return t; @@ -23,7 +23,7 @@ static std::vector control_flow_types_ir9() { ONNX_OPERATOR_SET_SCHEMA( If, - 19, + 21, OpSchema() .SetDoc("If conditional") .Input(0, "cond", "Condition for the if. The tensor must contain a single element.", "B") @@ -63,12 +63,12 @@ ONNX_OPERATOR_SET_SCHEMA( AttributeProto::GRAPH) .TypeConstraint( "V", - control_flow_types_ir9(), - "All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv9.") + control_flow_types_ir10(), + "All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv10.") .TypeConstraint("B", {"tensor(bool)"}, "Only bool") .TypeAndShapeInferenceFunction(IfInferenceFunction)); -static const char* Loop_ver19_doc = R"DOC( +static const char* Loop_ver16_doc = R"DOC( Generic Looping construct. This loop has multiple termination conditions: 1) Trip count. Iteration count specified at runtime. 
Set by @@ -208,9 +208,9 @@ The input/output of subgraph (produced by loop node) matching is based on order ONNX_OPERATOR_SET_SCHEMA( Loop, - 19, + 21, OpSchema() - .SetDoc(Loop_ver19_doc) + .SetDoc(Loop_ver16_doc) .Input( 0, "M", @@ -253,13 +253,13 @@ ONNX_OPERATOR_SET_SCHEMA( AttributeProto::GRAPH) .TypeConstraint( "V", - control_flow_types_ir9(), - "All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv9.") + control_flow_types_ir10(), + "All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv10.") .TypeConstraint("I", {"tensor(int64)"}, "tensor of int64, which should be a scalar.") .TypeConstraint("B", {"tensor(bool)"}, "tensor of bool, which should be a scalar.") .TypeAndShapeInferenceFunction(LoopInferenceFunction)); -static const char* scan_19_doc = R"DOC( +static const char* scan_16_doc = R"DOC( Scan can be used to iterate over one or more scan_input tensors, constructing zero or more scan_output tensors. 
It combines ideas from general recurrences, functional programming constructs such as scan, fold, map, and zip, and is intended to enable @@ -385,9 +385,9 @@ values are computed in the outer graph, they need to be passed in as extra state ONNX_OPERATOR_SET_SCHEMA( Scan, - 19, + 21, OpSchema() - .SetDoc(scan_19_doc) + .SetDoc(scan_16_doc) .Input( 0, "initial_state_and_scan_inputs", @@ -448,7 +448,7 @@ ONNX_OPERATOR_SET_SCHEMA( "range is [-r, r-1].", AttributeProto::INTS, false) - .TypeConstraint("V", OpSchema::all_tensor_types_ir9(), "All Tensor types up to IRv9.") + .TypeConstraint("V", OpSchema::all_tensor_types_ir10(), "All Tensor types up to IRv10.") .TypeAndShapeInferenceFunction(ScanInferenceFunction)); // Shares same shape inference as opset 11 } // namespace ONNX_NAMESPACE diff --git a/onnx/defs/controlflow/old.cc b/onnx/defs/controlflow/old.cc index 2ade6bb8cb6..079cbc10e84 100644 --- a/onnx/defs/controlflow/old.cc +++ b/onnx/defs/controlflow/old.cc @@ -11,6 +11,15 @@ namespace ONNX_NAMESPACE { using SupportType = OpSchema::SupportType; +static std::vector control_flow_types_ir9() { + auto t = OpSchema::all_tensor_types_ir9(); + auto s = OpSchema::all_tensor_sequence_types_ir9(); + auto o = OpSchema::all_optional_types_ir9(); + t.insert(t.end(), s.begin(), s.end()); + t.insert(t.end(), o.begin(), o.end()); + return t; +} + static std::vector control_flow_types_ir4() { auto t = OpSchema::all_tensor_types_ir4(); auto s = OpSchema::all_tensor_sequence_types_ir4(); @@ -20,6 +29,53 @@ static std::vector control_flow_types_ir4() { return t; } +ONNX_OPERATOR_SET_SCHEMA( + If, + 19, + OpSchema() + .SetDoc("If conditional") + .Input(0, "cond", "Condition for the if. The tensor must contain a single element.", "B") + .Output( + 0, + "outputs", + "Values that are live-out to the enclosing scope. The return values in " + "the `then_branch` and `else_branch` must be of the same data type. 
" + "The `then_branch` and `else_branch` may produce tensors with the same " + "element type and different shapes. " + "If corresponding outputs from the then-branch and the else-branch have " + "static shapes S1 and S2, then the shape of the corresponding output " + "variable of the if-node (if present) must be compatible with both S1 " + "and S2 as it represents the union of both possible shapes." + "For example, if in a model file, the first " + "output of `then_branch` is typed float tensor with shape [2] and the " + "first output of `else_branch` is another float tensor with shape [3], " + "If's first output should have (a) no shape set, or (b) " + "a shape of rank 1 with neither `dim_value` nor `dim_param` set, or (c) " + "a shape of rank 1 with a unique `dim_param`. " + "In contrast, the first output cannot have the shape [2] since [2] and " + "[3] are not compatible.", + "V", + OpSchema::Variadic, + false) + .Attr( + "then_branch", + "Graph to run if condition is true. Has N outputs: values you wish to " + "be live-out to the enclosing scope. The number of outputs must match" + " the number of outputs in the else_branch.", + AttributeProto::GRAPH) + .Attr( + "else_branch", + "Graph to run if condition is false. Has N outputs: values you wish to" + " be live-out to the enclosing scope. The number of outputs must match" + " the number of outputs in the then_branch.", + AttributeProto::GRAPH) + .TypeConstraint( + "V", + control_flow_types_ir9(), + "All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv9.") + .TypeConstraint("B", {"tensor(bool)"}, "Only bool") + .TypeAndShapeInferenceFunction(IfInferenceFunction)); + ONNX_OPERATOR_SET_SCHEMA( If, 16, @@ -205,6 +261,59 @@ point-wise operators (e.g. dropout, residual connections, linear layer). The input/output of subgraph (produced by loop node) matching is based on order instead of name. The implementation will figure out the names based on this order. 
)DOC"; +ONNX_OPERATOR_SET_SCHEMA( + Loop, + 19, + OpSchema() + .SetDoc(Loop_ver16_doc) + .Input( + 0, + "M", + "A maximum trip-count for the loop specified at runtime. Optional." + " Pass empty string to skip.", + "I", + OpSchema::Optional) + .Input( + 1, + "cond", + "A boolean termination condition. Optional. Pass empty string to skip.", + "B", + OpSchema::Optional) + .Input( + 2, + "v_initial", + "The initial values of any loop-carried dependencies (values that " + "change across loop iterations)", + "V", + OpSchema::Variadic, + false, + 0) + .Output( + 0, + "v_final_and_scan_outputs", + "Final N loop carried dependency values then K scan_outputs. " + "Scan outputs must be Tensors.", + "V", + OpSchema::Variadic, + false) + .Attr( + "body", + "The graph run each iteration. It has 2+N inputs: (iteration_num, " + "condition, loop carried dependencies...). It has 1+N+K outputs: " + "(condition, loop carried dependencies..., scan_outputs...). Each " + "scan_output is created by concatenating the value of the specified " + "output value at the end of each iteration of the loop. 
It is an error" + " if the dimensions or data type of these scan_outputs change across loop" + " iterations.", + AttributeProto::GRAPH) + .TypeConstraint( + "V", + control_flow_types_ir9(), + "All Tensor, Sequence(Tensor), Optional(Tensor), and Optional(Sequence(Tensor)) types up to IRv9.") + .TypeConstraint("I", {"tensor(int64)"}, "tensor of int64, which should be a scalar.") + .TypeConstraint("B", {"tensor(bool)"}, "tensor of bool, which should be a scalar.") + .TypeAndShapeInferenceFunction(LoopInferenceFunction)); + ONNX_OPERATOR_SET_SCHEMA( Loop, 16, @@ -382,6 +491,74 @@ values are computed in the outer graph, they need to be passed in as extra state )DOC"; +ONNX_OPERATOR_SET_SCHEMA( + Scan, + 19, + OpSchema() + .SetDoc(scan_16_doc) + .Input( + 0, + "initial_state_and_scan_inputs", + "Initial values of the loop's N state variables followed by M scan_inputs", + "V", + OpSchema::Variadic, + false) + .Output( + 0, + "final_state_and_scan_outputs", + "Final values of the loop's N state variables followed by K scan_outputs", + "V", + OpSchema::Variadic, + false) + .Attr( + "body", + "The graph run each iteration. It has N+M inputs: " + "(loop state variables..., scan_input_elts...). It has N+K outputs: " + "(loop state variables..., scan_output_elts...). Each " + "scan_output is created by concatenating the value of the specified " + "scan_output_elt value at the end of each iteration of the loop. It is an error" + " if the dimensions of these values change across loop iterations.", + AttributeProto::GRAPH, + true) + .Attr("num_scan_inputs", "An attribute specifying the number of scan_inputs M. ", AttributeProto::INT, true) + .Attr( + "scan_input_directions", + "An optional list of M flags. The i-th element of the list specifies the direction " + "to be scanned for the i-th scan_input tensor: 0 indicates forward direction and 1 " + "indicates reverse direction. 
" + "If omitted, all scan_input tensors will be scanned in the forward direction.", + AttributeProto::INTS, + false) + .Attr( + "scan_output_directions", + "An optional list of K flags, one for each scan_output. The i-th element of the list " + "specifies whether the i-th scan_output should be constructed by appending or " + "prepending a new value in each iteration: 0 indicates appending and 1 " + "indicates prepending. " + "If omitted, all scan_output tensors will be produced by appending a value " + "in each iteration.", + AttributeProto::INTS, + false) + .Attr( + "scan_input_axes", + "An optional list of M flags. The i-th element of the list specifies the axis " + "to be scanned (the sequence axis) for the i-th scan_input. If omitted, 0 will " + "be used as the scan axis for every scan_input. Negative value for an axis means " + "counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(input).", + AttributeProto::INTS, + false) + .Attr( + "scan_output_axes", + "An optional list of K flags. The i-th element of the list specifies the axis " + "for the i-th scan_output. The scan outputs are accumulated along the specified " + "axis. If omitted, 0 will be used as the scan axis for every scan_output. " + "Negative value for an axis means counting dimensions from the back. 
Accepted " + "range is [-r, r-1].", + AttributeProto::INTS, + false) + .TypeConstraint("V", OpSchema::all_tensor_types_ir9(), "All Tensor types up to IRv9.") + .TypeAndShapeInferenceFunction(ScanInferenceFunction)); // Shares same shape inference as opset 11 + ONNX_OPERATOR_SET_SCHEMA( Scan, 16, diff --git a/onnx/defs/data_type_utils.cc b/onnx/defs/data_type_utils.cc index fb03bfb5652..f902be8ccf6 100644 --- a/onnx/defs/data_type_utils.cc +++ b/onnx/defs/data_type_utils.cc @@ -440,6 +440,8 @@ TypesWrapper::TypesWrapper() { type_str_to_tensor_data_type_["float8e4m3fnuz"] = TensorProto_DataType_FLOAT8E4M3FNUZ; type_str_to_tensor_data_type_["float8e5m2"] = TensorProto_DataType_FLOAT8E5M2; type_str_to_tensor_data_type_["float8e5m2fnuz"] = TensorProto_DataType_FLOAT8E5M2FNUZ; + type_str_to_tensor_data_type_["uint4"] = TensorProto_DataType_UINT4; + type_str_to_tensor_data_type_["int4"] = TensorProto_DataType_INT4; for (auto& str_type_pair : type_str_to_tensor_data_type_) { tensor_data_type_to_type_str_[str_type_pair.second] = str_type_pair.first; diff --git a/onnx/defs/generator/defs.cc b/onnx/defs/generator/defs.cc index 34e378ee17c..e25c62e1272 100644 --- a/onnx/defs/generator/defs.cc +++ b/onnx/defs/generator/defs.cc @@ -17,7 +17,7 @@ or value_* must be specified. 
ONNX_OPERATOR_SET_SCHEMA( Constant, - 19, + 21, OpSchema() .SetDoc(Constant_ver19_doc) .Attr("value", "The value for the elements of the output tensor.", AttributeProto::TENSOR, false) @@ -57,7 +57,7 @@ ONNX_OPERATOR_SET_SCHEMA( AttributeProto::STRINGS, false) .Output(0, "output", "Output tensor containing the same value of the provided tensor.", "T") - .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Constrain input and output types to all tensor types.") + .TypeConstraint("T", OpSchema::all_tensor_types_ir10(), "Constrain input and output types to all tensor types.") .TypeAndShapeInferenceFunction(ConstantOpInference)); static const char* ConstantOfShape_ver20_doc = R"DOC( @@ -66,7 +66,7 @@ Generate a tensor with given value and shape. ONNX_OPERATOR_SET_SCHEMA( ConstantOfShape, - 20, + 21, OpSchema() .SetDoc(ConstantOfShape_ver20_doc) .Attr( @@ -90,26 +90,7 @@ ONNX_OPERATOR_SET_SCHEMA( "defaults to float32.", "T2") .TypeConstraint("T1", {"tensor(int64)"}, "Constrain input types.") - .TypeConstraint( - "T2", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)", - "tensor(bfloat16)", - "tensor(float8e4m3fn)", - "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, - "Constrain output types to be numerics.") + .TypeConstraint("T2", OpSchema::all_numeric_types_ir10(), "Constrain output types to be numerics.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { if (ctx.getAttribute("value") != nullptr) { propagateElemTypeFromDtypeToOutput(ctx, ctx.getAttribute("value"), 0); diff --git a/onnx/defs/generator/old.cc b/onnx/defs/generator/old.cc index d7c381f5bd5..a0cf5b0a22c 100644 --- a/onnx/defs/generator/old.cc +++ b/onnx/defs/generator/old.cc @@ -9,6 +9,57 @@ #include "onnx/defs/schema.h" namespace ONNX_NAMESPACE { + +static const char* 
Constant_ver19_doc = R"DOC( +This operator produces a constant tensor. Exactly one of the provided attributes, either value, sparse_value, +or value_* must be specified. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Constant, + 19, + OpSchema() + .SetDoc(Constant_ver19_doc) + .Attr("value", "The value for the elements of the output tensor.", AttributeProto::TENSOR, false) + .Attr( + "sparse_value", + "The value for the elements of the output tensor in sparse format.", + AttributeProto::SPARSE_TENSOR, + false) + .Attr( + "value_int", + "The value for the sole element for the scalar, int64, output tensor.", + AttributeProto::INT, + false) + .Attr( + "value_ints", + "The values for the elements for the 1D, int64, output tensor.", + AttributeProto::INTS, + false) + .Attr( + "value_float", + "The value for the sole element for the scalar, float32, output tensor.", + AttributeProto::FLOAT, + false) + .Attr( + "value_floats", + "The values for the elements for the 1D, float32, output tensor.", + AttributeProto::FLOATS, + false) + .Attr( + "value_string", + "The value for the sole element for the scalar, UTF-8 string, output tensor.", + AttributeProto::STRING, + false) + .Attr( + "value_strings", + "The values for the elements for the 1D, UTF-8 string, output tensor.", + AttributeProto::STRINGS, + false) + .Output(0, "output", "Output tensor containing the same value of the provided tensor.", "T") + .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Constrain input and output types to all tensor types.") + .TypeAndShapeInferenceFunction(ConstantOpInference)); + static const char* Constant_ver13_doc = R"DOC( This operator produces a constant tensor. Exactly one of the provided attributes, either value, sparse_value, or value_* must be specified. 
@@ -202,6 +253,70 @@ ONNX_OPERATOR_SET_SCHEMA( "One of the attributes 'value' or 'sparse_value' must be specified for a Constant node."); })); +static const char* ConstantOfShape_ver20_doc = R"DOC( +Generate a tensor with given value and shape. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + ConstantOfShape, + 20, + OpSchema() + .SetDoc(ConstantOfShape_ver20_doc) + .Attr( + "value", + "(Optional) The value of the output elements." + "Should be a one-element tensor. If not specified, it defaults to a tensor of value 0 and datatype float32", + AttributeProto::TENSOR, + OPTIONAL_VALUE) + .Input( + 0, + "input", + "1D tensor. The shape of the expected output tensor. If empty tensor is given, the output would be a scalar." + " All values must be >= 0.", + "T1") + .Output( + 0, + "output", + "Output tensor of shape specified by 'input'." + "If attribute 'value' is specified, the value and datatype of the output tensor is taken from 'value'." + "If attribute 'value' is not specified, the value in the output defaults to 0, and the datatype " + "defaults to float32.", + "T2") + .TypeConstraint("T1", {"tensor(int64)"}, "Constrain input types.") + .TypeConstraint( + "T2", + {"tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(bool)", + "tensor(bfloat16)", + "tensor(float8e4m3fn)", + "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", + "tensor(float8e5m2fnuz)"}, + "Constrain output types to be numerics.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + if (ctx.getAttribute("value") != nullptr) { + propagateElemTypeFromDtypeToOutput(ctx, ctx.getAttribute("value"), 0); + } else { + propagateElemTypeFromDtypeToOutput(ctx, TensorProto::FLOAT, 0); + } + + bool found = false; + TensorShapeProto output_shape = getShapeInput(ctx, 0, found); + if (found) { + 
*ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape() = output_shape; + } + })); + static const char* ConstantOfShape_ver9_doc = R"DOC( Generate a tensor with given value and shape. )DOC"; diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc index 45485175149..1f8bb2c6bc2 100644 --- a/onnx/defs/nn/defs.cc +++ b/onnx/defs/nn/defs.cc @@ -1997,7 +1997,7 @@ ONNX_OPERATOR_SET_SCHEMA( )ONNX", 18)); -static const char* Flatten_ver13_doc = R"DOC( +static const char* Flatten_ver11_doc = R"DOC( Flattens the input tensor into a 2D matrix. If input tensor has shape (d_0, d_1, ... d_n) then the output will have shape (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn). @@ -2005,9 +2005,9 @@ Flattens the input tensor into a 2D matrix. If input tensor has shape ONNX_OPERATOR_SET_SCHEMA( Flatten, - 13, + 21, OpSchema() - .SetDoc(Flatten_ver13_doc) + .SetDoc(Flatten_ver11_doc) .Input(0, "input", "A tensor of rank >= axis.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output( 0, @@ -2021,7 +2021,10 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output to all tensor types.") + .TypeConstraint( + "T", + OpSchema::all_tensor_types_ir10(), + "Constrain input and output to all tensor types up to IRv10.") .Attr( "axis", "Indicate up to which input dimensions " diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc index cc661e744ee..5c1c16c1b2b 100644 --- a/onnx/defs/nn/old.cc +++ b/onnx/defs/nn/old.cc @@ -96,6 +96,52 @@ Flattens the input tensor into a 2D matrix. If input tensor has shape (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn). 
)DOC";

+ONNX_OPERATOR_SET_SCHEMA(
+    Flatten,
+    13,
+    OpSchema()
+        .SetDoc(Flatten_ver11_doc)
+        .Input(0, "input", "A tensor of rank >= axis.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
+        .Output(
+            0,
+            "output",
+            "A 2D tensor with the contents of the input tensor, "
+            "with input dimensions up to axis flattened to the outer dimension "
+            "of the output and remaining input dimensions flattened into the inner "
+            "dimension of the output.",
+            "T",
+            OpSchema::Single,
+            true,
+            1,
+            OpSchema::Differentiable)
+        .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output to all tensor types.")
+        .Attr(
+            "axis",
+            "Indicate up to which input dimensions "
+            "(exclusive) should be flattened to the outer dimension of the output. "
+            "The value for axis must be in the range [-r, r], where r is the rank of the input tensor. "
+            "Negative value means counting dimensions from the back. "
+            "When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), "
+            "where the shape of the input tensor is (d_0, d_1, ... d_n). ",
+            AttributeProto::INT,
+            static_cast<int64_t>(1))
+        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
+          propagateElemTypeFromInputToOutput(ctx, 0, 0);
+          if (!hasInputShape(ctx, 0))
+            return;
+          auto& input_shape = getInputShape(ctx, 0);
+          int rank = static_cast<int>(input_shape.dim_size());
+          int axis = static_cast<int>(getAttribute(ctx, "axis", 1));
+          if (axis < 0) {
+            axis += rank;
+          }
+          if (axis > rank || axis < 0) {
+            fail_shape_inference("Invalid value(", axis, ") for attribute 'axis'");
+          }
+          // TODO: is the operation defined for input-rank < 2?
+          updateOutputShape(ctx, 0, {multiplyDims(input_shape, 0, axis), multiplyDims(input_shape, axis, rank)});
+        }));
+
 ONNX_OPERATOR_SET_SCHEMA(
     Flatten,
     11,
diff --git a/onnx/defs/operator_sets.h b/onnx/defs/operator_sets.h
index 9d489c690f0..9a37b8d0b84 100644
--- a/onnx/defs/operator_sets.h
+++ b/onnx/defs/operator_sets.h
@@ -1137,17 +1137,49 @@ class OpSet_Onnx_ver20 {
 };
 
 // Forward declarations for ai.onnx version 21
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Cast);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, CastLike);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Constant);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, ConstantOfShape);
 class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, DequantizeLinear);
-class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, QuantizeLinear);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Flatten);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Identity);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, If);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Loop);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Pad);
 class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, QLinearMatMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, QuantizeLinear);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Reshape);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Scan);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Shape);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Size);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Squeeze);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Transpose);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Unsqueeze);
 
 // Iterate over schema from ai.onnx version 21
 class OpSet_Onnx_ver21 {
  public:
  static void ForEachSchema(std::function<void(OpSchema&&)> fn) {
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Cast)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, CastLike)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Constant)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, ConstantOfShape)>());
     fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, DequantizeLinear)>());
-    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, QuantizeLinear)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Flatten)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Identity)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, If)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Loop)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Pad)>());
     fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, QLinearMatMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, QuantizeLinear)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Reshape)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Scan)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Shape)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Size)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Squeeze)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Transpose)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 21, Unsqueeze)>());
   }
 };
 
diff --git a/onnx/defs/quantization/defs.cc b/onnx/defs/quantization/defs.cc
index 33e1f5099af..ad1053eee6f 100644
--- a/onnx/defs/quantization/defs.cc
+++ b/onnx/defs/quantization/defs.cc
@@ -11,7 +11,8 @@ static const char* QuantizeLinear_ver21_doc = R"DOC(
 The linear quantization operator. It consumes a high precision tensor, a scale, and a zero point to compute the
 low precision / quantized tensor. The scale factor and zero point must have same shape, and can be either a scalar
 for per-tensor / per layer quantization, or a 1-D tensor for per-axis quantization. The quantization formula is `y = saturate ((x / y_scale) + y_zero_point)`.
-For saturation, it saturates to [0, 255] if it's uint8, [-128, 127] if it's int8, [0, 65535] if it's uint16, or [-32768, 32767] if it's int16.
+For saturation, it saturates according to:
+uint8: [0, 255], int8: [-128, 127], uint16: [0, 65535], int16: [-32768, 32767], uint4: [0, 15], int4: [-8, 7]
 For (x / y_scale), it's rounding to the nearest even. Refer to https://en.wikipedia.org/wiki/Rounding for details.
 'y_zero_point' and 'y' must have same type.
'y_zero_point' is usually not used for quantization to float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz, @@ -64,7 +65,9 @@ ONNX_OPERATOR_SET_SCHEMA( "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, + "tensor(float8e5m2fnuz)", + "tensor(uint4)", + "tensor(int4)"}, "The type of the input 'y_zero_point' and the output 'y'.") .SetDoc(QuantizeLinear_ver21_doc) .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { @@ -125,7 +128,9 @@ ONNX_OPERATOR_SET_SCHEMA( "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, + "tensor(float8e5m2fnuz)", + "tensor(uint4)", + "tensor(int4)"}, "The type of the inputs 'x_zero_point' and 'x'.") .TypeConstraint( "T2", diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h index 218e60f42e6..05bee7e5de9 100644 --- a/onnx/defs/schema.h +++ b/onnx/defs/schema.h @@ -583,6 +583,10 @@ class OpSchema final { // Convenience members for types // All high-precision numeric types. 
+  static const std::vector<std::string>& numeric_types_for_math_reduction_ir10() {
+    return numeric_types_for_math_reduction_ir9();
+  }
+
   static const std::vector<std::string>& numeric_types_for_math_reduction_ir9() {
     static const std::vector<std::string> numeric_types_for_math_reduction_ir9 = {
         "tensor(uint32)",
@@ -625,6 +629,29 @@ class OpSchema final {
     return numeric_types_for_math_reduction;
   }
 
+  static const std::vector<std::string>& all_numeric_types_ir10() {
+    static const std::vector<std::string> all_numeric_types_ir10 = {
+        "tensor(uint8)",
+        "tensor(uint16)",
+        "tensor(uint32)",
+        "tensor(uint64)",
+        "tensor(int8)",
+        "tensor(int16)",
+        "tensor(int32)",
+        "tensor(int64)",
+        "tensor(float16)",
+        "tensor(float)",
+        "tensor(double)",
+        "tensor(bfloat16)",
+        "tensor(float8e4m3fn)",
+        "tensor(float8e4m3fnuz)",
+        "tensor(float8e5m2)",
+        "tensor(float8e5m2fnuz)",
+        "tensor(uint4)",
+        "tensor(int4)"};
+    return all_numeric_types_ir10;
+  }
+
   static const std::vector<std::string>& all_numeric_types_ir9() {
     static const std::vector<std::string> all_numeric_types_ir9 = {
         "tensor(uint8)",
@@ -755,6 +782,10 @@ class OpSchema final {
     return all_float_types_ir9;
   }
 
+  static const std::vector<std::string>& all_float_types_ir10() {
+    return all_float_types_ir9();
+  }
+
   static const std::vector<std::string>& all_tensor_types_ir9() {
     static const std::vector<std::string> all_tensor_types_ir9 = {
         "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)",
@@ -765,6 +796,19 @@ class OpSchema final {
     return all_tensor_types_ir9;
   }
 
+  static const std::vector<std::string>& all_tensor_types_ir10() {
+    static const std::vector<std::string> all_tensor_types_ir10 = {
+        "tensor(uint8)",      "tensor(uint16)",         "tensor(uint32)",
+        "tensor(uint64)",     "tensor(int8)",           "tensor(int16)",
+        "tensor(int32)",      "tensor(int64)",          "tensor(bfloat16)",
+        "tensor(float16)",    "tensor(float)",          "tensor(double)",
+        "tensor(string)",     "tensor(bool)",           "tensor(complex64)",
+        "tensor(complex128)", "tensor(float8e4m3fn)",   "tensor(float8e4m3fnuz)",
+        "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)",
+        "tensor(int4)"};
+    return all_tensor_types_ir10;
+  }
+
   static const std::vector<std::string>& all_tensor_sequence_types() {
     static const std::vector<std::string> all_tensor_sequence_types = {
         "seq(tensor(uint8))",
@@ -807,7 +851,7 @@ class OpSchema final {
   }
 
   static const std::vector<std::string>& all_tensor_sequence_types_ir9() {
-    static const std::vector<std::string> all_tensor_sequence_types_ir4 = {
+    static const std::vector<std::string> all_tensor_sequence_types_ir9 = {
         "seq(tensor(uint8))",  "seq(tensor(uint16))",  "seq(tensor(uint32))",
         "seq(tensor(uint64))", "seq(tensor(int8))",    "seq(tensor(int16))",
         "seq(tensor(int32))",  "seq(tensor(int64))",   "seq(tensor(bfloat16))",
@@ -815,7 +859,20 @@ class OpSchema final {
         "seq(tensor(string))",     "seq(tensor(bool))",         "seq(tensor(complex64))",
         "seq(tensor(complex128))", "seq(tensor(float8e4m3fn))", "seq(tensor(float8e4m3fnuz))",
         "seq(tensor(float8e5m2))", "seq(tensor(float8e5m2fnuz))"};
-    return all_tensor_sequence_types_ir4;
+    return all_tensor_sequence_types_ir9;
+  }
+
+  static const std::vector<std::string>& all_tensor_sequence_types_ir10() {
+    static const std::vector<std::string> all_tensor_sequence_types_ir10 = {
+        "seq(tensor(uint8))",      "seq(tensor(uint16))",         "seq(tensor(uint32))",
+        "seq(tensor(uint64))",     "seq(tensor(int8))",           "seq(tensor(int16))",
+        "seq(tensor(int32))",      "seq(tensor(int64))",          "seq(tensor(bfloat16))",
+        "seq(tensor(float16))",    "seq(tensor(float))",          "seq(tensor(double))",
+        "seq(tensor(string))",     "seq(tensor(bool))",           "seq(tensor(complex64))",
+        "seq(tensor(complex128))", "seq(tensor(float8e4m3fn))",   "seq(tensor(float8e4m3fnuz))",
+        "seq(tensor(float8e5m2))", "seq(tensor(float8e5m2fnuz))", "seq(tensor(uint4))",
+        "seq(tensor(int4))"};
+    return all_tensor_sequence_types_ir10;
   }
 
   static const std::vector<std::string>& all_optional_types() {
@@ -866,6 +923,24 @@ class OpSchema final {
     return all_optional_types;
   }
 
+  static const std::vector<std::string>& all_optional_types_ir10() {
+    static const std::vector<std::string> all_optional_types = {
+        "optional(seq(tensor(uint8)))",      "optional(seq(tensor(uint16)))",  "optional(seq(tensor(uint32)))",
+        "optional(seq(tensor(uint64)))",     "optional(seq(tensor(int8)))",    "optional(seq(tensor(int16)))",
+        "optional(seq(tensor(int32)))",      "optional(seq(tensor(int64)))",   "optional(seq(tensor(bfloat16)))",
+        "optional(seq(tensor(float16)))",    "optional(seq(tensor(float)))",   "optional(seq(tensor(double)))",
+        "optional(seq(tensor(string)))",     "optional(seq(tensor(bool)))",    "optional(seq(tensor(complex64)))",
+        "optional(seq(tensor(complex128)))", "optional(tensor(uint8))",        "optional(tensor(uint16))",
+        "optional(tensor(uint32))",          "optional(tensor(uint64))",       "optional(tensor(int8))",
+        "optional(tensor(int16))",           "optional(tensor(int32))",        "optional(tensor(int64))",
+        "optional(tensor(bfloat16))",        "optional(tensor(float16))",      "optional(tensor(float))",
+        "optional(tensor(double))",          "optional(tensor(string))",       "optional(tensor(bool))",
+        "optional(tensor(complex64))",       "optional(tensor(complex128))",   "optional(tensor(float8e4m3fn))",
+        "optional(tensor(float8e4m3fnuz))",  "optional(tensor(float8e5m2))",   "optional(tensor(float8e5m2fnuz))",
+        "optional(tensor(uint4))",           "optional(tensor(int4))"};
+    return all_optional_types;
+  }
+
   // Calls the passed function with `this` as an argument. Useful for
OpSchema& FillUsing(const std::function& populator); diff --git a/onnx/defs/tensor/defs.cc b/onnx/defs/tensor/defs.cc index 278d96469f8..bddd009b68c 100644 --- a/onnx/defs/tensor/defs.cc +++ b/onnx/defs/tensor/defs.cc @@ -82,7 +82,7 @@ The rules then become: ONNX_OPERATOR_SET_SCHEMA( Cast, - 19, + 21, OpSchema() .SetDoc(Cast_ver19_doc) .Attr( @@ -111,45 +111,19 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::Differentiable) .TypeConstraint( "T1", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)", - "tensor(string)", - "tensor(bfloat16)", - "tensor(float8e4m3fn)", - "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", + "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", + "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(bool)", + "tensor(string)", "tensor(bfloat16)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}, "Constrain input types. 
Casting from complex is not supported.") .TypeConstraint( "T2", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)", - "tensor(string)", - "tensor(bfloat16)", - "tensor(float8e4m3fn)", - "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", + "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", + "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(bool)", + "tensor(string)", "tensor(bfloat16)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}, "Constrain output types. Casting to complex is not supported.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromAttributeToOutput(ctx, "to", 0); @@ -169,7 +143,7 @@ See documentation of the Cast operator for further details. 
ONNX_OPERATOR_SET_SCHEMA( CastLike, - 19, + 21, OpSchema() .SetDoc(CastLike_ver19_doc) .Attr( @@ -201,45 +175,19 @@ ONNX_OPERATOR_SET_SCHEMA( OpSchema::Differentiable) .TypeConstraint( "T1", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)", - "tensor(string)", - "tensor(bfloat16)", - "tensor(float8e4m3fn)", - "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", + "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", + "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(bool)", + "tensor(string)", "tensor(bfloat16)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}, "Constrain input types. Casting from complex is not supported.") .TypeConstraint( "T2", - {"tensor(float16)", - "tensor(float)", - "tensor(double)", - "tensor(int8)", - "tensor(int16)", - "tensor(int32)", - "tensor(int64)", - "tensor(uint8)", - "tensor(uint16)", - "tensor(uint32)", - "tensor(uint64)", - "tensor(bool)", - "tensor(string)", - "tensor(bfloat16)", - "tensor(float8e4m3fn)", - "tensor(float8e4m3fnuz)", - "tensor(float8e5m2)", - "tensor(float8e5m2fnuz)"}, + {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(int8)", + "tensor(int16)", "tensor(int32)", "tensor(int64)", "tensor(uint8)", + "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(bool)", + "tensor(string)", "tensor(bfloat16)", "tensor(float8e4m3fn)", "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", "tensor(float8e5m2fnuz)", "tensor(uint4)", "tensor(int4)"}, "Constrain output types. 
Casting to complex is not supported.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 1, 0); @@ -281,7 +229,7 @@ to -1 cannot be determined uniquely. ONNX_OPERATOR_SET_SCHEMA( Reshape, - 19, + 21, OpSchema() .SetDoc(Reshape_ver19_doc) .Attr( @@ -303,7 +251,7 @@ ONNX_OPERATOR_SET_SCHEMA( 1, OpSchema::NonDifferentiable) .Output(0, "reshaped", "Reshaped data.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Constrain input and output types to all tensor types.") + .TypeConstraint("T", OpSchema::all_tensor_types_ir10(), "Constrain input and output types to all tensor types.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { // Type inference propagateElemTypeFromInputToOutput(ctx, 0, 0); @@ -470,7 +418,7 @@ Output: [3] ONNX_OPERATOR_SET_SCHEMA( Shape, - 19, + 21, OpSchema() .SetDoc(Shape_ver19_doc) .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) @@ -488,7 +436,7 @@ ONNX_OPERATOR_SET_SCHEMA( "If omitted, sizes of all axes upto (including) the last one will be included.", AttributeProto::INT, OPTIONAL_VALUE) - .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Input tensor can be of arbitrary type.") + .TypeConstraint("T", OpSchema::all_tensor_types_ir10(), "Input tensor can be of arbitrary type.") .TypeConstraint("T1", {"tensor(int64)"}, "Constrain output to int64 tensor.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { ctx.getOutputType(0)->mutable_tensor_type()->set_elem_type(TensorProto::INT64); @@ -536,7 +484,7 @@ Takes a tensor as input and outputs a int64 scalar that equals to the total numb ONNX_OPERATOR_SET_SCHEMA( Size, - 19, + 21, OpSchema() .SetDoc(Size_ver19_doc) .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) @@ -549,7 +497,7 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::NonDifferentiable) - 
.TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Input tensor can be of arbitrary type.") + .TypeConstraint("T", OpSchema::all_tensor_types_ir10(), "Input tensor can be of arbitrary type.") .TypeConstraint("T1", {"tensor(int64)"}, "Constrain output to int64 tensor, which should be a scalar though.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { ctx.getOutputType(0)->mutable_tensor_type()->set_elem_type(TensorProto::INT64); @@ -1133,7 +1081,7 @@ will be (2, 1, 3). ONNX_OPERATOR_SET_SCHEMA( Transpose, - 13, + 21, OpSchema() .SetDoc(Transpose_ver13_doc) .Attr( @@ -1144,7 +1092,7 @@ ONNX_OPERATOR_SET_SCHEMA( OPTIONAL_VALUE) .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) .Output(0, "transposed", "Transposed output.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) - .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.") + .TypeConstraint("T", OpSchema::all_tensor_types_ir10(), "Constrain input and output types to all tensor types.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); if (!hasNInputShapes(ctx, 1)) { @@ -1760,7 +1708,7 @@ the shape. If an axis is selected with shape entry not equal to one, an error is ONNX_OPERATOR_SET_SCHEMA( Squeeze, - 13, + 21, OpSchema() .SetDoc(Squeeze_ver13_doc) .Input( @@ -1791,7 +1739,10 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.") + .TypeConstraint( + "T", + OpSchema::all_tensor_types_ir10(), + "Constrain input and output types to all tensor types up to IRv10.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); if (!hasNInputShapes(ctx, 1)) { @@ -1866,7 +1817,7 @@ The order of values in `axes` does not matter and can come in any order. 
ONNX_OPERATOR_SET_SCHEMA( Unsqueeze, - 13, + 21, OpSchema() .SetDoc(Unsqueeze_ver13_doc) .Input(0, "data", "Original tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) @@ -1889,7 +1840,10 @@ ONNX_OPERATOR_SET_SCHEMA( true, 1, OpSchema::Differentiable) - .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.") + .TypeConstraint( + "T", + OpSchema::all_tensor_types_ir10(), + "Constrain input and output types to all tensor types up to IRv10.") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateElemTypeFromInputToOutput(ctx, 0, 0); if (!hasNInputShapes(ctx, 1)) { @@ -2750,7 +2704,7 @@ ONNX_OPERATOR_SET_SCHEMA( ONNX_OPERATOR_SET_SCHEMA( Identity, - 19, + 21, OpSchema() .SetDoc("Identity operator") .Input(0, "input", "Input tensor", "V", OpSchema::Single, true, 1, OpSchema::Differentiable) @@ -2758,7 +2712,7 @@ ONNX_OPERATOR_SET_SCHEMA( .TypeConstraint( "V", []() { - auto t = OpSchema::all_tensor_types_ir9(); + auto t = OpSchema::all_tensor_types_ir10(); auto s = OpSchema::all_tensor_sequence_types(); auto o = OpSchema::all_optional_types(); t.insert(t.end(), s.begin(), s.end()); @@ -3711,9 +3665,12 @@ output = [ ONNX_OPERATOR_SET_SCHEMA( Pad, - 19, - OpSchema().FillUsing( - PadDocGenerator(Pad_ver19_doc, "Supported modes: `constant`(default), `reflect`, `edge`, `wrap`"))); + 21, + OpSchema().FillUsing(PadDocGenerator( + Pad_ver19_doc, + "Supported modes: `constant`(default), `reflect`, `edge`, `wrap`", + OpSchema::all_tensor_types_ir10(), + "Constrain input and output types to all tensor types up to IRv10."))); static const char* Trilu_ver14_doc = R"DOC( Given a 2-D matrix or batches of 2-D matrices, returns the upper or lower triangular part of the tensor(s). 
diff --git a/onnx/defs/tensor/old.cc b/onnx/defs/tensor/old.cc index 0f48f9690c5..6e47b7f4279 100644 --- a/onnx/defs/tensor/old.cc +++ b/onnx/defs/tensor/old.cc @@ -12,6 +12,154 @@ namespace ONNX_NAMESPACE { +static const char* Cast_ver19_doc = R"DOC( +The operator casts the elements of a given input tensor to a data type +specified by the 'to' argument and returns an output tensor of the same size in +the converted type. The 'to' argument must be one of the data types specified +in the 'DataType' enum field in the TensorProto message. + +Casting from string tensor in plain (e.g., "3.14" and "1000") and scientific numeric representations +(e.g., "1e-5" and "1E8") to float types is supported. For example, converting string "100.5" to an integer may +yield result 100. There are some string literals reserved for special floating-point values; +"+INF" (and "INF"), "-INF", and "NaN" are positive infinity, negative infinity, and not-a-number, respectively. +Any string which can exactly match "+INF" in a case-insensitive way would be mapped to positive infinite. Similarly, +this case-insensitive rule is applied to "INF" and "NaN". When casting from numeric tensors +to string tensors, plain floating-point representation (such as "314.15926") would be used. +Converting non-numerical-literal string such as "Hello World!" is an undefined behavior. Cases +of converting string representing floating-point arithmetic value, such as "2.718", to INT is an undefined behavior. + +Conversion from a numerical type to any numerical type is always allowed. +User must be aware of precision loss and value change caused by range difference between two types. +For example, a 64-bit float 3.1415926459 may be round to a 32-bit float 3.141592. Similarly, converting +an integer 36 to Boolean may produce 1 because we truncate bits which can't be stored in the targeted type. 
+ +In more detail, the conversion among numerical types should follow these rules +if the destination type is not a float 8 type. + +* Casting from floating point to: + * floating point: +/- infinity if OOR (out of range). + * fixed point: undefined if OOR. + * bool: +/- 0.0 to False; all else to True. +* Casting from fixed point to: + * floating point: +/- infinity if OOR. (+ infinity in the case of uint) + * fixed point: when OOR, discard higher bits and reinterpret (with respect to two's complement representation for + signed types). For example, 200 (int16) -> -56 (int8). + * bool: zero to False; nonzero to True. +* Casting from bool to: + * floating point: `{1.0, 0.0}`. + * fixed point: `{1, 0}`. + * bool: no change. + +Float 8 type were introduced to speed up the training of +deep models. By default the conversion of a float *x* obeys +to the following rules. `[x]` means the value rounded to +the target mantissa width. + +| x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | +|------|----|----|----|----| +| 0 | 0 | 0 | 0 | 0 | +|-0 | -0 | 0 | -0 | 0 | +| NaN | NaN | NaN | NaN | NaN | +| +/- Inf | +/- FLT_MAX | NaN | FLT_MAX | NaN | +| [x] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | +| [x] < -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | +| else | RNE | RNE | RNE | RNE | + +The behavior changes if the parameter 'saturate' is set to False. +The rules then become: + +| x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | +|------|----|----|----|----| +| 0 | 0 | 0 | 0 | 0 | +|-0 | -0 | 0 | -0 | 0 | +| NaN | NaN | NaN | NaN | NaN | +| +/- Inf | NaN | NaN | +/- Inf | NaN | +| [x] > FLT_MAX | NaN | NaN | Inf | NaN | +| [x] < -FLT_MAX | NaN | NaN | -Inf | NaN | +| else | RNE | RNE | RNE | RNE | +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Cast, + 19, + OpSchema() + .SetDoc(Cast_ver19_doc) + .Attr( + "to", + "The data type to which the elements of the input tensor are cast. 
" + "Strictly must be one of the types from DataType enum in TensorProto", + AttributeProto::INT) + .Attr( + "saturate", + "The parameter defines how the conversion behaves if an input value is out of " + "range of the destination type. It only applies for float 8 conversion " + "(float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). It is true by default. " + "All cases are fully described in two tables inserted in the operator description.", + AttributeProto::INT, + static_cast(1)) + .Input(0, "input", "Input tensor to be cast.", "T1", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output( + 0, + "output", + "Output tensor with the same shape as input with type " + "specified by the 'to' argument", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T1", + {"tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(bool)", + "tensor(string)", + "tensor(bfloat16)", + "tensor(float8e4m3fn)", + "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", + "tensor(float8e5m2fnuz)"}, + "Constrain input types. Casting from complex is not supported.") + .TypeConstraint( + "T2", + {"tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(bool)", + "tensor(string)", + "tensor(bfloat16)", + "tensor(float8e4m3fn)", + "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", + "tensor(float8e5m2fnuz)"}, + "Constrain output types. 
Casting to complex is not supported.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromAttributeToOutput(ctx, "to", 0); + if (hasNInputShapes(ctx, 1)) { + propagateShapeFromInputToOutput(ctx, 0, 0); + } + }) + .PartialDataPropagationFunction([](DataPropagationContext& ctx) { + PropagateShapeDataFromInputToOutput(ctx, 0); + })); + static const char* Cast_ver13_doc = R"DOC( The operator casts the elements of a given input tensor to a data type specified by the 'to' argument and returns an output tensor of the same size in @@ -115,6 +263,108 @@ ONNX_OPERATOR_SET_SCHEMA( PropagateShapeDataFromInputToOutput(ctx, 0); })); +static const char* CastLike_ver19_doc = R"DOC( +The operator casts the elements of a given input tensor (the first input) to +the same data type as the elements of the second input tensor. +See documentation of the Cast operator for further details. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + CastLike, + 19, + OpSchema() + .SetDoc(CastLike_ver19_doc) + .Attr( + "saturate", + "The parameter defines how the conversion behaves if an input value is out of " + "range of the destination type. It only applies for float 8 conversion " + "(float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). It is true by default. 
" + "Please refer to operator Cast description for further details.", + AttributeProto::INT, + static_cast(1)) + .Input(0, "input", "Input tensor to be cast.", "T1", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Input( + 1, + "target_type", + "The (first) input tensor will be cast to produce a tensor of the same type as this (second input) tensor.", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "output", + "Output tensor produced by casting the first input tensor to have the same type as the second input tensor.", + "T2", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint( + "T1", + {"tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(bool)", + "tensor(string)", + "tensor(bfloat16)", + "tensor(float8e4m3fn)", + "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", + "tensor(float8e5m2fnuz)"}, + "Constrain input types. Casting from complex is not supported.") + .TypeConstraint( + "T2", + {"tensor(float16)", + "tensor(float)", + "tensor(double)", + "tensor(int8)", + "tensor(int16)", + "tensor(int32)", + "tensor(int64)", + "tensor(uint8)", + "tensor(uint16)", + "tensor(uint32)", + "tensor(uint64)", + "tensor(bool)", + "tensor(string)", + "tensor(bfloat16)", + "tensor(float8e4m3fn)", + "tensor(float8e4m3fnuz)", + "tensor(float8e5m2)", + "tensor(float8e5m2fnuz)"}, + "Constrain output types. 
Casting to complex is not supported.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 1, 0); + if (hasNInputShapes(ctx, 1)) { + propagateShapeFromInputToOutput(ctx, 0, 0); + } + }) + .SetContextDependentFunctionBodyBuilder( + [](const FunctionBodyBuildContext& ctx, const OpSchema& schema, FunctionProto& functionProto) -> bool { + auto target_type = ctx.getInputType(1); + if ((target_type == nullptr) || (!target_type->has_tensor_type())) { + // we cannot create a correct function body without knowing the target element type + return false; + } + auto target_elt_type = target_type->tensor_type().elem_type(); + FunctionBuilder builder(functionProto); + builder.Add( + MakeString("output = Cast (input)") + .c_str()); + schema.BuildFunction(functionProto); + return true; + })); + static const char* CastLike_ver15_doc = R"DOC( The operator casts the elements of a given input tensor (the first input) to the same data type as the elements of the second input tensor. @@ -364,6 +614,170 @@ ONNX_OPERATOR_SET_SCHEMA( .SetDoc(GridSample_ver16_doc) .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { gridSampleShapeInference(ctx); })); +static const char* Reshape_ver19_doc = R"DOC( +Reshape the input tensor similar to numpy.reshape. +First input is the data tensor, second input is a shape tensor which specifies the output shape. It outputs the reshaped tensor. +At most one dimension of the new shape can be -1. In this case, the value is +inferred from the size of the tensor and the remaining dimensions. A dimension +could also be 0, in which case the actual dimension value is unchanged (i.e. taken +from the input tensor). If 'allowzero' is set, and the new shape includes 0, the +dimension will be set explicitly to zero (i.e. not taken from input tensor). +Shape (second input) could be an empty shape, which means converting to a scalar. 
+The input tensor's shape and the output tensor's shape are required to have the same number of elements. + +If the attribute 'allowzero' is set, it is invalid for the specified shape to +contain both a zero value and -1, as the value of the dimension corresponding +to -1 cannot be determined uniquely. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Reshape, + 19, + OpSchema() + .SetDoc(Reshape_ver19_doc) + .Attr( + "allowzero", + "(Optional) By default, when any value in the 'shape' input is equal to zero " + "the corresponding dimension value is copied from the input tensor dynamically. " + "allowzero=1 indicates that if any value in the 'shape' input is set to zero, " + "the zero value is honored, similar to NumPy.", + AttributeProto::INT, + static_cast(0)) + .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Input( + 1, + "shape", + "Specified shape for output.", + "tensor(int64)", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Output(0, "reshaped", "Reshaped data.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Constrain input and output types to all tensor types.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + // Type inference + propagateElemTypeFromInputToOutput(ctx, 0, 0); + bool found; + TensorShapeProto targetShapeProto = getShapeInput(ctx, 1, found); + if (!found) { + return; + } + + int allowzero = static_cast(getAttribute(ctx, "allowzero", 0)); + + // Iterate through targetShape, adding dimensions in the outputShape + // TensorProto. If the targetShape dimension is -1, we do not set the + // dimension value in this iteration, but we record the Dimension. If + // targetShape dimension is 0, we attempt to propagate the dimension + // value/param. If the value cannot be inferred, we set the flag in + // the unresolveZeros vector. 
If targetShape dimension is positive, we + // set the dimension value in the outputShape. We track the product of + // the dimensions we are setting outputShape in the outputProduct + // variable. The outputProduct will potentially be used for inferring + // a dimension marked -1. + auto* outputShape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + TensorShapeProto::Dimension* negativeOneDim = nullptr; + const auto& dataInputTensorType = ctx.getInputType(0)->tensor_type(); + std::vector unresolvedZeros(targetShapeProto.dim_size(), false); + int64_t outputProduct = 1; + bool outputProductValid = true; + for (int i = 0; i < static_cast(targetShapeProto.dim_size()); ++i) { + // Add a new dimension to outputShape + auto* new_dim = outputShape->add_dim(); + if (targetShapeProto.dim(i).has_dim_param()) { + // There is a tricky edge case here. It is possible that the value of + // symbolic dim can be -1 or 0 at runtime. In that case simply propgating this + // symbol can be erroneous. This should be a very rare scenario and in such a + // case an option is to turn off data propagation during shape inference. + new_dim->set_dim_param(targetShapeProto.dim(i).dim_param()); + outputProductValid = false; + } else { + if (!targetShapeProto.dim(i).has_dim_value()) { + outputProductValid = false; + // treat this dim as unknown dim + continue; + } + + const auto dim_value = targetShapeProto.dim(i).dim_value(); + + if (dim_value == -1) { + // Check if multiple -1's. If not, set negativeOneDim, marking + // this dimension to potentially be filled in later. + if (negativeOneDim) { + fail_shape_inference("Target shape may not have multiple -1 dimensions."); + } + negativeOneDim = new_dim; + } else if (dim_value == 0) { + // Check if data input has a shape and if the index i is within + // its bounds. If these conditions are satisfied, any dimension + // value/param should be propagated. 
If dimension value cannot be + // inferred, set the corresponding unresolvedZeros flag to true. + // If allowzero is set however, do not propagate values, since output + // dimension is explicitly zero. + if (allowzero == 0) { + unresolvedZeros[i] = true; + if (dataInputTensorType.has_shape()) { + if (i >= dataInputTensorType.shape().dim_size()) { + fail_shape_inference("Invalid position of 0."); + } + if (dataInputTensorType.shape().dim(i).has_dim_value()) { + const auto& input_dim_value = dataInputTensorType.shape().dim(i).dim_value(); + new_dim->set_dim_value(input_dim_value); + outputProduct *= input_dim_value; + unresolvedZeros[i] = false; + } else if (dataInputTensorType.shape().dim(i).has_dim_param()) { + new_dim->set_dim_param(dataInputTensorType.shape().dim(i).dim_param()); + } + } + } else { + new_dim->set_dim_value(dim_value); + outputProduct *= dim_value; + } + } else if (dim_value > 0) { + // Set the dimension value to dim_value + new_dim->set_dim_value(dim_value); + outputProduct *= dim_value; + } else { + // Check if value is less than -1; fail if so + fail_shape_inference("Invalid dimension value: ", dim_value); + } + } + } + // If negativeOneDim has been set, we attempt to infer its value. This + // can be done if all dimension values for the data input tensor shape + // are known other than the ones corresponding to unresolvedZeros + // flags. + if (negativeOneDim && outputProductValid) { + // First, attempt to compute product of data input shape dimensions + // that are not marked by unresolvedZeros. If not possible, set the + // inputProductValid flag to false. + if (!outputProduct) { + fail_shape_inference("Invalid Target shape product of 0. 
Product cannot be 0 in combination with -1"); + } + int64_t inputProduct = 1; + bool inputProductValid = true; + if (!dataInputTensorType.has_shape()) { + inputProductValid = false; + } else { + for (int i = 0; i < dataInputTensorType.shape().dim_size(); ++i) { + if (dataInputTensorType.shape().dim(i).has_dim_value()) { + inputProduct *= dataInputTensorType.shape().dim(i).dim_value(); + } else if (i >= static_cast(unresolvedZeros.size()) || !unresolvedZeros[i]) { + inputProductValid = false; + break; + } + } + } + if (inputProductValid) { + if (inputProduct % outputProduct != 0) { + fail_shape_inference("Dimension could not be inferred: incompatible shapes"); + } + negativeOneDim->set_dim_value(inputProduct / outputProduct); + } + } + })); + static const char* Reshape_ver13_doc = R"DOC( Reshape the input tensor similar to numpy.reshape. First input is the data tensor, second input is a shape tensor which specifies the output shape. It outputs the reshaped tensor. @@ -1190,6 +1604,76 @@ ONNX_OPERATOR_SET_SCHEMA( } })); +static const char* Transpose_ver13_doc = R"DOC( +Transpose the input tensor similar to numpy.transpose. For example, when +perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape +will be (2, 1, 3). +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Transpose, + 13, + OpSchema() + .SetDoc(Transpose_ver13_doc) + .Attr( + "perm", + "A list of integers. 
By default, reverse the dimensions, " + "otherwise permute the axes according to the values given.", + AttributeProto::INTS, + OPTIONAL_VALUE) + .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "transposed", "Transposed output.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } + auto input_type = ctx.getInputType(0); + const TensorShapeProto& shape = input_type->tensor_type().shape(); + std::vector perm; + bool has_perm_attr = getRepeatedAttribute(ctx, "perm", perm); + if (!has_perm_attr) { + perm.reserve(shape.dim_size()); + for (int i = shape.dim_size() - 1; i >= 0; --i) + perm.push_back(i); + } else if (!perm.empty()) { + // check if every index is valid + std::vector seen(shape.dim_size(), false); + for (int64_t fromDimIndex : perm) { + if (!(0 <= fromDimIndex && fromDimIndex < shape.dim_size())) { + std::ostringstream oss; + oss << "Invalid attribute perm {" << perm[0]; + for (size_t i = 1; i != perm.size(); ++i) { + oss << ", " << perm[i]; + } + oss << "}, input shape = {"; + if (shape.dim_size() > 0) { + oss << shape.dim(0).dim_value(); + for (int i = 1; i != shape.dim_size(); ++i) { + oss << ", " << shape.dim(i).dim_value(); + } + oss << "}"; + } + fail_type_inference(oss.str()); + } else { + // check if any perm is repeated + if (seen[fromDimIndex]) { + fail_type_inference("Attribute perm for Transpose has repeated value: ", fromDimIndex); + } + seen[fromDimIndex] = true; + } + } + } + + getOutputShape(ctx, 0); + + propagateElemTypeFromInputToOutput(ctx, 0, 0); + for (size_t i = 0; i < perm.size(); ++i) { + appendSingleDimCopiedFromInputTypeToOutputType(ctx, 0, 0, static_cast(perm[i])); + } + })); + static const char* 
Transpose_ver1_doc = R"DOC( Transpose the input tensor similar to numpy.transpose. For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape @@ -2043,6 +2527,106 @@ ONNX_OPERATOR_SET_SCHEMA( } })); +static const char* Squeeze_ver13_doc = R"DOC( +Remove single-dimensional entries from the shape of a tensor. +Takes an input `axes` with a list of axes to squeeze. +If `axes` is not provided, all the single dimensions will be removed from +the shape. If an axis is selected with shape entry not equal to one, an error is raised. +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Squeeze, + 13, + OpSchema() + .SetDoc(Squeeze_ver13_doc) + .Input( + 0, + "data", + "Tensors with at least max(dims) dimensions.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .Input( + 1, + "axes", + "List of integers indicating the dimensions to squeeze. Negative value means counting dimensions " + "from the back. Accepted range is [-r, r-1] where r = rank(data).", + "tensor(int64)", + OpSchema::Optional, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "squeezed", + "Reshaped tensor with same data as input.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } + + std::vector axes; + size_t num_inputs = ctx.getNumInputs(); + bool axes_not_specified = false; + + if ((num_inputs == 2) && ctx.getInputType(1)) { //'axes' is input + auto axes_proto = ctx.getInputData(1); + if (axes_proto == nullptr) { + // skip if axes is not an initializer + return; + } + axes = ParseData(axes_proto); + } else { + // axes not specified + axes_not_specified = true; + } + + const auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + const auto input_ndim = 
input_shape.dim_size(); + checkAxesRange(axes, input_ndim); + adjustNegativeAxes(axes, input_ndim); + + for (int i = 0; i < input_ndim; ++i) { + if (!input_shape.dim(i).has_dim_value() && axes_not_specified) { + // if dim has a symbolic value and the axes spec want to act on all dims, + // return early because we can't infer the shape + return; + } + } + + ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + + for (int i = 0; i < input_ndim; ++i) { + if (axes_not_specified && input_shape.dim(i).dim_value() == 1) { + // if axes not specified, do not keep shape if the dimension is equal to one + continue; + } else if (!axes_not_specified && std::find(axes.begin(), axes.end(), i) != axes.end()) { + // if axes wants to explicitly act on this dim, fail explicitly only if the + // dim is numerical and != 1. If the dim is 1 or symbolic, remove it. If + // the dim is symbolic, runtime engines should check that the dimension is + // actually 1 when the op is evaluated + if (input_shape.dim(i).has_dim_value() && input_shape.dim(i).dim_value() != 1) { + fail_shape_inference( + "Dimension of input ", i, " must be 1 instead of ", input_shape.dim(i).dim_value()); + } + } else { + *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() = input_shape.dim(i); + } + } + }) + .PartialDataPropagationFunction([](DataPropagationContext& ctx) { + PropagateShapeDataFromInputToOutput(ctx, 0); + })); + static const char* Squeeze_ver11_doc = R"DOC( Remove single-dimensional entries from the shape of a tensor. Takes a parameter `axes` with a list of axes to squeeze. 
@@ -2069,33 +2653,114 @@ ONNX_OPERATOR_SET_SCHEMA( if (!hasNInputShapes(ctx, 1)) { return; } - + + std::vector axes; + if (!getRepeatedAttribute(ctx, "axes", axes)) { + return; + } + + if (!ctx.getInputType(0)->tensor_type().has_shape()) { + return; + } + + ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + const auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + const auto input_ndim = input_shape.dim_size(); + std::transform(axes.begin(), axes.end(), axes.begin(), [&](int64_t axis) -> int64_t { + return axis < 0 ? axis + input_ndim : axis; + }); + + for (int i = 0; i < input_ndim; ++i) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + if (input_shape.dim(i).has_dim_value() && input_shape.dim(i).dim_value() != 1) { + fail_shape_inference( + "Dimension of input ", i, " must be 1 instead of ", input_shape.dim(i).dim_value()); + } + } else { + *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() = input_shape.dim(i); + } + } + })); + +static const char* Unsqueeze_ver13_doc = R"DOC( +Insert single-dimensional entries to the shape of an input tensor (`data`). +Takes one required input `axes` - which contains a list of dimension indices and this operator will insert a dimension of value `1` into the corresponding index of the output tensor (`expanded`). + +For example, given an input tensor (`data`) of shape [3, 4, 5], then +Unsqueeze(data, axes=[0, 4]) outputs a tensor (`expanded`) containing same data as `data` but with shape [1, 3, 4, 5, 1]. + +The input `axes` should not contain any duplicate entries. It is an error if it contains duplicates. +The rank of the output tensor (`output_rank`) is the rank of the input tensor (`data`) plus the number of values in `axes`. +Each value in `axes` should be within the (inclusive) range [-output_rank , output_rank - 1]. +The order of values in `axes` does not matter and can come in any order. 
+)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Unsqueeze, + 13, + OpSchema() + .SetDoc(Unsqueeze_ver13_doc) + .Input(0, "data", "Original tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Input( + 1, + "axes", + "List of integers indicating the dimensions to be inserted. Negative value means counting dimensions " + "from the back. Accepted range is [-r, r-1] where r = rank(expanded).", + "tensor(int64)", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .Output( + 0, + "expanded", + "Reshaped tensor with same data as input.", + "T", + OpSchema::Single, + true, + 1, + OpSchema::Differentiable) + .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (!hasNInputShapes(ctx, 1)) { + return; + } std::vector axes; - if (!getRepeatedAttribute(ctx, "axes", axes)) { - return; - } - - if (!ctx.getInputType(0)->tensor_type().has_shape()) { + auto axes_proto = ctx.getInputData(1); + if (axes_proto == nullptr) { + // skip if axes is not an initializer return; } - + axes = ParseData(axes_proto); ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); const auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); const auto input_ndim = input_shape.dim_size(); - std::transform(axes.begin(), axes.end(), axes.begin(), [&](int64_t axis) -> int64_t { - return axis < 0 ? 
axis + input_ndim : axis; - }); + const auto output_ndim = input_ndim + static_cast(axes.size()); + checkAxesRange(axes, output_ndim); + adjustNegativeAxes(axes, output_ndim); + checkDuplicateAxes(axes, output_ndim); + // sort after correcting negative axes values (if any) + std::sort(axes.begin(), axes.end()); + int j = 0; for (int i = 0; i < input_ndim; ++i) { - if (std::find(axes.begin(), axes.end(), i) != axes.end()) { - if (input_shape.dim(i).has_dim_value() && input_shape.dim(i).dim_value() != 1) { - fail_shape_inference( - "Dimension of input ", i, " must be 1 instead of ", input_shape.dim(i).dim_value()); - } - } else { - *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() = input_shape.dim(i); + while (static_cast(j) < axes.size() && + axes[j] == ctx.getOutputType(0)->tensor_type().shape().dim_size()) { + ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + ++j; } + *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() = + ctx.getInputType(0)->tensor_type().shape().dim(i); + } + while (static_cast(j) < axes.size() && + axes[j] == ctx.getOutputType(0)->tensor_type().shape().dim_size()) { + ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(1); + ++j; } + }) + .PartialDataPropagationFunction([](DataPropagationContext& ctx) { + PropagateShapeDataFromInputToOutput(ctx, 0); })); static const char* Unsqueeze_ver11_doc = R"DOC( @@ -2740,6 +3405,26 @@ ONNX_OPERATOR_SET_SCHEMA( .SetDoc(Resize_ver11_doc) .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { resizeShapeInference_opset11_to_12(ctx); })); +ONNX_OPERATOR_SET_SCHEMA( + Identity, + 19, + OpSchema() + .SetDoc("Identity operator") + .Input(0, "input", "Input tensor", "V", OpSchema::Single, true, 1, OpSchema::Differentiable) + .Output(0, "output", "Tensor to copy input into.", "V", OpSchema::Single, true, 1, OpSchema::Differentiable) + .TypeConstraint( + "V", + []() { + auto t = 
OpSchema::all_tensor_types_ir9(); + auto s = OpSchema::all_tensor_sequence_types(); + auto o = OpSchema::all_optional_types(); + t.insert(t.end(), s.begin(), s.end()); + t.insert(t.end(), o.begin(), o.end()); + return t; + }(), + "Constrain input and output types to all tensor, sequence, and optional types.") + .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput)); + ONNX_OPERATOR_SET_SCHEMA( Identity, 13, @@ -3015,6 +3700,115 @@ ONNX_OPERATOR_SET_SCHEMA( } })); +static const char* Pad_ver19_doc = R"DOC( +Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`, +a padded tensor (`output`) is generated. + +The three supported `modes` are (similar to corresponding modes supported by `numpy.pad`): + +1) `constant`(default) - pads with a given constant value as specified by `constant_value` (which defaults to 0, empty string, or False) + +2) `reflect` - pads with the reflection of the vector mirrored on the first and last values of the vector along each axis + +3) `edge` - pads with the edge values of array + +4) `wrap` - wrap-around padding as if the data tensor forms a torus + + +Example 1 (`constant` mode): + +Insert 0 pads to the beginning of the second dimension. 
+ +``` +data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], +] + +pads = [0, 2, 0, 0] + +mode = 'constant' + +constant_value = 0.0 + +output = [ + [0.0, 0.0, 1.0, 1.2], + [0.0, 0.0, 2.3, 3.4], + [0.0, 0.0, 4.5, 5.7], +] +``` + +Example 2 (`reflect` mode): + +``` +data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], +] + +pads = [0, 2, 0, 0] + +mode = 'reflect' + +output = [ + [1.0, 1.2, 1.0, 1.2], + [2.3, 3.4, 2.3, 3.4], + [4.5, 5.7, 4.5, 5.7], +] +``` + +Example 3 (`edge` mode): + +``` +data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], +] + +pads = [0, 2, 0, 0] + +mode = 'edge' + +output = [ + [1.0, 1.0, 1.0, 1.2], + [2.3, 2.3, 2.3, 3.4], + [4.5, 4.5, 4.5, 5.7], +] +``` + +Example 4 (`wrap` mode): + +``` +data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], +] + +pads = [2, 1, 1, 1] + +mode = 'wrap' + +output = [ + [3.4, 2.3, 3.4, 2.3], + [5.7, 4.5, 5.7, 4.5], + [1.2, 1.0, 1.2, 1.0], + [3.4, 2.3, 3.4, 2.3], + [5.7, 4.5, 5.7, 4.5], + [1.2, 1.0, 1.2, 1.0], +] +``` +)DOC"; + +ONNX_OPERATOR_SET_SCHEMA( + Pad, + 19, + OpSchema().FillUsing( + PadDocGenerator(Pad_ver19_doc, "Supported modes: `constant`(default), `reflect`, `edge`, `wrap`"))); + static const char* Pad_ver11_doc = R"DOC( Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`, a padded tensor (`output`) is generated. @@ -5252,6 +6046,68 @@ Output: [3] ``` )DOC"; +ONNX_OPERATOR_SET_SCHEMA( + Shape, + 19, + OpSchema() + .SetDoc(Shape_ver15_doc) + .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) + .Output(0, "shape", "Shape of the input tensor", "T1", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) + .Attr( + "start", + "(Optional) Starting axis for slicing the shape. Default value is 0." 
+ "Negative value means counting dimensions from the back.", + AttributeProto::INT, + static_cast(0)) + .Attr( + "end", + "(Optional) Ending axis for slicing the shape. " + "Negative value means counting dimensions from the back. " + "If omitted, sizes of all axes upto (including) the last one will be included.", + AttributeProto::INT, + OPTIONAL_VALUE) + .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Input tensor can be of arbitrary type.") + .TypeConstraint("T1", {"tensor(int64)"}, "Constrain output to int64 tensor.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + ctx.getOutputType(0)->mutable_tensor_type()->set_elem_type(TensorProto::INT64); + auto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + auto* output_length = output_shape->add_dim(); + + if (!hasNInputShapes(ctx, 1)) { + return; + } + + int64_t rank = static_cast(ctx.getInputType(0)->tensor_type().shape().dim_size()); + int64_t start = getAttribute(ctx, "start", 0); + if (start < 0) + start += rank; + start = (start < 0) ? 0 : (start > rank) ? rank : start; + int64_t end = getAttribute(ctx, "end", rank); + if (end < 0) + end += rank; + end = (end < 0) ? 0 : (end > rank) ? rank : end; + output_length->set_dim_value((end - start) < 0 ? 0 : (end - start)); + }) + .PartialDataPropagationFunction([](DataPropagationContext& ctx) { + if (hasInputShape(ctx, 0)) { + auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + int64_t rank = static_cast(input_shape.dim_size()); + int64_t start = getAttribute(ctx, "start", 0); + if (start < 0) + start += rank; + start = (start < 0) ? 0 : (start > rank) ? rank : start; + int64_t end = getAttribute(ctx, "end", rank); + if (end < 0) + end += rank; + end = (end < 0) ? 0 : (end > rank) ? 
rank : end; + TensorShapeProto output_shape; + for (int64_t d = start; d < end; ++d) { + *output_shape.add_dim() = input_shape.dim(static_cast(d)); + } + ctx.addOutputData(0, std::move(output_shape)); + } + })); + ONNX_OPERATOR_SET_SCHEMA( Shape, 15, @@ -5318,6 +6174,36 @@ static const char* Size_ver13_doc = R"DOC( Takes a tensor as input and outputs a int64 scalar that equals to the total number of elements of the input tensor. )DOC"; +ONNX_OPERATOR_SET_SCHEMA( + Size, + 19, + OpSchema() + .SetDoc(Size_ver13_doc) + .Input(0, "data", "An input tensor.", "T", OpSchema::Single, true, 1, OpSchema::NonDifferentiable) + .Output( + 0, + "size", + "Total number of elements of the input tensor", + "T1", + OpSchema::Single, + true, + 1, + OpSchema::NonDifferentiable) + .TypeConstraint("T", OpSchema::all_tensor_types_ir9(), "Input tensor can be of arbitrary type.") + .TypeConstraint("T1", {"tensor(int64)"}, "Constrain output to int64 tensor, which should be a scalar though.") + .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { + ctx.getOutputType(0)->mutable_tensor_type()->set_elem_type(TensorProto::INT64); + ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape(); + }) + .PartialDataPropagationFunction([](DataPropagationContext& ctx) { + const auto input_data = ctx.getInputData(0); + if (input_data != nullptr) { + TensorShapeProto tsp; + tsp.mutable_dim()->Add()->set_dim_value(input_data->dim_size()); + ctx.addOutputData(0, std::move(tsp)); + } + })); + ONNX_OPERATOR_SET_SCHEMA( Size, 13, diff --git a/onnx/defs/tensor/utils.cc b/onnx/defs/tensor/utils.cc index f538f0c7880..d431e872432 100644 --- a/onnx/defs/tensor/utils.cc +++ b/onnx/defs/tensor/utils.cc @@ -390,7 +390,11 @@ void resizeShapeInference_opset7_to_10(InferenceContext& ctx) { } } -std::function PadDocGenerator(const char* description, const char* mode_description) { +std::function PadDocGenerator( + const char* description, + const char* mode_description, + const std::vector op_schema, + const 
std::string op_schema_description) { return [=](OpSchema& schema) { schema.SetDoc(description); schema.Attr("mode", mode_description, AttributeProto::STRING, std::string("constant")); @@ -433,8 +437,7 @@ std::function PadDocGenerator(const char* description, const ch OpSchema::NonDifferentiable); schema.Output(0, "output", "Tensor after padding.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable); - schema.TypeConstraint( - "T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types."); + schema.TypeConstraint("T", op_schema, op_schema_description); schema.TypeConstraint("Tind", {"tensor(int32)", "tensor(int64)"}, "Constrain indices to integer types"); schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { // Type inference diff --git a/onnx/defs/tensor/utils.h b/onnx/defs/tensor/utils.h index 04bf3317dce..466ba4381bd 100644 --- a/onnx/defs/tensor/utils.h +++ b/onnx/defs/tensor/utils.h @@ -51,5 +51,9 @@ void KeepAspectRatioHelper( extern const char* NonZero_ver9_doc; -std::function PadDocGenerator(const char* description, const char* mode_description); +std::function PadDocGenerator( + const char* description, + const char* mode_description, + const std::vector op_schema = OpSchema::all_tensor_types_ir4(), + const std::string op_schema_description = "Constrain input and output types to all tensor types."); } // namespace ONNX_NAMESPACE diff --git a/onnx/helper.py b/onnx/helper.py index 7470f5f2084..0e12dbc2207 100644 --- a/onnx/helper.py +++ b/onnx/helper.py @@ -42,6 +42,7 @@ ValueInfoProto, defs, mapping, + subbyte, ) VersionRowType = Union[Tuple[str, int, int, int], Tuple[str, int, int, int, int]] @@ -617,6 +618,34 @@ def float32_to_float8e5m2( # noqa: PLR0911 raise NotImplementedError("fn and uz must be both False or True.") +def pack_float32_to_4bit( + array: Union[np.ndarray, Sequence], signed: bool +) -> np.ndarray: + """Convert an array of float32 values to a 4bit data-type and pack every two consecutive 
elements in a byte. + See :ref:`onnx-detail-int4` for technical details. + + Args: + array: array of float to convert and pack + signed: Whether the 4 bit variant is signed or unsigned + + Returns: + Packed array with size `ceil(array.size/2)` (single dimension). + """ + if not isinstance(array, np.ndarray): + array = np.asarray(array, dtype=np.float32) + + array_flat = array.ravel() + is_odd_volume = np.prod(array.shape) % 2 == 1 + if is_odd_volume: + array_flat = np.append(array_flat, np.array([0])) + + single_func = lambda x, y: subbyte.float32x2_to_4bitx2(x, y, signed) # noqa: E731 + func = np.frompyfunc(single_func, 2, 1) + + arr = func(array_flat[0::2], array_flat[1::2]) + return arr.astype(np.uint8) # type: ignore[no-any-return] + + def make_tensor( name: str, data_type: int, dims: Sequence[int], vals: Any, raw: bool = False ) -> TensorProto: @@ -649,8 +678,7 @@ def make_tensor( # Check number of vals specified equals tensor size expected_size = 1 if raw: - # NumPy doesn't have BFLOAT16. TENSOR_TYPE_TO_NP_TYPE maps it to float32, - # which has the wrong itemsize. + # NumPy doesn't have BFLOAT16. TENSOR_TYPE_MAP maps it to float32, which has the wrong itemsize. if data_type == TensorProto.BFLOAT16: expected_size = 2 elif data_type in ( @@ -660,6 +688,9 @@ def make_tensor( TensorProto.FLOAT8E5M2FNUZ, ): expected_size = 1 + # NumPy doesn't have INT4. It is packed in couples to UINT8 buffers. + elif data_type in (TensorProto.UINT4, TensorProto.INT4): + expected_size = 0.5 # type: ignore[assignment] else: expected_size = np_dtype.itemsize @@ -669,9 +700,14 @@ def make_tensor( expected_size *= d if len(vals) != expected_size: - raise ValueError( - f"Number of values does not match tensor's size. Expected {expected_size}, but it is {len(vals)}. 
" - ) + # padding of half a byte is acceptable for 4bit types + if not ( + data_type in (TensorProto.UINT4, TensorProto.INT4) + and len(vals) == expected_size + 0.5 + ): + raise ValueError( + f"Number of values does not match tensor's size. Expected {expected_size}, but it is {len(vals)}. " + ) if raw: tensor.raw_data = vals @@ -708,6 +744,17 @@ def make_tensor( np.array(vals).astype(np_dtype).flatten().tolist(), ) ) + elif data_type in ( + TensorProto.UINT4, + TensorProto.INT4, + ): + signed = data_type == TensorProto.INT4 + vals = ( + pack_float32_to_4bit(vals, signed=signed) + .astype(np_dtype) + .flatten() + .tolist() + ) elif data_type == TensorProto.BOOL: vals = np.array(vals).astype(int) elif data_type == TensorProto.STRING: diff --git a/onnx/mapping.py b/onnx/mapping.py index 8745c4ebfd0..d2c333eebf2 100644 --- a/onnx/mapping.py +++ b/onnx/mapping.py @@ -80,6 +80,13 @@ class TensorDtypeMap(NamedTuple): int(TensorProto.FLOAT8E5M2FNUZ): TensorDtypeMap( np.dtype("float32"), int(TensorProto.UINT8), "TensorProto.FLOAT8E5M2FNUZ" ), + # Native numpy does not support uint4/int4 so now use uint8/int8 for these types. 
+ int(TensorProto.UINT4): TensorDtypeMap( + np.dtype("uint8"), int(TensorProto.INT32), "TensorProto.UINT4" + ), + int(TensorProto.INT4): TensorDtypeMap( + np.dtype("int8"), int(TensorProto.INT32), "TensorProto.INT4" + ), } @@ -154,6 +161,8 @@ def __getitem__(self, key: Union[int, str, np.dtype]) -> Any: TensorProto.FLOAT8E4M3FNUZ, TensorProto.FLOAT8E5M2, TensorProto.FLOAT8E5M2FNUZ, + TensorProto.UINT4, + TensorProto.INT4, ) } diff --git a/onnx/numpy_helper.py b/onnx/numpy_helper.py index 061ba013ef7..3b53c891be2 100644 --- a/onnx/numpy_helper.py +++ b/onnx/numpy_helper.py @@ -8,7 +8,7 @@ import numpy as np -from onnx import MapProto, OptionalProto, SequenceProto, TensorProto, helper +from onnx import MapProto, OptionalProto, SequenceProto, TensorProto, helper, subbyte from onnx.external_data_helper import load_external_data_for_tensor, uses_external_data @@ -185,6 +185,39 @@ def float8e5m2_to_float32( return res.reshape(dims) # type: ignore[no-any-return] +def unpack_int4( + data: Union[np.int32, np.ndarray], + dims: Union[int, Sequence[int]], + signed: bool, +) -> np.ndarray: + """Converts ndarray of int4 (as packed uint8) to f32 + See :ref:`onnx-detail-int4` for technical details. + + Args: + data: A numpy array, empty dimensions are allowed if dims is + None. + dims: The dimensions are used to reshape the unpacked buffer + signed: Whether the 4 bit integer is signed or unsigned + + Returns: + A numpy array of float32 reshaped to dims. 
+ """ + single_func = lambda x: subbyte.unpack_single_4bitx2(x, signed) # noqa: E731 + func = np.frompyfunc(single_func, 1, 2) + + res_high, res_low = func(data.ravel()) + res = np.empty((res_high.size + res_low.size,), dtype=np.float32) + res[0::2] = res_high + res[1::2] = res_low + + if ( + res.size == np.prod(dims) + 1 + ): # handle single-element padding due to odd number of elements + res = res.ravel()[:-1] + res = res.reshape(dims) + return res + + def to_array(tensor: TensorProto, base_dir: str = "") -> np.ndarray: # noqa: PLR0911 """Converts a tensor def object to a numpy array. @@ -244,6 +277,14 @@ def to_array(tensor: TensorProto, base_dir: str = "") -> np.ndarray: # noqa: PL data = np.frombuffer(tensor.raw_data, dtype=np.int8) return float8e5m2_to_float32(data, dims, fn=True, uz=True) + if tensor_dtype == TensorProto.UINT4: + data = np.frombuffer(tensor.raw_data, dtype=np.uint8) + return unpack_int4(data, dims, signed=False) + + if tensor_dtype == TensorProto.INT4: + data = np.frombuffer(tensor.raw_data, dtype=np.int8) + return unpack_int4(data, dims, signed=True) + return np.frombuffer(tensor.raw_data, dtype=np_dtype).reshape(dims) # type: ignore[no-any-return] # float16 is stored as int32 (uint16 type); Need view to get the original value @@ -275,6 +316,14 @@ def to_array(tensor: TensorProto, base_dir: str = "") -> np.ndarray: # noqa: PL data = np.asarray(tensor.int32_data, dtype=np.int32) return float8e5m2_to_float32(data, dims, fn=True, uz=True) + if tensor_dtype == TensorProto.UINT4: + data = np.asarray(tensor.int32_data, dtype=storage_np_dtype) + return unpack_int4(data, dims, signed=False) + + if tensor_dtype == TensorProto.INT4: + data = np.asarray(tensor.int32_data, dtype=storage_np_dtype) + return unpack_int4(data, dims, signed=True) + data = getattr(tensor, storage_field) if tensor_dtype in (TensorProto.COMPLEX64, TensorProto.COMPLEX128): data = combine_pairs_to_complex(data) # type: ignore[assignment,arg-type] diff --git a/onnx/onnx-ml.proto 
b/onnx/onnx-ml.proto index dc9ccf8c735..ea3a7706af6 100644 --- a/onnx/onnx-ml.proto +++ b/onnx/onnx-ml.proto @@ -524,6 +524,10 @@ message TensorProto { FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + // 4-bit data-types + UINT4 = 21; + INT4 = 22; + // Future extensions go here. } diff --git a/onnx/onnx-ml.proto3 b/onnx/onnx-ml.proto3 index 6220f45c609..100f1b42685 100644 --- a/onnx/onnx-ml.proto3 +++ b/onnx/onnx-ml.proto3 @@ -524,6 +524,10 @@ message TensorProto { FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + // 4-bit data-types + UINT4 = 21; + INT4 = 22; + // Future extensions go here. } diff --git a/onnx/onnx.in.proto b/onnx/onnx.in.proto index 0b37ec316db..80584bef16b 100644 --- a/onnx/onnx.in.proto +++ b/onnx/onnx.in.proto @@ -521,6 +521,10 @@ message TensorProto { FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + // 4-bit data-types + UINT4 = 21; + INT4 = 22; + // Future extensions go here. } diff --git a/onnx/onnx.proto b/onnx/onnx.proto index 15012ce65c3..d39302a182d 100644 --- a/onnx/onnx.proto +++ b/onnx/onnx.proto @@ -522,6 +522,10 @@ message TensorProto { FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + // 4-bit data-types + UINT4 = 21; + INT4 = 22; + // Future extensions go here. 
} diff --git a/onnx/onnx.proto3 b/onnx/onnx.proto3 index f47006f8c97..5984815fa13 100644 --- a/onnx/onnx.proto3 +++ b/onnx/onnx.proto3 @@ -522,6 +522,10 @@ message TensorProto { FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + // 4-bit data-types + UINT4 = 21; + INT4 = 22; + // Future extensions go here. } diff --git a/onnx/reference/custom_element_types.py b/onnx/reference/custom_element_types.py index aaadffd1623..227cb2c97ab 100644 --- a/onnx/reference/custom_element_types.py +++ b/onnx/reference/custom_element_types.py @@ -9,3 +9,5 @@ float8e4m3fnuz = np.dtype((np.uint8, {"e4m3fnuz": (np.uint8, 0)})) float8e5m2 = np.dtype((np.uint8, {"e5m2": (np.uint8, 0)})) float8e5m2fnuz = np.dtype((np.uint8, {"e5m2fnuz": (np.uint8, 0)})) +uint4 = np.dtype((np.uint8, {"uint4": (np.uint8, 0)})) +int4 = np.dtype((np.int8, {"int4": (np.int8, 0)})) diff --git a/onnx/reference/op_run.py b/onnx/reference/op_run.py index 74a82bb8d2d..42eb3b2c601 100644 --- a/onnx/reference/op_run.py +++ b/onnx/reference/op_run.py @@ -11,7 +11,7 @@ from onnx import TensorProto from onnx.defs import get_all_schemas_with_history, get_schema, onnx_opset_version from onnx.helper import make_node, make_tensor_type_proto, np_dtype_to_tensor_dtype -from onnx.numpy_helper import to_array +from onnx.numpy_helper import to_array, unpack_int4 from onnx.onnx_pb import AttributeProto, GraphProto, NodeProto, TypeProto from onnx.reference.custom_element_types import ( bfloat16, @@ -19,6 +19,8 @@ float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + int4, + uint4, ) @@ -119,8 +121,8 @@ def to_sparse_tensor(att: AttributeProto) -> SparseTensor: def to_array_extended(tensor: TensorProto) -> np.ndarray: - """Similar to :func:`to_array` but deals with bfloat16, - float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz. 
+ """Similar to :func:`to_array` but deals with non-numpy types bfloat16, + float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz, uint4, int4. """ elem_type = tensor.data_type if elem_type == TensorProto.BFLOAT16: @@ -153,6 +155,21 @@ def to_array_extended(tensor: TensorProto) -> np.ndarray: for i, d in enumerate(data): y[i] = d return y.reshape(shape) + if elem_type in (TensorProto.UINT4, TensorProto.INT4): + if tensor.HasField("raw_data"): + data = tensor.raw_data # type: ignore[assignment] + else: + data = tensor.int32_data + shape = tuple(tensor.dims) + m = {TensorProto.INT4: int4, TensorProto.UINT4: uint4} + dtype = m[elem_type] # type: ignore[index] + signed = elem_type == TensorProto.INT4 + y = np.empty(len(data), dtype=dtype).ravel() + for i, d in enumerate(data): + y[i] = d + + unpacked_data = unpack_int4(y, dims=shape, signed=signed) + return unpacked_data.astype(dtype) return to_array(tensor) @@ -729,6 +746,10 @@ def _run(self, *inputs, **kwargs): ttype = TensorProto.FLOAT8E5M2FNUZ # type: ignore[attr-defined] elif t.dtype == bfloat16: ttype = TensorProto.BLOFAT16 # type: ignore[attr-defined] + elif t.dtype == uint4: + ttype = TensorProto.UINT4 # type: ignore[attr-defined] + elif t.dtype == int4: + ttype = TensorProto.INT4 # type: ignore[attr-defined] else: raise e types.append(make_tensor_type_proto(ttype, t.shape)) diff --git a/onnx/reference/ops/op_cast.py b/onnx/reference/ops/op_cast.py index 9b930cf0fb7..57ea3199d3b 100644 --- a/onnx/reference/ops/op_cast.py +++ b/onnx/reference/ops/op_cast.py @@ -5,6 +5,7 @@ import numpy as np +from onnx import subbyte from onnx.helper import ( float32_to_bfloat16, float32_to_float8e4m3, @@ -23,6 +24,8 @@ float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + int4, + uint4, ) from onnx.reference.op_run import OpRun @@ -74,6 +77,25 @@ def cast_to(x, to, saturate): # noqa: PLR0911 y[i] = el return y.reshape(x.shape) + i4 = [ + (uint4, "uint4", TensorProto.UINT4, False), + (int4, "int4", TensorProto.INT4, True), + ] + for 
np_type, np_desc, tensor_type, signed in i4: + if x.dtype == np_type and x.dtype.descr[0][0] == np_desc: + if to == tensor_type: + return x + to_type = tensor_dtype_to_np_dtype(to) + return x.astype(to_type) + + if to == tensor_type: + xf = x.astype(np.float32).ravel() + y = np.empty(xf.shape, dtype=np_type).ravel() + for i in range(y.shape[0]): + el = subbyte.float32_to_4bit_unpacked(xf[i], signed=signed) + y[i] = el + return y.reshape(x.shape) + f8back = { TensorProto.FLOAT8E4M3FN: ( float8e4m3fn, diff --git a/onnx/reference/ops/op_cast_like.py b/onnx/reference/ops/op_cast_like.py index dc58319e197..4b7da13f786 100644 --- a/onnx/reference/ops/op_cast_like.py +++ b/onnx/reference/ops/op_cast_like.py @@ -13,6 +13,8 @@ float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + int4, + uint4, ) @@ -28,6 +30,10 @@ def _cast_like(x, y, saturate): to = TensorProto.FLOAT8E5M2 elif y.dtype == float8e5m2fnuz and y.dtype.descr[0][0] == "e5m2fnuz": to = TensorProto.FLOAT8E5M2FNUZ + elif y.dtype == uint4 and y.dtype.descr[0][0] == "uint4": + to = TensorProto.UINT4 + elif y.dtype == int4 and y.dtype.descr[0][0] == "int4": + to = TensorProto.INT4 else: to = np_dtype_to_tensor_dtype(y.dtype) # type: ignore return (cast_to(x, to, saturate),) diff --git a/onnx/reference/ops/op_constant.py b/onnx/reference/ops/op_constant.py index fe237159023..4cf9868898a 100644 --- a/onnx/reference/ops/op_constant.py +++ b/onnx/reference/ops/op_constant.py @@ -11,6 +11,8 @@ float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + int4, + uint4, ) from onnx.reference.op_run import OpRun, RefAttrName @@ -23,6 +25,8 @@ def _check_dtype(val): # type: ignore float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + uint4, + int4, np.int8, np.uint8, np.float16, diff --git a/onnx/reference/ops/op_dequantize_linear.py b/onnx/reference/ops/op_dequantize_linear.py index a3b6c7f698b..2be3a35b5b3 100644 --- a/onnx/reference/ops/op_dequantize_linear.py +++ b/onnx/reference/ops/op_dequantize_linear.py @@ -15,21 +15,30 @@ float8e4m3fnuz, 
float8e5m2, float8e5m2fnuz, + int4, + uint4, ) from onnx.reference.op_run import OpRun class DequantizeLinear(OpRun): def get_x_type(self, x: np.ndarray) -> int: + tensor_dtype = None if x.dtype == float8e4m3fn and x.dtype.descr[0][0] == "e4m3fn": - return TensorProto.FLOAT8E4M3FN - if x.dtype == float8e4m3fnuz and x.dtype.descr[0][0] == "e4m3fnuz": - return TensorProto.FLOAT8E4M3FNUZ - if x.dtype == float8e5m2 and x.dtype.descr[0][0] == "e5m2": - return TensorProto.FLOAT8E5M2 - if x.dtype == float8e5m2fnuz and x.dtype.descr[0][0] == "e5m2fnuz": - return TensorProto.FLOAT8E5M2FNUZ - return np_dtype_to_tensor_dtype(x.dtype) + tensor_dtype = TensorProto.FLOAT8E4M3FN + elif x.dtype == float8e4m3fnuz and x.dtype.descr[0][0] == "e4m3fnuz": + tensor_dtype = TensorProto.FLOAT8E4M3FNUZ + elif x.dtype == float8e5m2 and x.dtype.descr[0][0] == "e5m2": + tensor_dtype = TensorProto.FLOAT8E5M2 + elif x.dtype == float8e5m2fnuz and x.dtype.descr[0][0] == "e5m2fnuz": + tensor_dtype = TensorProto.FLOAT8E5M2FNUZ + elif x.dtype == uint4 and x.dtype.descr[0][0] == "uint4": + tensor_dtype = TensorProto.UINT4 + elif x.dtype == int4 and x.dtype.descr[0][0] == "int4": + tensor_dtype = TensorProto.INT4 + else: + tensor_dtype = np_dtype_to_tensor_dtype(x.dtype) + return tensor_dtype @staticmethod def reshape_input( @@ -60,13 +69,13 @@ def _run( raise RuntimeError("Input 2 must be a vector or a number.") x_type = self.get_x_type(x) - f8_type = x_type in { + fp8_type = x_type in { TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E4M3FNUZ, TensorProto.FLOAT8E5M2, TensorProto.FLOAT8E5M2FNUZ, } - if x_zero_point is not None and not f8_type: + if x_zero_point is not None and not fp8_type: zero_type = self.get_x_type(x_zero_point) if x_type != zero_type: raise RuntimeError( @@ -77,13 +86,13 @@ def _run( x_zero_point, x.shape, axis ) else: - if f8_type and x_zero_point is not None: + if fp8_type and x_zero_point is not None: u_x_zero_point = x_zero_point.astype(np.uint8) umi = u_x_zero_point.min() uma = 
u_x_zero_point.max() if umi != uma or umi != np.uint8(0): raise RuntimeError( - "x_zero_point is not null but should be zero for float 8 types." + "x_zero_point is not null but should be zero for float8 types." ) if x_type == TensorProto.FLOAT8E4M3FN: dx = float8e4m3_to_float32(x) diff --git a/onnx/reference/ops/op_quantize_linear.py b/onnx/reference/ops/op_quantize_linear.py index f7d33747cda..c8866ebb380 100644 --- a/onnx/reference/ops/op_quantize_linear.py +++ b/onnx/reference/ops/op_quantize_linear.py @@ -9,7 +9,7 @@ import numpy as np -from onnx import TensorProto +from onnx import TensorProto, subbyte from onnx.helper import ( float32_to_float8e4m3, float32_to_float8e5m2, @@ -21,6 +21,8 @@ float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + int4, + uint4, ) from onnx.reference.op_run import OpRun @@ -36,26 +38,33 @@ class _CommonQuantizeLinear(OpRun): } def get_zero_point_type(self, zero_point: np.ndarray) -> int: + zero_point_type = None if ( zero_point.dtype == float8e4m3fn and zero_point.dtype.descr[0][0] == "e4m3fn" ): - return TensorProto.FLOAT8E4M3FN - if ( + zero_point_type = TensorProto.FLOAT8E4M3FN + elif ( zero_point.dtype == float8e4m3fnuz and zero_point.dtype.descr[0][0] == "e4m3fnuz" ): - return TensorProto.FLOAT8E4M3FNUZ - if zero_point.dtype == float8e5m2 and zero_point.dtype.descr[0][0] == "e5m2": - return TensorProto.FLOAT8E5M2 - if ( + zero_point_type = TensorProto.FLOAT8E4M3FNUZ + elif zero_point.dtype == float8e5m2 and zero_point.dtype.descr[0][0] == "e5m2": + zero_point_type = TensorProto.FLOAT8E5M2 + elif ( zero_point.dtype == float8e5m2fnuz and zero_point.dtype.descr[0][0] == "e5m2fnuz" ): - return TensorProto.FLOAT8E5M2FNUZ - return np_dtype_to_tensor_dtype(zero_point.dtype) + zero_point_type = TensorProto.FLOAT8E5M2FNUZ + elif zero_point.dtype == uint4 and zero_point.dtype.descr[0][0] == "uint4": + zero_point_type = TensorProto.UINT4 + elif zero_point.dtype == int4 and zero_point.dtype.descr[0][0] == "int4": + zero_point_type = 
TensorProto.INT4 + else: + zero_point_type = np_dtype_to_tensor_dtype(zero_point.dtype) + return zero_point_type - def common_run( + def common_run( # noqa: PLR0911 self, x: np.ndarray, y_scale: np.ndarray, @@ -107,6 +116,20 @@ def common_run( ) return (f8.astype(float8e5m2fnuz),) # type: ignore[attr-defined] + if tensor_type in (TensorProto.UINT4, TensorProto.INT4): + xi = np.rint(x).astype(np.int32) + if len(y_scale.shape) > 0: + xi += zero_point.reshape(new_shape) + else: + xi += zero_point + + single_func = lambda x: subbyte.float32_to_4bit_unpacked( # noqa: E731 + x, signed=(tensor_type == TensorProto.INT4) + ) + func = np.vectorize(single_func) + i4 = func(xi) + return (i4,) # type: ignore[attr-defined] + raise RuntimeError( f"Unexpected tensor_type for input 2: tensor_type={tensor_type}, " f"zero_point.dtype={zero_point.dtype}." diff --git a/onnx/subbyte.py b/onnx/subbyte.py new file mode 100644 index 00000000000..fc5ccb923b7 --- /dev/null +++ b/onnx/subbyte.py @@ -0,0 +1,72 @@ +# Copyright (c) ONNX Project Contributors +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Tuple, Union + +import numpy as np + +INT4_MIN = -8 +INT4_MAX = 7 +UINT4_MIN = 0 +UINT4_MAX = 15 + + +def float32_to_4bit_unpacked( + x: Union[np.ndarray, np.dtype, float], signed: bool +) -> np.ndarray: + """Cast to 4bit via rounding and clipping (without packing). + + Args: + x: element to be converted + signed: boolean, whether to convert to signed int4. 
+ + Returns: + An ndarray with a single int4 element (sign-extended to int8/uint8) + """ + dtype = np.int8 if signed else np.uint8 + clip_low = INT4_MIN if signed else UINT4_MIN + clip_high = INT4_MAX if signed else UINT4_MAX + if not isinstance(x, np.ndarray): + x = np.asarray(x) + + return np.rint(np.clip(x, clip_low, clip_high)).astype(dtype) # type: ignore[no-any-return] + + +def float32x2_to_4bitx2( + val_low: np.dtype, val_high: np.dtype, signed: bool +) -> np.ndarray: + """Cast two elements to 4bit (via rounding and clipping) and pack + to a single byte + Args: + val_low: element to be packed in the 4 LSB + val_high: element to be packed in the 4 MSB + signed: boolean, whether to convert to signed int4. + + Returns: + An ndarray with a single int8/uint8 element, containing both int4 elements + """ + i8_high = float32_to_4bit_unpacked(val_high, signed) + i8_low = float32_to_4bit_unpacked(val_low, signed) + return i8_high << 4 | i8_low & 0x0F # type: ignore[operator] + + +def unpack_single_4bitx2( + x: Union[np.ndarray, np.dtype, float], signed: bool +) -> Tuple[np.ndarray, np.ndarray]: + unpack_signed = lambda x: np.where((x >> 3) == 0, x, x | 0xF0) # noqa: E731 + """Unpack a single byte 4bitx2 to two 4 bit elements + Args: + x: Input data + signed: boolean, whether to interpret as signed int4. 
+ Returns: + A tuple of ndarrays containing int4 elements (sign-extended to int8/uint8) + """ + if not isinstance(x, np.ndarray): + x = np.asarray(x) + x_low = x & 0x0F + x_high = x >> 4 + x_low = unpack_signed(x_low) if signed else x_low + x_high = unpack_signed(x_high) if signed else x_high + dtype = np.int8 if signed else np.uint8 + return (x_low.astype(dtype), x_high.astype(dtype)) diff --git a/onnx/test/helper_test.py b/onnx/test/helper_test.py index 0beee03f7bb..6ecd6b4ee5d 100644 --- a/onnx/test/helper_test.py +++ b/onnx/test/helper_test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 +import itertools import random import struct import unittest @@ -25,6 +26,7 @@ helper, numpy_helper, ) +from onnx.reference.op_run import to_array_extended class TestHelperAttributeFunctions(unittest.TestCase): @@ -632,6 +634,47 @@ def test_make_float8e5m2fnuz_tensor_raw(self) -> None: ynp = numpy_helper.to_array(y) np.testing.assert_equal(expected, ynp) + @parameterized.parameterized.expand( + itertools.product( + (TensorProto.UINT4, TensorProto.INT4), + ((5, 4, 6), (4, 6, 5), (3, 3), (1,), (2**10,)), + ) + ) + def test_make_4bit_tensor(self, dtype, dims) -> None: + type_range = { + TensorProto.UINT4: (0, 15), + TensorProto.INT4: (-8, 7), + } + data = np.random.randint( + type_range[dtype][0], high=type_range[dtype][1] + 1, size=dims + ) + y = helper.make_tensor("y", dtype, data.shape, data) + ynp = to_array_extended(y) + np.testing.assert_equal(data, ynp) + + @parameterized.parameterized.expand( + itertools.product( + (TensorProto.UINT4, TensorProto.INT4), ((5, 4, 6), (4, 6, 5), (3, 3), (1,)) + ) + ) + def test_make_4bit_raw_tensor(self, dtype, dims) -> None: + type_range = { + TensorProto.UINT4: (0, 15), + TensorProto.INT4: (-8, 7), + } + data = np.random.randint( + type_range[dtype][0], high=type_range[dtype][1] + 1, size=dims + ) + packed_data = helper.pack_float32_to_4bit( + data, signed=(dtype == TensorProto.INT4) + ) + + y = helper.make_tensor( + 
"packed_int4", dtype, dims, packed_data.tobytes(), raw=True + ) + ynp = numpy_helper.to_array(y) + np.testing.assert_equal(data, ynp) + def test_make_sparse_tensor(self) -> None: values = [1.1, 2.2, 3.3, 4.4, 5.5] values_tensor = helper.make_tensor( @@ -826,6 +869,8 @@ def test_unknown_dimensions(self) -> None: TensorProto.FLOAT8E4M3FNUZ, TensorProto.FLOAT8E5M2, TensorProto.FLOAT8E5M2FNUZ, + TensorProto.UINT4, + TensorProto.INT4, TensorProto.STRING, TensorProto.COMPLEX64, TensorProto.COMPLEX128, @@ -856,6 +901,8 @@ def test_make_tensor_vals(tensor_dtype: int) -> None: TensorProto.FLOAT8E4M3FNUZ, TensorProto.FLOAT8E5M2, TensorProto.FLOAT8E5M2FNUZ, + TensorProto.UINT4, + TensorProto.INT4, } ], ids=lambda tensor_dtype: helper.tensor_dtype_to_string(tensor_dtype), diff --git a/onnx/test/reference_evaluator_test.py b/onnx/test/reference_evaluator_test.py index b3ab3607ae3..ee9b121a08a 100644 --- a/onnx/test/reference_evaluator_test.py +++ b/onnx/test/reference_evaluator_test.py @@ -26,7 +26,16 @@ import version_utils from numpy.testing import assert_allclose -from onnx import AttributeProto, FunctionProto, ModelProto, TensorProto, checker, parser +import onnx.reference.custom_element_types as custom +from onnx import ( + AttributeProto, + FunctionProto, + ModelProto, + TensorProto, + checker, + parser, + subbyte, +) from onnx.backend.test.case.node.roialign import get_roi_align_input_values from onnx.checker import check_model from onnx.defs import onnx_opset_version @@ -5340,6 +5349,115 @@ def test_regex_invalid_pattern(self): with self.assertRaises(ValueError): ref.run(None, {"X": np.array(["x"])}) + @parameterized.parameterized.expand( + [ + ( + TensorProto.UINT4, + [-1, 0, 1.5, 2, 3.3, 10, 20, 40], + [0, 0, 2, 2, 4, 10, 20, 30], + ), + (TensorProto.UINT4, [-1, 0, 1.5, 2, 3.3, 10, 40], [0, 0, 2, 2, 4, 10, 30]), + (TensorProto.UINT4, [0], [0]), + ( + TensorProto.INT4, + [-20, -14.5, 0, 1.5, 2, 3.3, 10, 20], + [-16, -14, 0, 2, 2, 4, 10, 14], + ), + ( + 
TensorProto.INT4, + [-20, -14.5, 0, 1.5, 2, 3.3, 10], + [-16, -14, 0, 2, 2, 4, 10], + ), + (TensorProto.INT4, [0], [0]), + ] + ) + def test_quantize_linear_int4(self, qtype, data, expected): + X = make_tensor_value_info("X", TensorProto.FLOAT, [None]) + Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None]) + model = make_model( + make_graph( + [ + make_node( + "Constant", + [], + ["scale"], + value=make_tensor("scale", TensorProto.FLOAT, [1], [2.0]), + ), + make_node( + "Constant", + [], + ["zero"], + value=make_tensor("zero", qtype, [1], [0]), + ), + make_node("QuantizeLinear", ["X", "scale", "zero"], ["T"]), + make_node("DequantizeLinear", ["T", "scale"], ["Y"], axis=0), + ], + "g", + [X], + [Y], + ) + ) + ref = ReferenceEvaluator(model) + got = ref.run(None, {"X": data}) + assert_allclose(expected, got[0]) + + @parameterized.parameterized.expand( + itertools.product( + (TensorProto.FLOAT, TensorProto.FLOAT16), + (TensorProto.UINT4, TensorProto.INT4), + ) + ) + def test_cast_int4_output(self, cast_from, cast_to): + X = make_tensor_value_info("X", cast_from, [None]) + Y = make_tensor_value_info("Y", cast_to, [None]) + model = make_model( + make_graph( + [ + make_node("Cast", ["X"], ["Y"], to=cast_to), + ], + "g", + [X], + [Y], + ) + ) + ref = ReferenceEvaluator(model) + data = np.array([0, 1, 2.4, 2.6, 4, 10], dtype=np.float32) + signed = cast_to == TensorProto.INT4 + expected1 = np.array( + [subbyte.float32_to_4bit_unpacked(x, signed=signed) for x in data] + ) + got = ref.run(None, {"X": data}) + self.assertEqual(expected1.tolist(), got[0].tolist()) + + @parameterized.parameterized.expand( + itertools.product( + (TensorProto.UINT4, TensorProto.INT4), + (TensorProto.FLOAT, TensorProto.FLOAT16), + ) + ) + def test_cast_int4_input(self, cast_from, cast_to): + X = make_tensor_value_info("X", cast_from, [None]) + Y = make_tensor_value_info("Y", cast_to, [None]) + model = make_model( + make_graph( + [ + make_node("Cast", ["X"], ["Y"], to=TensorProto.FLOAT), + ], + 
"g", + [X], + [Y], + ) + ) + ref = ReferenceEvaluator(model) + data = np.array(range(0, 7), dtype=np.float32) + cast_from_np = custom.uint4 if cast_from == TensorProto.UINT4 else custom.int4 + data = data.astype(cast_from_np) + expected1 = np.array( + [subbyte.float32_to_4bit_unpacked(x, cast_from_np) for x in data] + ) + got = ref.run(None, {"X": data}) + self.assertEqual(expected1.tolist(), got[0].tolist()) + def test_a_function_calling_a_function_once(self): X = make_tensor_value_info("X", TensorProto.FLOAT, ["N"]) output = make_tensor_value_info("output", TensorProto.FLOAT, ["N"]) diff --git a/onnx/test/test_backend_onnxruntime.py b/onnx/test/test_backend_onnxruntime.py index eb079c1e046..b7f2697776c 100644 --- a/onnx/test/test_backend_onnxruntime.py +++ b/onnx/test/test_backend_onnxruntime.py @@ -155,6 +155,35 @@ def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs): "|test_quantizelinear_axis" # y_scale must be a scalar or 1D tensor of size 1. "|test_quantizelinear" # No corresponding Numpy type for Tensor Type. "|test_affine_grid_" # new IR version 9 and opset version 20 not supported yet. + "|test_quantizelinear_uint4" # No corresponding Numpy type for Tensor Type. + "|test_quantizelinear_int4" # No corresponding Numpy type for Tensor Type. + "|test_dequantizelinear_uint4" # No corresponding Numpy type for Tensor Type. + "|test_dequantizelinear_int4" # No corresponding Numpy type for Tensor Type. + "|test_cast_UINT4_to_FLOAT" # No corresponding Numpy type for Tensor Type. + "|test_cast_INT4_to_FLOAT" # No corresponding Numpy type for Tensor Type. + "|test_cast_UINT4_to_FLOAT16" # No corresponding Numpy type for Tensor Type. + "|test_cast_INT4_to_FLOAT16" # No corresponding Numpy type for Tensor Type. 
+ ")" + ) + + # Exclude all tests that require IR10 until onnxruntime aligns + # TODO: Unwaive tests once onnxruntime supports Opset21/IR10 https://github.com/onnx/onnx/issues/5840 + backend_test.exclude( + "(" + "test_cast_" + "|test_castlike_" + "|test_constant" + "|test_edge_pad_cpu" + "|test_flatten_" + "|test_identity" + "|test_reflect_pad" + "|test_reshape_" + "|test_shape_" + "|test_size_" + "|test_squeeze_" + "|test_transpose_" + "|test_unsqueeze_" + "|test_wrap_pad_" ")" ) diff --git a/onnx/test/test_backend_reference.py b/onnx/test/test_backend_reference.py index 29977e71ce1..7a78d79f268 100644 --- a/onnx/test/test_backend_reference.py +++ b/onnx/test/test_backend_reference.py @@ -120,12 +120,18 @@ def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs): "|test_cast_FLOAT16_to_FLOAT8" "|test_castlike_FLOAT_to_FLOAT8" "|test_castlike_FLOAT16_to_FLOAT8" + "|test_cast_FLOAT_to_UINT4" + "|test_cast_FLOAT16_to_UINT4" + "|test_cast_FLOAT_to_INT4" + "|test_cast_FLOAT16_to_INT4" "|test_cast_no_saturate_FLOAT_to_FLOAT8" "|test_cast_no_saturate_FLOAT16_to_FLOAT8" "|test_cast_BFLOAT16_to_FLOAT" "|test_castlike_BFLOAT16_to_FLOAT" "|test_quantizelinear_e4m3" "|test_quantizelinear_e5m2" + "|test_quantizelinear_uint4" + "|test_quantizelinear_int4" ")" ) diff --git a/onnx/version_converter/convert.h b/onnx/version_converter/convert.h index 3bec454e2c2..1482bd909d9 100644 --- a/onnx/version_converter/convert.h +++ b/onnx/version_converter/convert.h @@ -608,24 +608,67 @@ class DefaultVersionConverter : public BaseVersionConverter { std::make_unique("ReduceMin", OpSetID(20), OpSetID(19), reduce_min_max_18_unallowed_types)); /******** 20 -> 21 ********/ + registerAdapter(std::make_unique("Cast", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("CastLike", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Constant", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("ConstantOfShape", OpSetID(20), OpSetID(21))); 
registerAdapter(std::make_unique("DequantizeLinear", OpSetID(20), OpSetID(21))); - registerAdapter(std::make_unique("QuantizeLinear", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Flatten", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Identity", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("If", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Loop", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Pad", OpSetID(20), OpSetID(21))); registerAdapter(std::make_unique("QLinearMatMul", OpSetID(20), OpSetID(21))); - - /******** 21 -> 20 ********/ + registerAdapter(std::make_unique("QuantizeLinear", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Reshape", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Scan", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Shape", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Size", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Squeeze", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Transpose", OpSetID(20), OpSetID(21))); + registerAdapter(std::make_unique("Unsqueeze", OpSetID(20), OpSetID(21))); /******** 21 -> 20 ********/ const std::vector q_dq_20_unallowed_types = { - TensorProto_DataType_UINT16, TensorProto_DataType_INT16}; + TensorProto_DataType_UINT16, TensorProto_DataType_INT16, TensorProto_DataType_UINT4, TensorProto_DataType_INT4}; const std::vector q_dqmm_20_unallowed_types = { - TensorProto_DataType_BFLOAT16, TensorProto_DataType_FLOAT16}; + TensorProto_DataType_BFLOAT16, + TensorProto_DataType_FLOAT16, + TensorProto_DataType_UINT4, + TensorProto_DataType_INT4}; + const std::vector ir10_types_not_in_ir9 = { + TensorProto_DataType_UINT4, TensorProto_DataType_INT4}; + const std::vector ir10_types_not_in_ir4 = { + TensorProto_DataType_FLOAT8E4M3FN, + TensorProto_DataType_FLOAT8E4M3FNUZ, + TensorProto_DataType_FLOAT8E5M2, + 
TensorProto_DataType_FLOAT8E5M2FNUZ, + TensorProto_DataType_UINT4, + TensorProto_DataType_INT4}; + registerAdapter(std::make_unique("Cast", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("CastLike", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Constant", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter( + std::make_unique("ConstantOfShape", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); registerAdapter( std::make_unique("DequantizeLinear", OpSetID(21), OpSetID(20), q_dq_20_unallowed_types)); + registerAdapter(std::make_unique("Flatten", OpSetID(21), OpSetID(20), ir10_types_not_in_ir4)); + registerAdapter(std::make_unique("Identity", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("If", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Loop", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Pad", OpSetID(21), OpSetID(20), ir10_types_not_in_ir4)); registerAdapter( - std::make_unique("QuantizeLinear", OpSetID(21), OpSetID(20), q_dq_20_unallowed_types)); + std::make_unique("QLinearMatMul", OpSetID(21), OpSetID(20), q_dq_20_unallowed_types)); registerAdapter( - std::make_unique("QLinearMatMul", OpSetID(21), OpSetID(20), q_dqmm_20_unallowed_types)); + std::make_unique("QuantizeLinear", OpSetID(21), OpSetID(20), q_dq_20_unallowed_types)); + registerAdapter(std::make_unique("Reshape", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Scan", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Shape", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Size", OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Squeeze", OpSetID(21), OpSetID(20), ir10_types_not_in_ir4)); + registerAdapter(std::make_unique("Transpose", 
OpSetID(21), OpSetID(20), ir10_types_not_in_ir9)); + registerAdapter(std::make_unique("Unsqueeze", OpSetID(21), OpSetID(20), ir10_types_not_in_ir4)); } ModelProto convert_version(const ModelProto& mp_in, const OpSetID& initial_version, const OpSetID& target_version)