diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc
index 67952b0e039ef..47732e4af4b89 100644
--- a/onnxruntime/core/optimizer/nhwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nhwc_transformer.cc
@@ -50,6 +50,7 @@ class NhwcTransformerImpl {
   void TransformQLinearActivation(Node& node);
   void TransformQLinearGlobalAveragePool(Node& node);
   void TransformSplit(Node& node);
+  void TransformPad(Node& node);
 
   Graph& graph_;
 
@@ -282,6 +283,49 @@ void NhwcTransformerImpl::TransformSplit(Node& node) {
   CreateNhwcArgument(node, node, nhwc_input->rank_);
 }
 
+void NhwcTransformerImpl::TransformPad(Node& node) {
+  // Let a Pad whose data input has an NHWC counterpart consume it directly, with its pads permuted to match.
+  auto& defs = node.MutableInputDefs();
+  auto* nhwc_src = LookupNhwcArgument(defs[0]);
+  if (nhwc_src == nullptr) {
+    return;
+  }
+  // The pads input has to be a constant initializer ...
+  const ONNX_NAMESPACE::TensorProto* nchw_pads_proto = nullptr;
+  if (!graph_utils::NodeArgIsConstant(graph_, *defs[1]) ||
+      !graph_.GetInitializedTensor(defs[1]->Name(), nchw_pads_proto)) {
+    return;
+  }
+  // ... that is 1-D with a begin and an end entry per axis; N and C alone (no spatial axes) are not handled.
+  if ((nchw_pads_proto->dims_size() != 1) ||
+      (nchw_pads_proto->dims(0) != nhwc_src->rank_ * 2) ||
+      (nhwc_src->rank_ <= 2)) {
+    return;
+  }
+
+  // Permute both halves of the pads tensor (begin pads, then end pads) from NCHW to NHWC order.
+  Initializer nchw_pads{*nchw_pads_proto, graph_.ModelPath()};
+  const int64_t* nchw = nchw_pads.data<int64_t>();
+  const size_t rank = static_cast<size_t>(nchw_pads_proto->dims(0)) / 2;
+  std::vector<int64_t> nhwc(nchw, nchw + nchw_pads_proto->dims(0));
+  for (size_t base = 0; base <= rank; base += rank) {  // base == 0: begin pads, base == rank: end pads
+    std::copy_n(nchw + base + 2, rank - 2, nhwc.data() + base + 1);  // spatial axes shift up by one
+    nhwc[base + rank - 1] = nchw[base + 1];                          // channel axis moves last
+  }
+
+  ONNX_NAMESPACE::TensorProto nhwc_pads_proto;
+  nhwc_pads_proto.set_name(graph_.GenerateNodeArgName("nhwc_permutated_pads"));
+  nhwc_pads_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  nhwc_pads_proto.add_dims(rank * 2);
+  nhwc_pads_proto.set_raw_data(nhwc.data(), rank * 2 * sizeof(int64_t));
+
+  // Point the node at the NHWC data and the permuted pads, and decrement the original use count.
+  defs[0] = nhwc_src->nhwc_arg_;
+  defs[1] = &graph_utils::AddInitializer(graph_, nhwc_pads_proto);
+  nhwc_src->remaining_original_uses_--;
+  CreateNhwcArgument(node, node, nhwc_src->rank_);
+}
+
 void NhwcTransformerImpl::Transform(Node& node) {
   if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConv", {10})) {
     TransformQLinearConv(node);
@@ -295,6 +339,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
     TransformQLinearGlobalAveragePool(node);
   } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Split", {2, 11, 13})) {
     TransformSplit(node);
+  } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {11, 13})) {
+    TransformPad(node);
   }
 }
 
diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index e12852a29641c..79db40e14692d 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -322,8 +322,15 @@ static Status PadImpl(OpKernelContext* ctx,
 
           int64_t prePad = reshaped_pad[inner_axis];
           int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
-          PadAxisConstant(axisStart - prePad, *axisStart, prePad);
-          PadAxisConstant(output, *(output - 1), postPad);
+          if (inner_no_pad_size == 1) {
+            PadAxisConstant(axisStart - prePad, *axisStart, prePad);
+            PadAxisConstant(output, *(output - 1), postPad);
+          } else {
+            // When the innermost axis(es) need no padding, the PadAxisConstant() calls above do not fit Edge mode.
+            // After the first padded axis followed by unpadded axes is handled here, the general loop below works fine.
+            PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, pads[inner_axis]);
+            PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, pads[inner_axis + data_rank]);
+          }
           output += postPad;
           alignSkip = prePad;
         }
@@ -353,8 +360,14 @@ static Status PadImpl(OpKernelContext* ctx,
 
           int64_t prePad = reshaped_pad[inner_axis];
           int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
-          PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, prePad);
-          PadInnermostAxis(output, output - 2, -1 /* inputDelta */, postPad);
+          if (inner_no_pad_size == 1) {
+            PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, prePad);
+            PadInnermostAxis(output, output - 2, -1 /* inputDelta */, postPad);
+          } else {
+            // When the innermost axis(es) need no padding, the PadInnermostAxis() calls above do not fit Reflect mode.
+            PadAxis(axisStart - prePad, axisStart + prePad, 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, pads[inner_axis]);
+            PadAxis(output, output - 2 * inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, pads[inner_axis + data_rank]);
+          }
           output += postPad;
           alignSkip = prePad;
         }
diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index 24610903520ab..bdd4d44063a72 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -139,14 +139,14 @@ def get_intermediate_outputs(self, calib_mode='naive', providers=None, ort_graph
         '''
 
         #conduct inference session and get intermediate outputs
+        sess_options = onnxruntime.SessionOptions()
         if ort_graph_optimization_enable:
-            session = onnxruntime.InferenceSession(self.augmented_model_path, None) 
+            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
         else:            
-            sess_options = onnxruntime.SessionOptions()
-            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL  #ORT_ENABLE_BASIC
-            session = onnxruntime.InferenceSession(self.augmented_model_path,
-                                                   sess_options=sess_options,
-                                                   providers=providers)
+            sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+        session = onnxruntime.InferenceSession(self.augmented_model_path,
+                                               sess_options=sess_options,
+                                               providers=providers)
 
         #number of outputs in original model
         num_model_outputs = len(self.model.graph.output)
diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py
index 48d729a9239d3..5f6946261a7df 100644
--- a/onnxruntime/python/tools/quantization/onnx_quantizer.py
+++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -15,7 +15,7 @@
 from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel
 
 from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue, quantization_modes
-from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name
+from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name, quantize_nparray
 from .quant_utils import QuantType, onnx_domain, __producer__, __version__
 
 from .registry import CreateOpQuantizer, CreateDefaultOpQuantizer
@@ -48,11 +48,11 @@ def quantize_data(data, quantize_range, qType):
         scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1
         zero_point = 0
         # signed byte type
-        quantized_data = (np.asarray(data) / scale).round().astype('b')
+        quantized_data = quantize_nparray(QuantType.QInt8, np.asarray(data), scale, zero_point)
     elif qType == onnx_proto.TensorProto.UINT8:
         scale = (float(rmax) - rmin) / quantize_range if rmin != rmax else 1
         zero_point = round((0 - rmin) / scale)  # round to nearest integer
-        quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B')  # unsigned byte type
+        quantized_data = quantize_nparray(QuantType.QUInt8, np.asarray(data), scale, zero_point)
     else:
         raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))
 
@@ -500,7 +500,7 @@ def _get_quantization_params(self, param_name):
 
         return True, scale_name, zero_point_name, scale_shape, zero_point_shape
 
-    def _get_quantize_input_nodes(self, node, input_index, qType):
+    def _get_quantize_input_nodes(self, node, input_index, qType, given_scale_name = None, given_zp_name = None):
         '''
         Given an input for a node (which is not a initializer), this function
             - add nodes to compute zero point and scale for this input if they don't exist.
@@ -508,13 +508,17 @@ def _get_quantize_input_nodes(self, node, input_index, qType):
             parameter node: node being quantized in NodeProto format.
             parameter input_index: index of input in node.input.
             parameter qType: type to quantize to.
+            parameter given_scale_name: if given together with given_zp_name, quantize the input using this scale tensor.
+            parameter given_zp_name: if given together with given_scale_name, quantize the input using this zero point tensor.
             return: List of newly created nodes in NodeProto format.
         '''
         input_name = node.input[input_index]
         output_name = input_name + "_quantized"
 
-        data_found, scale_name, zp_name, _, _ = \
-            self._get_quantization_params(input_name)
+        if (given_scale_name is not None) and (given_zp_name is not None):
+            data_found, scale_name, zp_name = (True, given_scale_name, given_zp_name)
+        else:
+            data_found, scale_name, zp_name, _, _ = self._get_quantization_params(input_name)
 
         if self.static:
             if data_found == False:
diff --git a/onnxruntime/python/tools/quantization/operators/pad.py b/onnxruntime/python/tools/quantization/operators/pad.py
new file mode 100644
index 0000000000000..d6ca1392c8327
--- /dev/null
+++ b/onnxruntime/python/tools/quantization/operators/pad.py
@@ -0,0 +1,68 @@
+import onnx
+import numpy as np
+from .base_operator import QuantOperatorBase
+from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray
+
+
+class QPad(QuantOperatorBase):
+    def __init__(self, onnx_quantizer, onnx_node):
+        super().__init__(onnx_quantizer, onnx_node)
+
+    def quantize(self):
+        '''Make the Pad node consume and produce quantized tensors, reusing the scale/zero point of input[0].'''
+        node = self.node
+        assert (node.op_type == "Pad")
+
+        # Only from opset 11 on is constant_value an (optional) input.
+        # If input[0] is not quantized, do not quantize this node; defer to the base class instead.
+        if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
+            super().quantize()
+            return
+        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
+
+        kwargs = {}
+        for attribute in node.attribute:
+            kwargs.update(attribute_to_kwarg(attribute))
+
+        if 'mode' not in kwargs or kwargs['mode'] == b'constant':
+            if len(node.input) > 2:  # There is 3rd input 'constant_value'
+                zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
+                scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
+                if zp_tensor is None or scale_tensor is None:
+                    super().quantize()
+                    return
+                padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
+                if padding_constant_initializer is not None:
+                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
+                    zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
+                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
+                    scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
+                    padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
+                    quantized_padding_constant_array = quantize_nparray(
+                        quantized_input_value.qType, padding_constant_array, scale_value, zp_value)
+                    quantized_padding_constant_name = node.input[2] + "_quantized"
+                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
+                        quantized_padding_constant_array, quantized_padding_constant_name)
+                    # This assumes the padding constant initializer is used only by this node.
+                    self.quantizer.model.remove_initializer(padding_constant_initializer)
+                    self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
+                    node.input[2] = quantized_padding_constant_name
+                else:
+                    # Returns a LIST of new nodes; the quantizing node is presumably the last entry - TODO confirm.
+                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
+                        node, 2, quantized_input_value.qType,
+                        quantized_input_value.scale_name, quantized_input_value.zp_name)
+                    self.quantizer.new_nodes += pad_value_qnodes
+                    node.input[2] = pad_value_qnodes[-1].output[0]
+            else:
+                node.input.extend([quantized_input_value.zp_name])  # pad zero_point for original zero
+
+        # Create an entry for output quantized value
+        quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
+                                                quantized_input_value.scale_name, quantized_input_value.zp_name,
+                                                QuantizedValueType.Input)
+        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
+
+        node.input[0] = quantized_input_value.q_name
+        node.output[0] = quantized_output_value.q_name
+        self.quantizer.new_nodes += [node]
diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 2b3e430492a4f..c4ed4a1a6ced6 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -60,6 +60,15 @@ class QuantType(Enum):
 }
 
 
+def quantize_nparray(qtype, arr, scale, zero_point, low = None, high = None):
+    '''Return round(arr / scale) + zero_point, clipped to the quantized range and cast to the numpy type of qtype.'''
+    np_dtype = QUANT_TYPE_TO_NP_TYPE[qtype]
+    qmin, qmax = (0, 255) if np_dtype == numpy.uint8 else (-127, 127)
+    q = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
+    numpy.clip(q, max(qmin, -127 if low is None else low), min(qmax, 255 if high is None else high), out=q)
+    return q.astype(np_dtype)
+
+
 class QuantizedInitializer:
     '''
         Represents a linearly quantized weight input from ONNX operators
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index 0ab33442d2f5e..92b8ace220faf 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -35,7 +35,7 @@ def optimize_model(model_path: Path):
     sess_option = SessionOptions()
     sess_option.optimized_model_filepath = opt_model_path.as_posix()
     sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
-    _ = InferenceSession(model_path.as_posix(), sess_option)
+    _ = InferenceSession(model_path.as_posix(), sess_option, providers=['CPUExecutionProvider'])
     optimized_model = onnx.load(opt_model_path.as_posix())
     return optimized_model
 
diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py
index 1503c69faa97c..16ce66467f06a 100644
--- a/onnxruntime/python/tools/quantization/registry.py
+++ b/onnxruntime/python/tools/quantization/registry.py
@@ -11,6 +11,7 @@
 from .operators.gavgpool import QGlobalAveragePool
 from .operators.lstm import LSTMQuant
 from .operators.split import QSplit
+from .operators.pad import QPad
 
 CommonOpsRegistry = {"Gather": GatherQuant, "EmbedLayerNormalization": EmbedLayerNormalizationQuant}
 
@@ -33,7 +34,8 @@
     "Sigmoid": QLinearActivation,
     "MaxPool": QMaxPool,
     "GlobalAveragePool": QGlobalAveragePool,
-    "Split": QSplit,
+    "Split" : QSplit,
+    "Pad" : QPad,
 }
 QLinearOpsRegistry.update(CommonOpsRegistry)
 
diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
index 8960a526d7112..60d5eebb61d10 100644
--- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc
+++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
@@ -455,6 +455,40 @@ TEST(NhwcTransformerTests, ConvSplit) {
   }
 }
 
+TEST(NhwcTransformerTests, ConvPad) {  // QLinearConv -> Pad -> QLinearConv, once per Pad mode
+  std::vector<std::string> pad_modes = {"constant", "reflect", "edge"};
+  for (const auto& mode : pad_modes) {
+    auto build_test_case = [&](NhwcTestHelper& helper) {
+      auto* input_arg = helper.MakeInput<uint8_t>({1, 23, 13, 13});
+      auto* conv1_output_arg = helper.MakeIntermediate();
+      auto* pads_const = helper.MakeScalarInitializer<uint8_t>(131);  // third Pad input (constant_value)
+      auto* pads_arg = helper.Make1DInitializer<int64_t>({0, 0, 1, 2, 0, 0, 3, 4});  // begin (0,0,1,2), end (0,0,3,4)
+      auto* pad_output_arg = helper.MakeIntermediate();
+      auto* conv2_output_arg = helper.MakeIntermediate();
+      auto* output_arg = helper.MakeOutput();
+      // Graph: input -> QLinearConv -> Pad -> QLinearConv -> DequantizeLinear -> output
+      Node& conv1_node = helper.AddQLinearConvNode<uint8_t>(input_arg, .01f, 135,
+                                                            {30, 23, 3, 3}, .02f, 126,
+                                                            conv1_output_arg, .37f, 131);
+      conv1_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+      Node& pad_node = helper.AddNode("Pad", {conv1_output_arg, pads_arg, pads_const}, {pad_output_arg});
+      pad_node.AddAttribute("mode", mode);
+      helper.AddQLinearConvNode<uint8_t>(pad_output_arg, .37f, 131,
+                                         {16, 30, 3, 3}, .015f, 129,
+                                         conv2_output_arg, .37f, 131);
+      helper.AddDequantizeLinearNode(conv2_output_arg, .37f, 131, output_arg);
+    };
+    // Expectations on the transformed graph.
+    auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+      auto op_to_count = CountOpsInGraph(session.GetGraph());
+      EXPECT_EQ(op_to_count["com.microsoft.QLinearConv"], 2);  // both convs are in the com.microsoft domain
+      EXPECT_EQ(op_to_count["Transpose"], 2);  // presumably one into and one out of NHWC, none around Pad
+    };
+
+    NhwcTransformerTester(build_test_case, check_nhwc_graph);
+  }
+}
+
 TEST(NhwcTransformerTests, ConvBlockActivation) {
   auto test_case = [&](uint32_t extra_edges) {
     auto build_test_case = [&](NhwcTestHelper& helper) {
diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
index 86ea66ed84f71..d18dbe7095c1d 100644
--- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc
@@ -360,6 +360,233 @@ TYPED_TEST(PadOpTest, Pad_Reflect_2D) {
                                   "reflect");
 }
 
+TYPED_TEST(PadOpTest, Pad_Constant_3D_Inner_No_Padding) {  // constant mode, innermost axis left unpadded
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({3, 2, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, 1, 0, 1, 1, 0},  // pads: begin = (1, 1, 0), end = (1, 1, 0)
+                                  T(31),  // pad value; fills every padded element of the expected output
+                                  {5, 4, 5},  // expected output shape
+                                  {T(31), T(31), T(31), T(31), T(31),
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(31), T(31), T(31), T(31), T(31),
+
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(31), T(31), T(31), T(31), T(31),
+
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(31), T(31), T(31), T(31), T(31),
+
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(31), T(31), T(31), T(31), T(31),
+
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(31), T(31), T(31), T(31), T(31),
+                                   T(31), T(31), T(31), T(31), T(31)},
+                                  "constant");
+}
+
+TYPED_TEST(PadOpTest, Pad_Edge_3D_Inner_No_Padding) {  // edge mode, innermost axis left unpadded
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({3, 2, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, 1, 0, 1, 1, 0},  // pads: begin = (1, 1, 0), end = (1, 1, 0)
+                                  T(0),  // pad value; does not appear in the expected output for this mode
+                                  {5, 4, 5},  // expected output shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(6), T(7), T(8), T(9), T(10),
+
+                                   T(1), T(2), T(3), T(4), T(5),
+                                   T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(6), T(7), T(8), T(9), T(10),
+
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(16), T(17), T(18), T(19), T(20),
+
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(26), T(27), T(28), T(29), T(30),
+
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  "edge");
+}
+
+TYPED_TEST(PadOpTest, Pad_Edge_3D_Last_Pad_Slice_Inner_No_Padding) {  // edge mode with a negative (cropping) pad
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({3, 2, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, -1, 0, 1, 1, 0},  // pads: begin = (1, -1, 0), end = (1, 1, 0); -1 crops axis 1
+                                  T(0),  // pad value; does not appear in the expected output for this mode
+                                  {5, 2, 5},  // expected output shape
+                                  {T(6), T(7), T(8), T(9), T(10),
+                                   T(6), T(7), T(8), T(9), T(10),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(6), T(7), T(8), T(9), T(10),
+
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(16), T(17), T(18), T(19), T(20),
+
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(26), T(27), T(28), T(29), T(30),
+
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  "edge");
+}
+
+TYPED_TEST(PadOpTest, Pad_Edge_3D_Last_Slice_Inner_No_Padding) {  // edge mode; axis 1 is only cropped
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({2, 3, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, -1, 0, 1, 0, 0},  // pads: begin = (1, -1, 0), end = (1, 0, 0); -1 crops axis 1
+                                  T(0),  // pad value; does not appear in the expected output for this mode
+                                  {4, 2, 5},  // expected output shape
+                                  {T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  "edge");
+}
+
+TYPED_TEST(PadOpTest, Pad_Reflect_3D_Inner_No_Padding) {  // reflect mode, innermost axis left unpadded
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({3, 2, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, 1, 0, 1, 1, 0},  // pads: begin = (1, 1, 0), end = (1, 1, 0)
+                                  T(0),  // pad value; does not appear in the expected output for this mode
+                                  {5, 4, 5},  // expected output shape
+                                  {T(16), T(17), T(18), T(19), T(20),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(11), T(12), T(13), T(14), T(15),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(1), T(2), T(3), T(4), T(5),
+
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(11), T(12), T(13), T(14), T(15),
+
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(21), T(22), T(23), T(24), T(25),
+
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(11), T(12), T(13), T(14), T(15)},
+                                  "reflect");
+}
+
+TYPED_TEST(PadOpTest, Pad_Reflect_3D_Last_Pad_Slice_Inner_No_Padding) {  // reflect mode with a negative pad
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({2, 3, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, -1, 0, 1, 1, 0},  // pads: begin = (1, -1, 0), end = (1, 1, 0); -1 crops axis 1
+                                  T(0),  // pad value; does not appear in the expected output for this mode
+                                  {4, 3, 5},  // expected output shape
+                                  {T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(21), T(22), T(23), T(24), T(25),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(6), T(7), T(8), T(9), T(10),
+
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+                                   T(21), T(22), T(23), T(24), T(25),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(6), T(7), T(8), T(9), T(10)},
+                                  "reflect");
+}
+
+TYPED_TEST(PadOpTest, Pad_Reflect_3D_Last_Slice_Inner_No_Padding) {  // reflect mode; axis 1 is only cropped
+  using T = TypeParam;
+  RunAllOpsetAllDomainPadTests<T>({2, 3, 5},  // input shape
+                                  {T(1), T(2), T(3), T(4), T(5),
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+                                   T(16), T(17), T(18), T(19), T(20),
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30)},
+                                  {1, -1, 0, 1, 0, 0},  // pads: begin = (1, -1, 0), end = (1, 0, 0); -1 crops axis 1
+                                  T(0),  // pad value; does not appear in the expected output for this mode
+                                  {4, 2, 5},  // expected output shape
+                                  {T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15),
+
+                                   T(21), T(22), T(23), T(24), T(25),
+                                   T(26), T(27), T(28), T(29), T(30),
+
+                                   T(6), T(7), T(8), T(9), T(10),
+                                   T(11), T(12), T(13), T(14), T(15)},
+                                  "reflect");
+}
 
 /*
 Example numpy for testing behavior
diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py
new file mode 100644
index 0000000000000..31c2a32718592
--- /dev/null
+++ b/onnxruntime/test/python/quantization/op_test_utils.py
@@ -0,0 +1,46 @@
+
+import onnx
+import numpy as np
+import onnxruntime
+from pathlib import Path
+from onnxruntime.quantization import CalibrationDataReader
+
+
+class TestDataFeeds(CalibrationDataReader):
+    def __init__(self, data_feeds):
+        '''
+        parameter data_feeds: list of input feed, each input feed is diction of {input_name: np_array}
+        '''
+        self.data_feeds = data_feeds
+        self.iter_next = iter(self.data_feeds)
+
+    def get_next(self):
+        return next(self.iter_next, None)
+
+    def rewind(self):
+        self.iter_next = iter(self.data_feeds)
+
+
+def check_op_type_count(testcase, model_path, **kwargs):
+    model = onnx.load(Path(model_path))
+    optype2count = {}
+    for op_type in kwargs:
+        optype2count[op_type] = 0
+    for node in model.graph.node:
+        if node.op_type in optype2count:
+            optype2count[node.op_type] += 1
+    for op_type in kwargs:
+        testcase.assertEqual(kwargs[op_type], optype2count[op_type], 'op_type {} count not same'.format(op_type))
+
+
+def check_model_correctness(testcase, model_path_origin, model_path_to_check, inputs, rtol=1e-2, atol=0.05):
+    sess_options = onnxruntime.SessionOptions()
+    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+    origin_sess = onnxruntime.InferenceSession(model_path_origin, sess_options=sess_options, providers=["CPUExecutionProvider"])
+    origin_results = origin_sess.run([], inputs)
+    target_sess = onnxruntime.InferenceSession(model_path_to_check, sess_options=sess_options,providers=["CPUExecutionProvider"])
+    target_results = target_sess.run([], inputs)
+    testcase.assertEqual(len(origin_results), len(target_results), 'result count are different')
+    for idx, ref_output in enumerate(origin_results):
+        output = target_results[idx]
+        np.testing.assert_allclose(ref_output, output, rtol=rtol, atol=atol)
diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py
new file mode 100644
index 0000000000000..3bbc8fc38f272
--- /dev/null
+++ b/onnxruntime/test/python/quantization/test_op_pad.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+# coding: utf-8
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+import unittest
+import onnx
+import numpy as np
+from onnx import helper, TensorProto
+from onnxruntime.quantization import quantize_static, quantize_dynamic
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
+
+
+class TestOpQuatizerPad(unittest.TestCase):
+    def input_feeds(self, n, name2shape):
+        input_data_list = []
+        for i in range(n):
+            inputs = {}
+            for name, shape in name2shape.items():
+                inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)})
+            input_data_list.extend([inputs])
+        dr = TestDataFeeds(input_data_list)
+        return dr
+
+    def construct_model_pad(self, output_model_path, pad_mode, pad_input_shape, pad_dims, constant_value=None):
+        #    (input)
+        #      |
+        #     Pad
+        #      |
+        #    (output)
+        rank = len(pad_input_shape)
+        self.assertEqual(rank * 2, len(pad_dims))
+
+        input_tensor = helper.make_tensor_value_info('input', TensorProto.FLOAT, pad_input_shape)
+        pad_dims_initializer = helper.make_tensor('pad_dims', TensorProto.INT64, [2 * rank], pad_dims)
+        output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:]))]
+        output_tensor = helper.make_tensor_value_info('output', TensorProto.FLOAT, output_shape)
+
+        inputs = ['input', 'pad_dims']
+        initializers = [pad_dims_initializer]
+        if (constant_value is not None) and (pad_mode is None or pad_mode == 'constant'):
+            constant_value_tensor = helper.make_tensor('padding_value', TensorProto.FLOAT, [], [constant_value])
+            inputs.extend(['padding_value'])
+            initializers.extend([constant_value_tensor])
+        kwargs = {'mode': pad_mode} if pad_mode is not None else {}
+        pad_node = helper.make_node('Pad', inputs, ['output'], name='PadNode', **kwargs)
+
+        graph = helper.make_graph([pad_node], 'TestOpQuantizerPad_test_model',
+                                  [input_tensor], [output_tensor], initializer=initializers)
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = onnx.IR_VERSION
+
+        onnx.save(model, output_model_path)
+
+    def construct_model_conv_pad(self, output_model_path, conv_input_shape, conv_weight_shape,
+                                 pad_input_shape, pad_mode, pad_dims, constant_value=None):
+        #      (input)
+        #          \
+        #         Conv
+        #        /    \
+        #   Identity   Pad
+        #    /            \
+        # (identity_out)  (output)
+        rank = len(pad_input_shape)
+        self.assertEqual(rank * 2, len(pad_dims))
+
+        input_tensor = helper.make_tensor_value_info('input', TensorProto.FLOAT, conv_input_shape)
+
+        conv_weight_arr = np.random.randint(-1, 2, conv_weight_shape).astype(np.float32)
+        conv_weight_initializer = onnx.numpy_helper.from_array(conv_weight_arr, name='conv1_weight')
+        conv_node = onnx.helper.make_node('Conv', ['input', 'conv1_weight'], ['conv_output'], name='conv_node')
+
+        identity_out = helper.make_tensor_value_info('identity_out', TensorProto.FLOAT, pad_input_shape)
+        identity_node = helper.make_node('Identity', ['conv_output'], ['identity_out'], name='IdentityNode')
+
+        pad_dims_initializer = helper.make_tensor('pad_dims', TensorProto.INT64, [2 * rank], pad_dims)
+        output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:]))]
+        output_tensor = helper.make_tensor_value_info('output', TensorProto.FLOAT, output_shape)
+        pad_inputs = ['conv_output', 'pad_dims']
+        initializers = [conv_weight_initializer, pad_dims_initializer]
+        if (constant_value is not None) and (pad_mode is None or pad_mode == 'constant'):
+            constant_value_tensor = helper.make_tensor('padding_value', TensorProto.FLOAT, [], [constant_value])
+            pad_inputs.extend(['padding_value'])
+            initializers.extend([constant_value_tensor])
+        kwargs = {'mode': pad_mode} if pad_mode is not None else {}
+        pad_node = helper.make_node('Pad', pad_inputs, ['output'], name='pad_node', **kwargs)
+
+        graph = helper.make_graph([conv_node, identity_node, pad_node], 'TestOpQuantizerPad_test_model',
+                                  [input_tensor], [identity_out, output_tensor], initializer=initializers)
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = onnx.IR_VERSION
+        onnx.save(model, output_model_path)
+
+    def quantize_mode(self, model_fp32_path, model_i8_path, data_reader=None):
+        if data_reader is not None:
+            quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True)
+        else:
+            quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True)
+
+    def verify_should_not_trigger(self, quantize_mode='static'):
+        np.random.seed(108)
+        model_fp32_path = 'qop_pad_notrigger_fp32_{}.onnx'.format(quantize_mode)
+        model_i8_path = 'qop_pad_notrigger_i8_{}.onnx'.format(quantize_mode)
+        data_reader = self.input_feeds(1, {'input': [1, 16, 31, 31]})
+        self.construct_model_pad(model_fp32_path, 'constant', [1, 16, 31, 31], [0, 0, 1, 2, 0, 0, 3, 4])
+        self.quantize_mode(model_fp32_path, model_i8_path, None if quantize_mode != 'static' else data_reader)
+        data_reader.rewind()
+        # DequantizeLinear=0 pad node is not been quantized as input is not quantized.
+        check_op_type_count(self, model_i8_path, DynamicQuantizeLinear=0, QuantizeLinear=0, DequantizeLinear=0)
+        check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next())
+
+    def test_static_quantize_no_trigger(self):
+        self.verify_should_not_trigger(quantize_mode='static')
+
+    def test_dynamic_quantize_no_trigger(self):
+        self.verify_should_not_trigger(quantize_mode='dynamic')
+
+    def verify_quantize_with_pad_mode(self, pad_mode, constant_value=None, quantize_mode='static'):
+        np.random.seed(108)
+        tag_pad_mode = pad_mode if pad_mode is not None else 'none'
+        tag_constant_value = '' if constant_value is None else '_value'
+        model_fp32_path = 'qop_pad_{}_fp32_{}{}.onnx'.format(quantize_mode, tag_pad_mode, tag_constant_value)
+        model_i8_path = 'qop_pad_{}_i8_{}{}.onnx'.format(quantize_mode, tag_pad_mode, tag_constant_value)
+        data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
+        self.construct_model_conv_pad(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31],
+                                      pad_mode, [0, 0, 1, 2, 0, 0, 3, 4], constant_value=constant_value)
+        self.quantize_mode(model_fp32_path, model_i8_path, None if quantize_mode != 'static' else data_reader)
+        data_reader.rewind()
+        # DequantizeLinear=2 means there are one DequantizeLinear Node aftr both conv and pad,
+        # which means pad node is running in quantized semantic.
+        # In dynamic quantize mode, pad operator in fact not quantized as input is fp32.
+        kwargs = {'DynamicQuantizeLinear': 1} if quantize_mode != 'static' else {'DequantizeLinear': 2, 'QuantizeLinear': 1}
+        check_op_type_count(self, model_i8_path, **kwargs)
+        check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next())
+
+    def test_static_mode_edge(self):
+        self.verify_quantize_with_pad_mode('edge', constant_value=None)
+
+    def test_static_mode_reflect(self):
+        self.verify_quantize_with_pad_mode('reflect', constant_value=None)
+
+    def test_static_mode_constant_default(self):
+        self.verify_quantize_with_pad_mode('constant', constant_value=None)
+
+    def test_static_mode_constant_value(self):
+        self.verify_quantize_with_pad_mode('constant', constant_value=3.75)
+
+    def test_dynamic_mode_edge(self):
+        self.verify_quantize_with_pad_mode('edge', constant_value=None, quantize_mode='dynamic')
+
+    def test_dynamic_mode_reflect(self):
+        self.verify_quantize_with_pad_mode('reflect', constant_value=None, quantize_mode='dynamic')
+
+    def test_dynamic_mode_constant_default(self):
+        self.verify_quantize_with_pad_mode('constant', constant_value=None, quantize_mode='dynamic')
+
+    def test_dynamic_mode_constant_value(self):
+        self.verify_quantize_with_pad_mode('constant', constant_value=3.75, quantize_mode='dynamic')
+
+
+# Run this module's test cases through unittest's command-line runner when executed as a script.
+if __name__ == '__main__':
+    unittest.main()