diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index b99f77fdc8b30..b5f359293b865 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1844,9 +1844,42 @@ bool arith::IndexCastOp::areCastCompatible(TypeRange inputs,
   return areIndexCastCompatible(inputs, outputs);
 }
 
+/// Bit width of `type`'s element type; `index` counts as `indexBitwidth`.
+static unsigned getBitwidth(Type type, unsigned indexBitwidth) {
+  Type scalarTy = getElementTypeOrSelf(type);
+  return isa<IndexType>(scalarTy) ? indexBitwidth
+                                  : scalarTy.getIntOrFloatBitWidth();
+}
+
+/// Marks a non-`exact` index cast as `exact` when its source is no wider than
+/// its result; `index` is sized by the data layout closest to the op.
+template <typename CastOp>
+struct InferExactOnIndexCast final : OpRewritePattern<CastOp> {
+  InferExactOnIndexCast(MLIRContext *context)
+      : OpRewritePattern<CastOp>(context) {}
+
+  LogicalResult matchAndRewrite(CastOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getExact())
+      return failure(); // Already flagged; nothing to rewrite.
+    DataLayout dataLayout = DataLayout::closest(op);
+    const unsigned indexWidth =
+        dataLayout.getTypeSizeInBits(IndexType::get(op.getContext()));
+    const unsigned inWidth = getBitwidth(op.getIn().getType(), indexWidth);
+    if (inWidth > getBitwidth(op.getType(), indexWidth))
+      return rewriter.notifyMatchFailure(op, "source is wider than dest");
+
+    rewriter.modifyOpInPlace(op, [&op] { op.setExact(true); });
+    return success();
+  }
+};
+
 OpFoldResult arith::IndexCastOp::fold(FoldAdaptor adaptor) {
   // index_cast(constant) -> constant
-  unsigned resultBitwidth = 64; // Default for index integer attributes.
+  DataLayout layout = DataLayout::closest(*this);
+  // Index results take their width from the closest data layout.
+  unsigned resultBitwidth =
+      layout.getTypeSizeInBits(IndexType::get(this->getContext()));
   if (auto intTy = dyn_cast<IntegerType>(getElementTypeOrSelf(getType())))
     resultBitwidth = intTy.getWidth();
 
@@ -1859,7 +1892,8 @@ OpFoldResult arith::IndexCastOp::fold(FoldAdaptor adaptor) {
 
 void arith::IndexCastOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<IndexCastOfIndexCast, IndexCastOfExtSI>(context);
+  patterns.add<InferExactOnIndexCast<IndexCastOp>>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1873,7 +1907,10 @@ bool arith::IndexCastUIOp::areCastCompatible(TypeRange inputs,
 
 OpFoldResult arith::IndexCastUIOp::fold(FoldAdaptor adaptor) {
   // index_castui(constant) -> constant
-  unsigned resultBitwidth = 64; // Default for index integer attributes.
+  DataLayout layout = DataLayout::closest(*this);
+  // Index results take their width from the closest data layout.
+  unsigned resultBitwidth =
+      layout.getTypeSizeInBits(IndexType::get(this->getContext()));
   if (auto intTy = dyn_cast<IntegerType>(getElementTypeOrSelf(getType())))
     resultBitwidth = intTy.getWidth();
 
@@ -1886,7 +1923,8 @@ OpFoldResult arith::IndexCastUIOp::fold(FoldAdaptor adaptor) {
 
 void arith::IndexCastUIOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<IndexCastUIOfIndexCastUI, IndexCastUIOfExtUI>(context);
+  patterns.add<InferExactOnIndexCast<IndexCastUIOp>>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir b/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
index fd8910265cd89..58a1d91cf0cad 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/arm-sme-to-llvm.mlir
@@ -588,7 +588,7 @@ func.func @arm_sme_extract_tile_slice_ver_i128(%tile_slice_index : index) -> vec
 // CHECK-LABEL: @arm_sme_streaming_vl_bytes
 // CHECK: %[[CONST:.*]] = arith.constant 8 : index
 // CHECK: %[[CNTSD:.*]] = "arm_sme.intr.cntsd"() : () -> i64
-// CHECK: %[[CNTSD_IDX:.*]] = arith.index_cast %[[CNTSD]] : i64 to index
+// CHECK: %[[CNTSD_IDX:.*]] = arith.index_cast %[[CNTSD]] exact : i64 to index
 // CHECK: %[[MUL:.*]] = arith.muli %[[CNTSD_IDX]], %[[CONST]] : index
 func.func @arm_sme_streaming_vl_bytes() -> index {
   %svl_b = arm_sme.streaming_vl <byte>
@@ -600,7 +600,7 @@ func.func @arm_sme_streaming_vl_bytes() -> index {
 // CHECK-LABEL: @arm_sme_streaming_vl_half_words
 // CHECK: %[[CONST:.*]] = arith.constant 4 : index
 // CHECK: %[[CNTSD:.*]] = "arm_sme.intr.cntsd"() : () -> i64
-// CHECK: %[[CNTSD_IDX:.*]] = arith.index_cast %[[CNTSD]] : i64 to index
+// CHECK: %[[CNTSD_IDX:.*]] = arith.index_cast %[[CNTSD]] exact : i64 to index
 // CHECK: %[[MUL:.*]] = arith.muli %[[CNTSD_IDX]], %[[CONST]] : index
 func.func @arm_sme_streaming_vl_half_words() -> index {
   %svl_h = arm_sme.streaming_vl <half>
@@ -612,7 +612,7 @@ func.func @arm_sme_streaming_vl_half_words() -> index {
 // CHECK-LABEL: @arm_sme_streaming_vl_words
 // CHECK: %[[CONST:.*]] = arith.constant 2 : index
 // CHECK: %[[CNTSD:.*]] = "arm_sme.intr.cntsd"() : () -> i64
-// CHECK: %[[CNTSD_IDX:.*]] = arith.index_cast %[[CNTSD]] : i64 to index
+// CHECK: %[[CNTSD_IDX:.*]] = arith.index_cast %[[CNTSD]] exact : i64 to index
 // CHECK: %[[MUL:.*]] = arith.muli %[[CNTSD_IDX]], %[[CONST]] : index
 func.func @arm_sme_streaming_vl_words() -> index {
   %svl_w = arm_sme.streaming_vl <word>
diff --git a/mlir/test/Conversion/ShardToMPI/convert-shard-to-mpi.mlir b/mlir/test/Conversion/ShardToMPI/convert-shard-to-mpi.mlir
index 08c3897e4e650..3fa9b472f670a 100644
--- a/mlir/test/Conversion/ShardToMPI/convert-shard-to-mpi.mlir
+++ b/mlir/test/Conversion/ShardToMPI/convert-shard-to-mpi.mlir
@@ -25,7 +25,7 @@ func.func @process_multi_index_reorder() -> (index, index) {
 // CHECK-LABEL: func @process_linear_index
 func.func @process_linear_index() -> index {
   // CHECK: %[[RES:.*]], %[[rank:.*]] = mpi.comm_rank
-  // CHECK: %[[cast:.*]] = arith.index_cast %[[rank]] : i32 to index
+  // CHECK: %[[cast:.*]] = arith.index_cast %[[rank]] exact : i32 to index
   %0 = shard.process_linear_index on @grid0 : index
   // CHECK: return %[[cast]] : index
   return %0 : index
@@ -97,7 +97,7 @@ module {
   func.func @all_slice(%arg0 : tensor<3x5xf32>) -> tensor<3x1xf32> {
     // CHECK: [[v0:%.*]] = mpi.comm_world : !mpi.comm
     // CHECK: [[vretval:%.*]], [[vrank:%.*]] = mpi.comm_rank([[v0]]) : !mpi.retval, i32
-    // CHECK: [[v1:%.*]] = arith.index_cast [[vrank]] : i32 to index
+    // CHECK: [[v1:%.*]] = arith.index_cast [[vrank]] exact : i32 to index
     // CHECK: [[v2:%.*]]:3 = affine.delinearize_index [[v1]] into (3, 4, 5) : index, index, index
     // CHECK: [[vextracted_slice:%.*]] = tensor.extract_slice
     // CHECK-SAME: [0, [[v2]]#2] [3, 1] [1, 1] : tensor<3x5xf32> to tensor<3x1xf32>
@@ -203,7 +203,7 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 7> } {
     // CHECK: [[v1:%.*]] = mpi.comm_world : !mpi.comm
     // CHECK: [[vnewcomm:%.*]] = mpi.comm_split([[v1]], [[vc2_i32]], [[vc1_i32]]) : !mpi.comm
     // CHECK: [[vsize:%.*]] = mpi.comm_size([[vnewcomm]]) : i32
-    // CHECK: [[v2:%.*]] = arith.index_cast [[vsize]] : i32 to index
+    // CHECK: [[v2:%.*]] = arith.index_cast [[vsize]] exact : i32 to index
     // CHECK: [[v3:%.*]] = arith.cmpi eq, [[v2]], [[vc4]] : index
     // CHECK: cf.assert [[v3]]
     // CHECK: [[valloc:%.*]] = memref.alloc() : memref<4x3x4xf32>
@@ -227,7 +227,7 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 7> } {
     // CHECK: [[v1:%.*]] = mpi.comm_world : !mpi.comm
     // CHECK: [[vnewcomm:%.*]] = mpi.comm_split([[v1]], [[vc1_i32]], [[vc2_i32]]) : !mpi.comm
     // CHECK: [[vsize:%.*]] = mpi.comm_size([[vnewcomm]]) : i32
-    // CHECK: [[v2:%.*]] = arith.index_cast [[vsize]] : i32 to index
+    // CHECK: [[v2:%.*]] = arith.index_cast [[vsize]] exact : i32 to index
     // CHECK: [[v3:%.*]] = arith.cmpi eq, [[v2]], [[vc5]] : index
     // CHECK: cf.assert [[v3]]
     // CHECK: [[valloc:%.*]] = memref.alloc() : memref<5x3x4xf32>
@@ -252,7 +252,7 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 7> } {
     // CHECK: [[v0:%.*]] = mpi.comm_world : !mpi.comm
     // CHECK: [[vnewcomm:%.*]] = mpi.comm_split([[v0]], [[vc1_i32]], [[vc2_i32]]) : !mpi.comm
     // CHECK: [[vsize:%.*]] = mpi.comm_size([[vnewcomm]]) : i32
-    // CHECK: [[v1:%.*]] = arith.index_cast [[vsize]] : i32 to index
+    // CHECK: [[v1:%.*]] = arith.index_cast [[vsize]] exact : i32 to index
     // CHECK: [[v2:%.*]] = arith.cmpi eq, [[v1]], [[vc5]] : index
     // CHECK: cf.assert [[v2]]
     // CHECK: [[valloc:%.*]] = memref.alloc() : memref<5x3x4xf32>
@@ -509,7 +509,7 @@ func.func @mlp_1dgrid(%arg0: tensor<512x512xf32>, %arg1: tensor<2048x256xf32>, %
   // CHECK: [[v0:%.*]] = bufferization.to_buffer [[varg0]] : tensor<512x512xf32> to memref<512x512xf32>
   // CHECK: [[v1:%.*]] = mpi.comm_world : !mpi.comm
   // CHECK: [[vsize:%.*]] = mpi.comm_size([[v1]]) : i32
-  // CHECK: [[v2:%.*]] = arith.index_cast [[vsize]] : i32 to index
+  // CHECK: [[v2:%.*]] = arith.index_cast [[vsize]] exact : i32 to index
   // CHECK: [[v3:%.*]] = arith.cmpi eq, [[v2]], [[vc4]] : index
   // CHECK: cf.assert [[v3]]
   // CHECK: [[valloc:%.*]] = memref.alloc() : memref<4x512x512xf32>
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_1d.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_1d.mlir
index aebec7fc27c78..1b563a9bfc8eb 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_1d.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_1d.mlir
@@ -10,11 +10,11 @@ gpu.module @load_store_check {
         // CHECK: %[[SRCCE:.*]] = memref.memory_space_cast %[[SRC]] : memref<512xf32, 1> to memref<512xf32>
         %srcce = memref.memory_space_cast %src : memref<512xf32, 1> to memref<512xf32>
         // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[SRCCE]] : memref<512xf32> -> index
-        // CHECK: %[[INTPTR_I64:.*]] = arith.index_castui %[[INTPTR]] : index to i64
+        // CHECK: %[[INTPTR_I64:.*]] = arith.index_castui %[[INTPTR]] exact : index to i64
         // CHECK: %[[DSTTE:.*]] = memref.memory_space_cast %[[DST]] : memref<256xf32, 1> to memref<256xf32>
         %dstte = memref.memory_space_cast %dst : memref<256xf32, 1> to memref<256xf32>
         // CHECK: %[[INTPTR1:.*]] = memref.extract_aligned_pointer_as_index %[[DSTTE]] : memref<256xf32> -> index
-        // CHECK: %[[INTPTR1_I64:.*]] = arith.index_castui %[[INTPTR1]] : index to i64
+        // CHECK: %[[INTPTR1_I64:.*]] = arith.index_castui %[[INTPTR1]] exact : index to i64
 
         %src_tdesc = xegpu.create_nd_tdesc %srcce : memref<512xf32> -> !xegpu.tensor_desc<32xf32>
         // CHECK: %[[ADDR:.*]] = arith.addi %[[INTPTR_I64]], %[[C384]] : i64
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_sub_byte.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_sub_byte.mlir
index 97e5ce14f8539..d0d78697326c6 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_sub_byte.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_sub_byte.mlir
@@ -11,11 +11,11 @@ gpu.module @load_store_check {
         // CHECK: %[[C128_I32:.*]] = arith.constant 128 : i32
         // CHECK: %[[SRCCE:.*]] = memref.memory_space_cast %[[ARG0]]
         // CHECK: %[[SRCINDEX:.*]] = memref.extract_aligned_pointer_as_index %[[SRCCE]]
-        // CHECK: %[[SRCPTR64:.*]] = arith.index_castui %[[SRCINDEX]] : index to i64
+        // CHECK: %[[SRCPTR64:.*]] = arith.index_castui %[[SRCINDEX]] exact : index to i64
         %srcce = memref.memory_space_cast %src : memref<16x128xi4, 1> to memref<16x128xi4>
         // CHECK: %[[DSTTE:.*]] = memref.memory_space_cast %[[ARG1]]
         // CHECK: %[[DSTINDEX:.*]] = memref.extract_aligned_pointer_as_index %[[DSTTE]]
-        // CHECK: %[[DSTPTR64:.*]] = arith.index_castui %[[DSTINDEX]] : index to i64
+        // CHECK: %[[DSTPTR64:.*]] = arith.index_castui %[[DSTINDEX]] exact : index to i64
         %dstte = memref.memory_space_cast %dst : memref<16x128xi4, 1> to memref<16x128xi4>
 
         // CHECK: %[[PAYLOAD_SRC:.*]] = vector.insert %[[SRCPTR64]], %[[CST]] [0] : i64 into vector<4xi64>
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir
index 4c84699a069f0..2ff956901e11c 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir
@@ -10,7 +10,7 @@ gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>,
   // CHECK: %[[C2_I64:.*]] = arith.constant 2 : i64
   // CHECK: %[[VAR2:.*]] = vector.extract %[[ARG3]][0] : i1 from vector<1xi1>
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
-  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
+  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] exact : index to i64
   // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C2_I64]] : i64
   // CHECK: %[[VAR4:.*]] = arith.addi %[[ARG0]], %[[VAR3]] : i64
   // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR4]] : i64 to !llvm.ptr<1>
@@ -56,7 +56,7 @@ gpu.func @store_scatter_i64_src_value_offset(%src: i64, %offset: vector<1xindex>
   // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR2:.*]] = vector.extract %[[ARG2]][0] : i1 from vector<1xi1>
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
-  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
+  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] exact : index to i64
   %0 = arith.constant dense<2.9>: vector<1xf32>
   // CHECK: %[[VAR4:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
   // CHECK: %[[VAR5:.*]] = arith.addi %[[ARG0]], %[[VAR4]] : i64
@@ -77,7 +77,7 @@ gpu.module @test {
 gpu.func @prefetch_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
   // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
-  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
+  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] exact : index to i64
   // CHECK: %[[VAR2:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
   // CHECK: %[[VAR3:.*]] = arith.addi %[[ARG0]], %[[VAR2]] : i64
   // CHECK: %[[VAR4:.*]] = llvm.inttoptr %[[VAR3]] : i64 to !llvm.ptr<1>
@@ -95,9 +95,9 @@ gpu.module @test {
 gpu.func @prefetch_memref_src_value_offset(%src: memref<256xf32>, %offset: vector<1xindex>) {
   // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
   // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
-  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
+  // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] exact : index to i64
   // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<256xf32> -> index
-  // CHECK: %[[VAR2:.*]] = arith.index_castui %[[INTPTR]] : index to i64
+  // CHECK: %[[VAR2:.*]] = arith.index_castui %[[INTPTR]] exact : index to i64
   // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
   // CHECK: %[[VAR4:.*]] = arith.addi %[[VAR2]], %[[VAR3]] : i64
   // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR4]] : i64 to !llvm.ptr<1>
@@ -118,8 +118,8 @@ gpu.func @load_gather_from_dyn_memref_subview(%dyn: memref<?xf16>, %offset: vect
 
   // CHECK: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]], %[[STRIDES:.*]] = memref.extract_strided_metadata %{{.*}} : memref<16xf16, strided<[1], offset: ?>> -> memref<f16>, index, index, index
   // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[BASE]] : memref<f16> -> index
-  // CHECK: %[[CAST1:.*]] = arith.index_castui %[[INTPTR]] : index to i64
-  // CHECK: %[[CAST2:.*]] = arith.index_castui %[[OFFSET]] : index to i64
+  // CHECK: %[[CAST1:.*]] = arith.index_castui %[[INTPTR]] exact : index to i64
+  // CHECK: %[[CAST2:.*]] = arith.index_castui %[[OFFSET]] exact : index to i64
   // CHECK: %[[MUL1:.*]] = arith.muli %[[CAST2]], %{{.*}} : i64
   // CHECK: %[[ADD1:.*]] = arith.addi %[[CAST1]], %[[MUL1]] : i64
   // CHECK: %[[MUL2:.*]] = arith.muli %{{.*}}, %{{.*}} : i64
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 326afcae696cc..51e29214d1c24 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -580,7 +580,7 @@ func.func @orOfExtUI_nneg_mixed(%arg0: i8, %arg1: i8) -> i64 {
 // -----
 
 // CHECK-LABEL: @indexCastOfSignExtend
-//       CHECK:   %[[res:.+]] = arith.index_cast %arg0 : i8 to index
+//       CHECK:   %[[res:.+]] = arith.index_cast %arg0 exact : i8 to index
 //       CHECK:   return %[[res]]
 func.func @indexCastOfSignExtend(%arg0: i8) -> index {
   %ext = arith.extsi %arg0 : i8 to i16
@@ -598,7 +598,7 @@ func.func @indexCastOfSignExtend_exact(%arg0: i8) -> index {
 }
 
 // CHECK-LABEL: @indexCastUIOfUnsignedExtend
-//       CHECK:   %[[res:.+]] = arith.index_castui %arg0 : i8 to index
+//       CHECK:   %[[res:.+]] = arith.index_castui %arg0 exact : i8 to index
 //       CHECK:   return %[[res]]
 func.func @indexCastUIOfUnsignedExtend(%arg0: i8) -> index {
   %ext = arith.extui %arg0 : i8 to i16
@@ -607,7 +607,7 @@ func.func @indexCastUIOfUnsignedExtend(%arg0: i8) -> index {
 }
 
 // CHECK-LABEL: @indexCastUIOfUnsignedExtend_nneg_on_extui
-//       CHECK:   %[[res:.+]] = arith.index_castui %arg0 nneg : i8 to index
+//       CHECK:   %[[res:.+]] = arith.index_castui %arg0 exact nneg : i8 to index
 //       CHECK:   return %[[res]]
 func.func @indexCastUIOfUnsignedExtend_nneg_on_extui(%arg0: i8) -> index {
   %ext = arith.extui %arg0 nneg : i8 to i16
@@ -616,7 +616,7 @@ func.func @indexCastUIOfUnsignedExtend_nneg_on_extui(%arg0: i8) -> index {
 }
 
 // CHECK-LABEL: @indexCastUIOfUnsignedExtend_nneg_on_castui
-//       CHECK:   %[[res:.+]] = arith.index_castui %arg0 : i8 to index
+//       CHECK:   %[[res:.+]] = arith.index_castui %arg0 exact : i8 to index
 //   CHECK-NOT:   nneg
 //       CHECK:   return %[[res]]
 func.func @indexCastUIOfUnsignedExtend_nneg_on_castui(%arg0: i8) -> index {
@@ -647,18 +647,18 @@ func.func @indexCastUIOfUnsignedExtend_nneg_exact(%arg0: i8) -> index {
 // CHECK-LABEL: @indexCastUIOfIndexCastUI_no_exact
 //       CHECK:   arith.index_castui
 //       CHECK:   arith.index_castui
-func.func @indexCastUIOfIndexCastUI_no_exact(%arg0: i32) -> i32 {
-  %idx = arith.index_castui %arg0 : i32 to index
-  %res = arith.index_castui %idx : index to i32
-  return %res : i32
+func.func @indexCastUIOfIndexCastUI_no_exact(%arg0: i128) -> i128 {
+  %idx = arith.index_castui %arg0 : i128 to index // NOTE(review): i128 is presumably wider than index here, so exact is not inferred on this cast - confirm.
+  %res = arith.index_castui %idx : index to i128
+  return %res : i128
 }
 
 // CHECK-LABEL: @indexCastUIOfIndexCastUI_exact_inner
-//       CHECK:   return %arg0 : i32
-func.func @indexCastUIOfIndexCastUI_exact_inner(%arg0: i32) -> i32 {
-  %idx = arith.index_castui %arg0 exact : i32 to index
-  %res = arith.index_castui %idx : index to i32
-  return %res : i32
+//       CHECK:   return %arg0 : i128
+func.func @indexCastUIOfIndexCastUI_exact_inner(%arg0: i128) -> i128 {
+  %idx = arith.index_castui %arg0 exact : i128 to index // NOTE(review): i128 presumably keeps this a narrowing cast, so exact here is explicit rather than inferred - confirm.
+  %res = arith.index_castui %idx : index to i128
+  return %res : i128
 }
 
 // exact on outer only does NOT trigger the fold (outer exact on widening
@@ -666,10 +666,10 @@ func.func @indexCastUIOfIndexCastUI_exact_inner(%arg0: i32) -> i32 {
 // CHECK-LABEL: @indexCastUIOfIndexCastUI_exact_outer
 //       CHECK:   arith.index_castui
 //       CHECK:   arith.index_castui
-func.func @indexCastUIOfIndexCastUI_exact_outer(%arg0: i32) -> i32 {
-  %idx = arith.index_castui %arg0 : i32 to index
-  %res = arith.index_castui %idx exact : index to i32
-  return %res : i32
+func.func @indexCastUIOfIndexCastUI_exact_outer(%arg0: i128) -> i128 {
+  %idx = arith.index_castui %arg0 : i128 to index // NOTE(review): i128 is presumably wider than index here, so exact is not inferred on this inner cast - confirm.
+  %res = arith.index_castui %idx exact : index to i128
+  return %res : i128
 }
 
 // CHECK-LABEL: @indexCastUIOfIndexCastUI_exact_both
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index 465e8fdd66422..a054cc49871b2 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -792,12 +792,12 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
       //       CHECK:     %[[LIN_W:.*]] = affine.apply #[[$MAP_LIN_W]]()[%[[TIDX]], %[[TIDY]]]
       //
       // Compute the active warps below using the mask + popcnt
-      //       CHECK:     %[[LIN_W_i64:.*]] = arith.index_castui %[[LIN_W]] : index to i64
+      //       CHECK:     %[[LIN_W_i64:.*]] = arith.index_castui %[[LIN_W]] exact : index to i64
       //       CHECK:     %[[TWO_POW_W:.*]] = arith.shli %[[C1_i64]], %[[LIN_W_i64]] : i64
       //       CHECK:     %[[FILTER_TILL_W:.*]] = arith.subi %[[TWO_POW_W]], %[[C1_i64]] : i64
       //       CHECK:     %[[ACTIVE_TILL_W:.*]] = arith.andi %[[FILTER_TILL_W]], %[[C753_i64]] : i64
       //       CHECK:     %[[LOGICAL_ID_W_i64:.*]] = math.ctpop %[[ACTIVE_TILL_W]] : i64
-      //       CHECK:     %[[LOGICAL_ID_W:.*]] = arith.index_castui %[[LOGICAL_ID_W_i64]] : i64 to index
+      //       CHECK:     %[[LOGICAL_ID_W:.*]] = arith.index_castui %[[LOGICAL_ID_W_i64]] exact : i64 to index
       //
       // Dynamically compute whether this warp is active below using the mask + popcnt
       //       CHECK:     %[[IS_ACTIVE_W_MASK:.*]] = arith.andi %[[TWO_POW_W]], %[[C753_i64]] : i64
diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir
index 22add87ff3ed4..ad703169c112b 100644
--- a/mlir/test/Dialect/Shape/canonicalize.mlir
+++ b/mlir/test/Dialect/Shape/canonicalize.mlir
@@ -1419,7 +1419,7 @@ func.func @shape_of_from_reshape(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>) -
 // CHECK-SAME: %[[INPUT:.*]]: tensor<?x1xf32>
 // CHECK-SAME: %[[SHAPE:.*]]: tensor<3xi32>
 func.func @shape_of_from_reshape_int_to_index(%arg0: tensor<?x1xf32>, %arg1: tensor<3xi32>) -> tensor<3xindex> {
-  // CHECK: %[[CAST_SHAPE:.*]] = arith.index_cast %[[SHAPE]] : tensor<3xi32> to tensor<3xindex>
+  // CHECK: %[[CAST_SHAPE:.*]] = arith.index_cast %[[SHAPE]] exact : tensor<3xi32> to tensor<3xindex>
   // CHECK: return %[[CAST_SHAPE]] : tensor<3xindex>
     %0 = tensor.reshape %arg0(%arg1) : (tensor<?x1xf32>, tensor<3xi32>) -> tensor<?x1x1xf32>
     %1 = shape.shape_of %0 : tensor<?x1x1xf32> -> tensor<3xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
index 35fd7c33e4cfe..6cd71a101246b 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
@@ -26,10 +26,10 @@
 // CHECK-DAG:   %[[mask:.*]] = arith.constant dense<true> : vector<16xi1>
 // CHECK:       %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
 // CHECK:       %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK:       %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
+// CHECK:       %[[q:.*]] = arith.index_cast %[[a]] exact : i64 to index
 // CHECK:       %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
 // CHECK:       %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK:       %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
+// CHECK:       %[[s:.*]] = arith.index_cast %[[b]] exact : i64 to index
 // CHECK:       %[[boundary:.*]] = affine.apply #[[$map0]]()[%[[q]], %[[s]]]
 // CHECK:       scf.for %[[i:.*]] = %[[q]] to %[[boundary]] step %[[c16]] {
 // CHECK:         %[[li:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xi32>, vector<16xi32>
diff --git a/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir b/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir
index 8df8ac6b1675c..a9c35a90af3d9 100644
--- a/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir
+++ b/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir
@@ -18,7 +18,7 @@ func.func @sparse_metadata_init() -> !sparse_tensor.storage_specifier<#CSR> {
 // CHECK-LABEL:   func.func @sparse_get_md(
 // CHECK-SAME:      %[[VAL_0:.*]]: !llvm.struct<(array<2 x i64>, array<3 x i64>)>) -> index {
 // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0, 0] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
-// CHECK:           %[[CAST:.*]] = arith.index_cast %[[VAL_1]] : i64 to index
+// CHECK:           %[[CAST:.*]] = arith.index_cast %[[VAL_1]] exact : i64 to index
 // CHECK:           return %[[CAST]] : index
 func.func @sparse_get_md(%arg0: !sparse_tensor.storage_specifier<#CSR>) -> index {
   %0 = sparse_tensor.storage_specifier.get %arg0 lvl_sz at 0
@@ -29,7 +29,7 @@ func.func @sparse_get_md(%arg0: !sparse_tensor.storage_specifier<#CSR>) -> index
 // CHECK-LABEL:   func.func @sparse_set_md(
 // CHECK-SAME:      %[[VAL_0:.*]]: !llvm.struct<(array<2 x i64>, array<3 x i64>)>,
 // CHECK-SAME:      %[[VAL_1:.*]]: index) -> !llvm.struct<(array<2 x i64>, array<3 x i64>)> {
-// CHECK:           %[[CAST:.*]] = arith.index_cast %[[VAL_1]] : index to i64
+// CHECK:           %[[CAST:.*]] = arith.index_cast %[[VAL_1]] exact : index to i64
 // CHECK:           %[[VAL_2:.*]] = llvm.insertvalue %[[CAST]], %[[VAL_0]][0, 0] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
 // CHECK:           return %[[VAL_2]] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
 func.func @sparse_set_md(%arg0: !sparse_tensor.storage_specifier<#CSR>, %arg1: index)
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 278f02ed033ab..39f5ec48deaaa 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -1535,7 +1535,7 @@ func.func @vector_insert_strided_slice_2d_to_2d(%laneid: index) -> (vector<64x1x
 //       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]]
 //       CHECK-PROP:     %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}]
 //       CHECK-PROP:     %[[EXTRACT:.*]] = vector.shape_cast %[[GATHER]] : vector<1x64xi32> to vector<64xi32>
-//       CHECK-PROP:     %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] : vector<64xi32> to vector<64xindex>
+//       CHECK-PROP:     %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] exact : vector<64xi32> to vector<64xindex>
 //       CHECK-PROP:     %[[EXTRACTELT:.*]] = vector.extract %[[CAST]][{{.*}}] : index from vector<64xindex>
 //       CHECK-PROP:     gpu.yield %[[EXTRACTELT]] : index
 //       CHECK-PROP:   %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[THREADID]]]
@@ -1555,7 +1555,7 @@ func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 :  memref<1
   ^bb0(%arg4: vector<1x64xindex>):
     %28 = vector.gather %ar1[%c0, %c0, %c0] [%arg4], %cst_0, %cst : memref<1x4x2xi32>, vector<1x64xindex>, vector<1x64xi1>, vector<1x64xi32> into vector<1x64xi32>
     %29 = vector.extract %28[0] : vector<64xi32> from vector<1x64xi32>
-    %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
+    %30 = arith.index_cast %29 exact : vector<64xi32> to vector<64xindex>
     %36 = vector.extract %30[%c0_i32] : index from vector<64xindex>
     %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
     gpu.yield %37 : vector<64xf32>
diff --git a/mlir/test/Dialect/XeGPU/peephole-optimize.mlir b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
index 83fec045b9973..95eb1f0138a17 100644
--- a/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
+++ b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
@@ -6,7 +6,7 @@
 // CHECK:         %[[C16:.*]] = arith.constant 16 : index
 // CHECK:         %[[C32:.*]] = arith.constant 32 : index
 // CHECK:         %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xf16> -> index
-// CHECK:         %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64
+// CHECK:         %[[T0:.*]] = arith.index_cast %[[PTR]] exact : index to i64
 // CHECK:         %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64
 // CHECK-SAME:      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
 // CHECK-NEXT:    %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]]
@@ -35,7 +35,7 @@ gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> {
 // CHECK:         %[[C16:.*]] = arith.constant 16 : index
 // CHECK:         %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index
-// CHECK:         %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64
+// CHECK:         %[[T0:.*]] = arith.index_cast %[[PTR]] exact : index to i64
 // CHECK:         %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64
 // CHECK-SAME:      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
 // CHECK:         %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]]
@@ -68,7 +68,7 @@ gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8
 // CHECK:           %[[C16:.*]] = arith.constant 16 : index
 // CHECK:           %[[C256:.*]] = arith.constant 256 : index
 // CHECK:           %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index
-// CHECK:           %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64
+// CHECK:           %[[T3:.*]] = arith.index_cast %[[PTR]] exact : index to i64
 // CHECK:           %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1]
 // CHECK-SAME:        : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
 // CHECK:           %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) {
@@ -111,7 +111,7 @@ gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16
 // CHECK:          %[[C256:.*]] = arith.constant 256 : index
 // CHECK:          scf.for %{{.*}} to %{{.*}} step %{{.*}} {
 // CHECK:            %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index
-// CHECK:            %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64
+// CHECK:            %[[T3:.*]] = arith.index_cast %[[PTR]] exact : index to i64
 // CHECK:            %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64
 // CHECK-SAME:          -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
 // CHECK:            %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) {
@@ -155,7 +155,7 @@ gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %ar
 // CHECK:           %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32>
 // CHECK:           %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index
-// CHECK:           %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64
+// CHECK:           %[[T2:.*]] = arith.index_cast %[[PTR]] exact : index to i64
 // CHECK:           %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64
 // CHECK-SAME:        -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
 // CHECK:           %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
@@ -226,7 +226,7 @@ gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2
 // CHECK:           %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] :
 // CHECK-SAME:        memref<256x256xf16> -> index
-// CHECK:           %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64
+// CHECK:           %[[T2:.*]] = arith.index_cast %[[PTR]] exact : index to i64
 // CHECK:           %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]],
 // CHECK-SAME:        strides : [%[[C128]], 1] : i64 ->
 // CHECK-SAME:        !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index dde58ba31860d..fd3ef7fccb284 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -516,7 +516,7 @@ gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>, %laneid: index) {
 // CHECK:         gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16>
 // CHECK-NEXT:  }
 // CHECK-NEXT:  %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index
-// CHECK-NEXT:  arith.index_cast %[[INTPTR]] : index to i64
+// CHECK-NEXT:  arith.index_cast %[[INTPTR]] exact : index to i64
 gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) {
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
     %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
@@ -530,7 +530,7 @@ gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %
 // CHECK-LABEL: gpu.func @memref_alloca(
 // CHECK-NEXT:    %[[ALLOCA:.*]] = memref.alloca() : memref<2048xi8, 3>
 // CHECK-NEXT:    %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ALLOCA]] : memref<2048xi8, 3> -> index
-// CHECK-NEXT:    %[[CAST:.*]] = arith.index_cast %[[INTPTR]] : index to i64
+// CHECK-NEXT:    %[[CAST:.*]] = arith.index_cast %[[INTPTR]] exact : index to i64
 gpu.func @memref_alloca(%laneid: index) {
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (memref<2048xi8, 3>) {
     %alloca = memref.alloca() : memref<2048xi8, 3>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir b/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir
index 0bb7d7d3d8b1b..209ef056b28c9 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir
@@ -91,7 +91,7 @@ func.func @test_vector_store_load_4x4x4(%buffer: memref<4x4x4xf32>) {
 // CHECK: %[[ADDI:.*]] = arith.addi %[[CAST2]], %[[CST]] : vector<4xindex>
 // CHECK: %[[INDEX_CAST1:.*]] = arith.index_cast %[[ADDI]] : vector<4xindex> to vector<4xi32>
 // CHECK: %[[MULI:.*]] = arith.muli %[[INDEX_CAST1]], %[[CAST1]] : vector<4xi32>
-// CHECK: %[[INDEX_CAST2:.*]] = arith.index_cast %[[MULI]] : vector<4xi32> to vector<4xindex>
+// CHECK: %[[INDEX_CAST2:.*]] = arith.index_cast %[[MULI]] exact : vector<4xi32> to vector<4xindex>
 // CHECK: %[[RESULT:.*]] = vector.shape_cast %[[INDEX_CAST2]] : vector<4xindex> to vector<2x2xindex>
 // CHECK: return %[[RESULT]] : vector<2x2xindex>
 func.func @test_linearize_index(%arg0: vector<2x2xindex>, %arg1: vector<2x2xi32>) -> vector<2x2xindex> {