diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index fd8c43cc88a19..4d4d30e69cdd7 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -22,6 +22,7 @@ #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 98f947a1f635d..6c9e0648fede8 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -411,6 +411,13 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm, // Add codegen pass pipeline. fir::createDefaultFIRCodeGenPassPipeline(pm, config, inputFilename); + + // Run a pass to prepare for translation of delayed privatization in the + // context of deferred target tasks. 
+ addNestedPassConditionally( + pm, disableFirToLlvmIr, [&]() { + return mlir::LLVM::createPrepareForOMPOffloadPrivatizationPass(); + }); } } // namespace fir diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir index 75f8f153127af..177810cf41378 100644 --- a/flang/test/Driver/tco-emit-final-mlir.fir +++ b/flang/test/Driver/tco-emit-final-mlir.fir @@ -13,7 +13,7 @@ // CHECK: llvm.return // CHECK-NOT: func.func -func.func @_QPfoo() { +func.func @_QPfoo() -> !fir.ref { %1 = fir.alloca i32 - return + return %1 : !fir.ref } diff --git a/flang/test/Driver/tco-test-gen.fir b/flang/test/Driver/tco-test-gen.fir index 38d4e50ecf3aa..15483f7ee3534 100644 --- a/flang/test/Driver/tco-test-gen.fir +++ b/flang/test/Driver/tco-test-gen.fir @@ -42,11 +42,10 @@ func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "num"}, %arg1: !fir.re // CHECK-SAME: %[[ARG2:.*]]: !llvm.ptr {fir.bindc_name = "ub", llvm.nocapture}, // CHECK-SAME: %[[ARG3:.*]]: !llvm.ptr {fir.bindc_name = "step", llvm.nocapture}) { +// CMPLX: %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64 +// CMPLX: %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64 // CMPLX: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 // CMPLX: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr -// CMPLX: %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64 -// CMPLX: %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64 -// CMPLX: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64 // SIMPLE: %[[VAL_3:.*]] = llvm.mlir.constant(0 : index) : i64 // SIMPLE: %[[VAL_2:.*]] = llvm.mlir.constant(1 : index) : i64 diff --git a/flang/test/Fir/alloc-32.fir b/flang/test/Fir/alloc-32.fir index a3cbf200c24fc..f57f6ce6fcf5e 100644 --- a/flang/test/Fir/alloc-32.fir +++ b/flang/test/Fir/alloc-32.fir @@ -19,7 +19,7 @@ func.func @allocmem_scalar_nonchar() -> !fir.heap { // CHECK-LABEL: define ptr @allocmem_scalar_dynchar( // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = 
sext i32 %[[len]] to i64 -// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] +// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 1 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 // CHECK: %[[sz:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 // CHECK: %[[trunc:.*]] = trunc i64 %[[sz]] to i32 diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 8da8b828c18b9..0d3ce323d0d7c 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -86,7 +86,7 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref> { // CHECK-LABEL: define ptr @allocmem_scalar_dynchar( // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 -// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] +// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 1 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 // CHECK: call ptr @malloc(i64 %[[size]]) @@ -98,7 +98,7 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { // CHECK-LABEL: define ptr @allocmem_scalar_dynchar_kind( // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 -// CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]] +// CHECK: %[[mul2:.*]] = mul i64 %[[mul1]], 2 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 // CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 // CHECK: call ptr @malloc(i64 %[[size]]) @@ -185,7 +185,7 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref !fir.heap !fir.ref !fir.heap !fir.heap !fir // CHECK: %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 // CHECK: %[[VAL_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i64 1 // CHECK: %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 - +func.func private @foo(%0: !fir.ref>, %1: !fir.ref>>, %2: !fir.ref>, %3: !fir.ref>>) func.func @alloca_unlimited_polymorphic_box() { %0 = fir.alloca !fir.class %1 = fir.alloca !fir.class> %2 
= fir.alloca !fir.box %3 = fir.alloca !fir.box> + fir.call @foo(%0, %1, %2, %3) : (!fir.ref>, !fir.ref>>, !fir.ref>, !fir.ref>>) -> () return } // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir index e8ec8ac79e0c2..2eb717228d998 100644 --- a/flang/test/Fir/arrexp.fir +++ b/flang/test/Fir/arrexp.fir @@ -143,9 +143,9 @@ func.func @f6(%arg0: !fir.box>, %arg1: f32) { %c9 = arith.constant 9 : index %c10 = arith.constant 10 : index - // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1 + // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i32 0, i32 1 // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]] - // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]] + // CHECK: %[[SIZE:.*]] = mul i64 %[[EXTENT]], 4 // CHECK: %[[CMP:.*]] = icmp sgt i64 %[[SIZE]], 0 // CHECK: %[[SZ:.*]] = select i1 %[[CMP]], i64 %[[SIZE]], i64 1 // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SZ]]) diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index c9fe53bf093a1..6bad03dded24d 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -158,4 +158,6 @@ func.func @_QQmain() { // PASSES-NEXT: LowerNontemporalPass // PASSES-NEXT: FIRToLLVMLowering // PASSES-NEXT: ReconcileUnrealizedCasts +// PASSES-NEXT: 'llvm.func' Pipeline +// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass // PASSES-NEXT: LLVMIRLoweringPass diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir index c0cf3d8375983..760fbd4792122 100644 --- a/flang/test/Fir/box.fir +++ b/flang/test/Fir/box.fir @@ -57,7 +57,7 @@ func.func @fa(%a : !fir.ref>) { // CHECK-SAME: ptr {{[^%]*}}%[[res:.*]], ptr {{[^%]*}}%[[arg0:.*]], i64 %[[arg1:.*]]) func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box> { // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } - // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]] + // CHECK: 
%[[size:.*]] = mul i64 %[[arg1]], 1 // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 @@ -89,7 +89,7 @@ func.func @b2(%arg0 : !fir.ref>>, %arg1 : index) -> func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %arg2 : index) -> !fir.box>> { %1 = fir.shape %arg2 : (index) -> !fir.shape<1> // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } - // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]] + // CHECK: %[[size:.*]] = mul i64 %[[arg1]], 1 // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} i64 %[[arg2]], 7, 0, 1 @@ -108,7 +108,7 @@ func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> %c_7 = arith.constant 7 : index %1 = fir.shape %c_7 : (index) -> !fir.shape<1> // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } - // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]] + // CHECK: %[[size:.*]] = mul i64 %[[arg1]], 1 // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} i64 7, 7, 0, 1 diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir index 97d9b38ed6f40..d4c36a4f5b213 100644 --- a/flang/test/Fir/boxproc.fir +++ b/flang/test/Fir/boxproc.fir @@ -82,12 +82,8 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) { // CHECK: store [1 x i8] c" ", ptr %[[VAL_18]], align 1 // CHECK: call void @llvm.init.trampoline(ptr %[[VAL_20]], ptr @_QFtest_proc_dummy_charPgen_message, ptr %[[VAL_2]]) // CHECK: %[[VAL_23:.*]] = call ptr @llvm.adjust.trampoline(ptr %[[VAL_20]]) -// CHECK: %[[VAL_25:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_23]], 0 -// CHECK: %[[VAL_26:.*]] = insertvalue { ptr, i64 } %[[VAL_25]], i64 10, 1 // CHECK: %[[VAL_27:.*]] = call ptr @llvm.stacksave.p0() -// CHECK: %[[VAL_28:.*]] = extractvalue { ptr, i64 } %[[VAL_26]], 0 -// CHECK: %[[VAL_29:.*]] = 
extractvalue { ptr, i64 } %[[VAL_26]], 1 -// CHECK: %[[VAL_30:.*]] = call { ptr, i64 } @_QPget_message(ptr %[[VAL_0]], i64 40, ptr %[[VAL_28]], i64 %[[VAL_29]]) +// CHECK: %[[VAL_30:.*]] = call { ptr, i64 } @_QPget_message(ptr %[[VAL_0]], i64 40, ptr %[[VAL_23]], i64 10) // CHECK: %[[VAL_32:.*]] = call i1 @_FortranAioOutputAscii(ptr %{{.*}}, ptr %[[VAL_0]], i64 40) // CHECK: call void @llvm.stackrestore.p0(ptr %[[VAL_27]]) @@ -115,14 +111,10 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) { // CHECK-LABEL: define { ptr, i64 } @_QPget_message(ptr // CHECK-SAME: %[[VAL_0:.*]], i64 %[[VAL_1:.*]], ptr %[[VAL_2:.*]], i64 // CHECK-SAME: %[[VAL_3:.*]]) -// CHECK: %[[VAL_4:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_2]], 0 -// CHECK: %[[VAL_5:.*]] = insertvalue { ptr, i64 } %[[VAL_4]], i64 %[[VAL_3]], 1 -// CHECK: %[[VAL_7:.*]] = extractvalue { ptr, i64 } %[[VAL_5]], 0 -// CHECK: %[[VAL_8:.*]] = extractvalue { ptr, i64 } %[[VAL_5]], 1 // CHECK: %[[VAL_9:.*]] = call ptr @llvm.stacksave.p0() -// CHECK: %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_8]], align 1 -// CHECK: %[[VAL_12:.*]] = call { ptr, i64 } %[[VAL_7]](ptr %[[VAL_10]], i64 %[[VAL_8]]) -// CHECK: %[[VAL_13:.*]] = add i64 %[[VAL_8]], 12 +// CHECK: %[[VAL_10:.*]] = alloca i8, i64 %[[VAL_3]], align 1 +// CHECK: %[[VAL_12:.*]] = call { ptr, i64 } %[[VAL_2]](ptr %[[VAL_10]], i64 %[[VAL_3]]) +// CHECK: %[[VAL_13:.*]] = add i64 %[[VAL_3]], 12 // CHECK: %[[VAL_14:.*]] = alloca i8, i64 %[[VAL_13]], align 1 // CHECK: call void @llvm.memmove.p0.p0.i64(ptr %[[VAL_14]], ptr {{.*}}, i64 12, i1 false) // CHECK: %[[VAL_18:.*]] = phi i64 diff --git a/flang/test/Fir/embox.fir b/flang/test/Fir/embox.fir index 0f304cff2c79e..11f7457b6873c 100644 --- a/flang/test/Fir/embox.fir +++ b/flang/test/Fir/embox.fir @@ -11,7 +11,7 @@ func.func @_QPtest_callee(%arg0: !fir.box>) { func.func @_QPtest_slice() { // CHECK: %[[a1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 // CHECK: %[[a2:.*]] = 
alloca [20 x i32], i64 1, align 4 -// CHECK: %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i64 0, i64 0 +// CHECK: %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i32 0, i64 0 // CHECK: %[[a4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] // CHECK: [i64 1, i64 5, i64 8]] }, ptr %[[a3]], 0 @@ -38,7 +38,7 @@ func.func @_QPtest_dt_callee(%arg0: !fir.box>) { func.func @_QPtest_dt_slice() { // CHECK: %[[a1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 // CHECK: %[[a3:.*]] = alloca [20 x %_QFtest_dt_sliceTt], i64 1, align 8 -// CHECK: %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i64 0, i64 0, i32 0 +// CHECK: %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i32 0, i64 0, i32 0 // CHECK: %[[a5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK-SAME: { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] // CHECK-SAME: [i64 1, i64 5, i64 16 @@ -73,7 +73,7 @@ func.func @emboxSubstring(%arg0: !fir.ref>>) { %0 = fir.shape %c2, %c3 : (index, index) -> !fir.shape<2> %1 = fir.slice %c1, %c2, %c1, %c1, %c3, %c1 substr %c1_i64, %c2_i64 : (index, index, index, index, index, index, i64, i64) -> !fir.slice<2> %2 = fir.embox %arg0(%0) [%1] : (!fir.ref>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box>> - // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i64 0, i64 0, i64 0, i64 1 + // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i32 0, i64 0, i64 0, i32 1 // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 2, i32 20240719, i8 2, i8 40, i8 0, i8 0 // CHECK-SAME: [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 2, i64 4], [3 x i64] [i64 1, i64 3, i64 8]] } // CHECK-SAME: ptr %[[addr]], 0 diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir index 
1645e1a407ad4..e517b1352ff5c 100644 --- a/flang/test/Fir/omp-reduction-embox-codegen.fir +++ b/flang/test/Fir/omp-reduction-embox-codegen.fir @@ -23,14 +23,14 @@ omp.declare_reduction @test_reduction : !fir.ref> init { omp.yield(%0 : !fir.ref>) } -func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +func.func @_QQmain() -> !fir.ref> attributes {fir.bindc_name = "reduce"} { %4 = fir.alloca !fir.box omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref>) { omp.terminator } - return + return %4: !fir.ref> } // basically we are testing that there isn't a crash -// CHECK-LABEL: define void @_QQmain +// CHECK-LABEL: define ptr @_QQmain // CHECK-NEXT: alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 diff --git a/flang/test/Fir/omp_target_allocmem_freemem.fir b/flang/test/Fir/omp_target_allocmem_freemem.fir index 03eb94acb1ac7..aa7b2dce07153 100644 --- a/flang/test/Fir/omp_target_allocmem_freemem.fir +++ b/flang/test/Fir/omp_target_allocmem_freemem.fir @@ -62,7 +62,7 @@ func.func @omp_target_allocmem_scalar_char_kind() -> () { // CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar( // CHECK-SAME: i32 [[TMP0:%.*]]) { // CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 1 // CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] // CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) // CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 @@ -80,7 +80,7 @@ func.func @omp_target_allocmem_scalar_dynchar(%l : i32) -> () { // CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar_kind( // CHECK-SAME: i32 [[TMP0:%.*]]) { // CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP3:%.*]] = mul i64 2, [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 // CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] // CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) // CHECK-NEXT: 
[[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 @@ -141,7 +141,7 @@ func.func @omp_target_allocmem_array_of_dynchar(%l: i32) -> () { // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar( // CHECK-SAME: i64 [[TMP0:%.*]]) { -// CHECK-NEXT: [[TMP2:%.*]] = mul i64 12, [[TMP0]] +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 12 // CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] // CHECK-NEXT: [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0) // CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 @@ -157,7 +157,7 @@ func.func @omp_target_allocmem_dynarray_of_nonchar(%e: index) -> () { // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar2( // CHECK-SAME: i64 [[TMP0:%.*]]) { -// CHECK-NEXT: [[TMP2:%.*]] = mul i64 4, [[TMP0]] +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 4 // CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]] // CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] // CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) @@ -174,7 +174,7 @@ func.func @omp_target_allocmem_dynarray_of_nonchar2(%e: index) -> () { // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char( // CHECK-SAME: i64 [[TMP0:%.*]]) { -// CHECK-NEXT: [[TMP2:%.*]] = mul i64 60, [[TMP0]] +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 60 // CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] // CHECK-NEXT: [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0) // CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 @@ -191,7 +191,7 @@ func.func @omp_target_allocmem_dynarray_of_char(%e : index) -> () { // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char2( // CHECK-SAME: i64 [[TMP0:%.*]]) { -// CHECK-NEXT: [[TMP2:%.*]] = mul i64 20, [[TMP0]] +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 20 // CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]] // CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] // CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) @@ -227,7 +227,7 @@ 
func.func @omp_target_allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> () { // CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_dynchar2( // CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) { // CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP4:%.*]] = mul i64 2, [[TMP3]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 // CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP1]] // CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP1]] // CHECK-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] diff --git a/flang/test/Fir/optional.fir b/flang/test/Fir/optional.fir index bded8b5332a30..66ff69f083467 100644 --- a/flang/test/Fir/optional.fir +++ b/flang/test/Fir/optional.fir @@ -37,8 +37,7 @@ func.func @bar2() -> i1 { // CHECK-LABEL: @foo3 func.func @foo3(%arg0: !fir.boxchar<1>) -> i1 { - // CHECK: %[[extract:.*]] = extractvalue { ptr, i64 } %{{.*}}, 0 - // CHECK: %[[ptr:.*]] = ptrtoint ptr %[[extract]] to i64 + // CHECK: %[[ptr:.*]] = ptrtoint ptr %0 to i64 // CHECK: icmp ne i64 %[[ptr]], 0 %0 = fir.is_present %arg0 : (!fir.boxchar<1>) -> i1 return %0 : i1 diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir index a200cd7e7cc03..411927aae6bdf 100644 --- a/flang/test/Fir/pdt.fir +++ b/flang/test/Fir/pdt.fir @@ -96,13 +96,13 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 { func.func private @bar(!fir.ref>) -// CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1) -func.func @_QPfoo(%arg0 : i32, %arg1 : i32) { +// CHECK-LABEL: define ptr @_QPfoo(i32 %0, i32 %1) +func.func @_QPfoo(%arg0 : i32, %arg1 : i32) -> !fir.ref> { // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1) // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]] %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32) //%2 = fir.coordinate_of %0, f2 : (!fir.ref>) -> !fir.ref> %2 = fir.zero_bits !fir.ref> fir.call @bar(%2) : (!fir.ref>) -> () - return + return %0 : !fir.ref> } diff --git 
a/flang/test/Fir/rebox.fir b/flang/test/Fir/rebox.fir index 0c9f6d9bb94ad..d858adfb7c45d 100644 --- a/flang/test/Fir/rebox.fir +++ b/flang/test/Fir/rebox.fir @@ -36,7 +36,7 @@ func.func @test_rebox_1(%arg0: !fir.box>) { // CHECK: %[[VOIDBASE0:.*]] = getelementptr i8, ptr %[[INBASE]], i64 %[[OFFSET_0]] // CHECK: %[[OFFSET_1:.*]] = mul i64 2, %[[INSTRIDE_1]] // CHECK: %[[VOIDBASE1:.*]] = getelementptr i8, ptr %[[VOIDBASE0]], i64 %[[OFFSET_1]] - // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 3, %[[INSTRIDE_1]] + // CHECK: %[[OUTSTRIDE0:.*]] = mul i64 %[[INSTRIDE_1]], 3 // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[OUTSTRIDE0]], 7, 0, 2 // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX1]], ptr %[[VOIDBASE1]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[OUTBOX2]], ptr %[[OUTBOX_ALLOC]], align 8 @@ -63,7 +63,7 @@ func.func @test_rebox_2(%arg0: !fir.box>>) { // CHECK: %[[OUTBOX:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } // CHECK: %[[LEN_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 1 // CHECK: %[[LEN:.*]] = load i64, ptr %[[LEN_GEP]] - // CHECK: %[[SIZE:.*]] = mul i64 1, %[[LEN]] + // CHECK: %[[SIZE:.*]] = mul i64 %[[LEN]], 1 // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } undef, i64 %[[SIZE]], 1 %1 = fir.rebox %arg0 [%0] : (!fir.box>>, !fir.slice<2>) -> !fir.box>> @@ -94,8 +94,8 @@ func.func @test_rebox_3(%arg0: !fir.box>) { // CHECK: %[[INSTRIDE:.*]] = load i64, ptr %[[INSTRIDE_GEP]] // CHECK: %[[INBASE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0 // CHECK: %[[INBASE:.*]] = load ptr, ptr %[[INBASE_GEP]] - // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 3, %[[INSTRIDE]] - // CHECK: %[[OUTSTRIDE2:.*]] = mul i64 4, %[[OUTSTRIDE1]] + // CHECK: %[[OUTSTRIDE1:.*]] = mul i64 %[[INSTRIDE]], 3 + 
// CHECK: %[[OUTSTRIDE2:.*]] = mul i64 %[[OUTSTRIDE1]], 4 // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %{{.*}}, i64 %[[INSTRIDE]], 7, 0, 2 // CHECK: %[[OUTBOX1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX0]], i64 3, 7, 1, 0 // CHECK: %[[OUTBOX2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [3 x [3 x i64]] } %[[OUTBOX1]], i64 4, 7, 1, 1 @@ -153,13 +153,13 @@ func.func @test_cmplx_1(%arg0: !fir.box>>) { %0:3 = fir.box_dims %arg0, %c0 : (!fir.box>>, index) -> (index, index, index) %1 = fir.slice %c1, %0#1, %c1 path %c1_i32 : (index, index, index, i32) -> !fir.slice<1> %2 = fir.rebox %arg0 [%1] : (!fir.box>>, !fir.slice<1>) -> !fir.box> - // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i64 0, i32 1 + // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 1 // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]] // CHECK: %[[INSTRIDE_1_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 2 // CHECK: %[[INSTRIDE_1:.*]] = load i64, ptr %[[INSTRIDE_1_GEP]] // CHECK: %[[FRONT_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0 // CHECK: %[[FRONT_PTR:.*]] = load ptr, ptr %[[FRONT_GEP]] - // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i64 0, i32 0 + // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i32 0, i32 0 // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 0, %[[INSTRIDE_1]] // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]] // CHECK: %[[SUB_1:.*]] = sub i64 %[[INSTRIDE_0]], 1 @@ -167,7 +167,7 @@ func.func @test_cmplx_1(%arg0: !fir.box>>) { // CHECK: %[[DIV_1:.*]] = sdiv i64 
%[[ADD_1]], 1 // CHECK: %[[CHECK_NONZERO:.*]] = icmp sgt i64 %[[DIV_1]], 0 // CHECK: %[[CHECKED_BOUND:.*]] = select i1 %[[CHECK_NONZERO]], i64 %[[DIV_1]], i64 0 - // CHECK: %[[STRIDE:.*]] = mul i64 1, %[[INSTRIDE_1]] + // CHECK: %[[STRIDE:.*]] = mul i64 %[[INSTRIDE_1]], 1 // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[CHECKED_BOUND]], 7, 0, 1 // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], i64 %[[STRIDE]], 7, 0, 2 // CHECK: %[[VAL_BUILD_3:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OFFSET_GEP]], 0 @@ -198,10 +198,10 @@ func.func @test_cmplx_2(%arg0: !fir.box>>) { // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]] // CHECK: %[[FRONT_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 0 // CHECK: %[[FRONT_PTR:.*]] = load ptr, ptr %[[FRONT_GEP]] - // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i64 0, i32 1 + // CHECK: %[[FIELD_OFFSET_GEP:.*]] = getelementptr { float, float }, ptr %[[FRONT_PTR]], i32 0, i32 1 // CHECK: %[[FRONT_OFFSET:.*]] = mul i64 6, %[[INSTRIDE_0]] // CHECK: %[[OFFSET_GEP:.*]] = getelementptr i8, ptr %[[FIELD_OFFSET_GEP]], i64 %[[FRONT_OFFSET]] - // CHECK: %[[STRIDE:.*]] = mul i64 5, %[[INSTRIDE_0]] + // CHECK: %[[STRIDE:.*]] = mul i64 %[[INSTRIDE_0]], 5 // CHECK: %[[VAL_BUILD_1:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, i64 %[[STRIDE]], 7, 0, 2 // CHECK: %[[VAL_BUILD_2:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_1]], ptr %[[OFFSET_GEP]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL_BUILD_2]], ptr %[[OUTBOX_ALLOC]] diff --git a/flang/test/Fir/select.fir b/flang/test/Fir/select.fir index 5e88048446407..6d843e824d33f 100644 --- a/flang/test/Fir/select.fir +++ 
b/flang/test/Fir/select.fir @@ -64,6 +64,6 @@ func.func @h(%a : i32) -> i32 { return %1 : i32 ^bb6: %x = arith.addi %b4, %b3 : i32 - // CHECK: ret i32 + // CHECK-DAG: ret i32 return %x : i32 } diff --git a/flang/test/Fir/target.fir b/flang/test/Fir/target.fir index b04e23a018e7e..1e721a09c835e 100644 --- a/flang/test/Fir/target.fir +++ b/flang/test/Fir/target.fir @@ -97,10 +97,6 @@ func.func @call8() { // X64-LABEL: define i64 @char1lensum(ptr {{[^%]*}}%0, ptr {{[^%]*}}%1, i64 %2, i64 %3) // PPC-LABEL: define i64 @char1lensum(ptr {{[^%]*}}%0, ptr {{[^%]*}}%1, i64 %2, i64 %3) func.func @char1lensum(%arg0 : !fir.boxchar<1>, %arg1 : !fir.boxchar<1>) -> i64 { - // X64-DAG: %[[p0:.*]] = insertvalue { ptr, i64 } undef, ptr %1, 0 - // X64-DAG: = insertvalue { ptr, i64 } %[[p0]], i64 %3, 1 - // X64-DAG: %[[p1:.*]] = insertvalue { ptr, i64 } undef, ptr %0, 0 - // X64-DAG: = insertvalue { ptr, i64 } %[[p1]], i64 %2, 1 %1:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, i64) %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, i64) // I32: %[[add:.*]] = add i64 % diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir index 4907aa03ec5a5..072c8bbe4e80c 100644 --- a/flang/test/Fir/tbaa-codegen2.fir +++ b/flang/test/Fir/tbaa-codegen2.fir @@ -62,9 +62,9 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK-LABEL: define void @_QPfunc( // CHECK-SAME: ptr {{[^%]*}}%[[ARG0:.*]]){{.*}}{ // [...] 
-// CHECK: %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 7, i32 0, i32 0 +// CHECK: %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 6 // box access: -// CHECK: %[[VAL6:.*]] = load i64, ptr %[[VAL5]], align 4, !tbaa ![[BOX_ACCESS_TAG:.*]] +// CHECK: %[[VAL6:.*]] = load i8, ptr %[[VAL5]], align 1, !tbaa ![[BOX_ACCESS_TAG:.*]] // CHECK: %[[VAL7:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i32 0, i32 7, i32 0, i32 1 // box access: // CHECK: %[[VAL8:.*]] = load i64, ptr %[[VAL7]], align 4, !tbaa ![[BOX_ACCESS_TAG]] @@ -76,15 +76,9 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ // CHECK: %[[VAL12:.*]] = load ptr, ptr %[[VAL11]], align 8, !tbaa ![[BOX_ACCESS_TAG]] // CHECK: %[[VAL15:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %[[VAL12]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL15]], ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]] -// CHECK: %[[VAL16:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 0 -// box access: -// CHECK: %[[VAL17:.*]] = load i64, ptr %[[VAL16]], align 4, !tbaa ![[BOX_ACCESS_TAG]] -// CHECK: %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 1 +// CHECK: %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i32 0, i32 1 // box access: // CHECK: %[[VAL19:.*]] = load i64, ptr %[[VAL18]], align 4, !tbaa ![[BOX_ACCESS_TAG]] -// CHECK: %[[VAL20:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 2 -// box access: -// CHECK: %[[VAL21:.*]] = load i64, ptr %[[VAL20]], align 4, !tbaa ![[BOX_ACCESS_TAG]] // [...] 
// box access: // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]] diff --git a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 b/flang/test/Integration/OpenMP/map-types-and-sizes.f90 index 665be5a8db4d4..5ce36ac87ca8c 100644 --- a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 +++ b/flang/test/Integration/OpenMP/map-types-and-sizes.f90 @@ -545,7 +545,7 @@ end subroutine mapType_common_block_members !CHECK: %[[ALLOCATABLE_DESC_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 !CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_derived_type_allocaTone_layer, i64 1, align 8 !CHECK: %[[MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_derived_type_allocaTone_layer, ptr %[[ALLOCA]], i32 0, i32 4 -!CHECK: %[[DESC_BOUND_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_DESC_ALLOCA]], i32 0, i32 7, i64 0, i32 1 +!CHECK: %[[DESC_BOUND_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_DESC_ALLOCA]], i32 0, i32 7, i32 0, i32 1 !CHECK: %[[DESC_BOUND_ACCESS_LOAD:.*]] = load i64, ptr %[[DESC_BOUND_ACCESS]], align 8 !CHECK: %[[OFFSET_UB:.*]] = sub i64 %[[DESC_BOUND_ACCESS_LOAD]], 1 !CHECK: %[[MEMBER_DESCRIPTOR_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[MEMBER_ACCESS]], i32 0, i32 0 @@ -596,7 +596,7 @@ end subroutine mapType_common_block_members !CHECK: %{{.*}} = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 0 !CHECK: %{{.*}} = load ptr, ptr %{{.*}}, align 8 !CHECK: %{{.*}} = getelementptr %_QFmaptype_alloca_derived_typeTone_layer, ptr %{{.*}}, i32 0, i32 4 -!CHECK: %[[ACCESS_DESC_MEMBER_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[DTYPE_ARRAY_MEMBER_DESC_ALLOCA]], i32 0, i32 7, i64 0, i32 1 +!CHECK: %[[ACCESS_DESC_MEMBER_UB:.*]] = getelementptr { ptr, 
i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[DTYPE_ARRAY_MEMBER_DESC_ALLOCA]], i32 0, i32 7, i32 0, i32 1 !CHECK: %[[LOAD_DESC_MEMBER_UB:.*]] = load i64, ptr %[[ACCESS_DESC_MEMBER_UB]], align 8 !CHECK: %[[OFFSET_MEMBER_UB:.*]] = sub i64 %[[LOAD_DESC_MEMBER_UB]], 1 !CHECK: %[[DTYPE_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[DTYPE_DESC_ALLOCA_2]], i32 0, i32 0 @@ -665,7 +665,7 @@ end subroutine mapType_common_block_members !CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 !CHECK: %[[DTYPE_DESC_ALLOCA_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, align 8 !CHECK: %[[DTYPE_DESC_ALLOCA_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1, align 8 -!CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i64 0, i32 1 +!CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i32 0, i32 1 !CHECK: %[[ALLOCATABLE_MEMBER_ALLOCA_UB_LOAD:.*]] = load i64, ptr %[[ALLOCATABLE_MEMBER_ALLOCA_UB]], align 8 !CHECK: %[[ALLOCATABLE_MEMBER_SIZE_CALC_1:.*]] = sub i64 %[[ALLOCATABLE_MEMBER_ALLOCA_UB_LOAD]], 1 !CHECK: %[[DTYPE_DESC_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[DTYPE_DESC_ALLOCA_2]], i32 0, i32 0 @@ -734,7 +734,7 @@ end subroutine mapType_common_block_members !CHECK: %[[ALLOCA:.*]] = alloca %_QFmaptype_nested_derived_type_allocaTtop_layer, i64 1, align 8 !CHECK: %[[NESTED_DTYPE_MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_nested_derived_type_allocaTtop_layer, ptr %[[ALLOCA]], i32 0, i32 6 !CHECK: %[[NESTED_MEMBER_ACCESS:.*]] = getelementptr %_QFmaptype_nested_derived_type_allocaTmiddle_layer, ptr %[[NESTED_DTYPE_MEMBER_ACCESS]], i32 0, i32 2 -!CHECK: 
%[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i64 0, i32 1 +!CHECK: %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCATABLE_MEMBER_ALLOCA]], i32 0, i32 7, i32 0, i32 1 !CHECK: %[[ALLOCATABLE_MEMBER_ADDR_LOAD:.*]] = load i64, ptr %[[ALLOCATABLE_MEMBER_BASE_ADDR]], align 8 !CHECK: %[[ALLOCATABLE_MEMBER_SIZE_CALC_1:.*]] = sub i64 %[[ALLOCATABLE_MEMBER_ADDR_LOAD]], 1 !CHECK: %[[NESTED_MEMBER_BASE_ADDR_ACCESS:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 0 @@ -778,9 +778,9 @@ end subroutine mapType_common_block_members !CHECK: %[[ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, align 8 !CHECK: %[[BASE_PTR_1:.*]] = alloca %_QFmaptype_nested_derived_type_member_idxTdtype, i64 1, align 8 !CHECK: %[[OFF_PTR_1:.*]] = getelementptr %_QFmaptype_nested_derived_type_member_idxTdtype, ptr %[[BASE_PTR_1]], i32 0, i32 1 -!CHECK: %[[BOUNDS_ACC:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA]], i32 0, i32 7, i64 0, i32 1 +!CHECK: %[[BOUNDS_ACC:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA]], i32 0, i32 7, i32 0, i32 1 !CHECK: %[[BOUNDS_LD:.*]] = load i64, ptr %[[BOUNDS_ACC]], align 8 -!CHECK: %[[BOUNDS_ACC_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCA_1]], i32 0, i32 7, i64 0, i32 1 +!CHECK: %[[BOUNDS_ACC_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ALLOCA_1]], i32 0, i32 7, i32 0, i32 1 !CHECK: %[[BOUNDS_LD_2:.*]] = load i64, ptr %[[BOUNDS_ACC_2]], align 8 !CHECK: %[[BOUNDS_CALC:.*]] = sub i64 %[[BOUNDS_LD_2]], 1 !CHECK: %[[OFF_PTR_CALC_0:.*]] = sub i64 %[[BOUNDS_LD]], 1 @@ -789,7 +789,7 @@ end subroutine 
mapType_common_block_members !CHECK: %[[LOAD_DESC_PTR:.*]] = load ptr, ptr %[[GEP_DESC_PTR]], align 8 !CHECK: %[[SZ_CALC_1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA_0]], i32 0, i32 7, i32 0, i32 2 !CHECK: %[[SZ_CALC_2:.*]] = load i64, ptr %[[SZ_CALC_1]], align 8 -!CHECK: %[[SZ_CALC_3:.*]] = mul nsw i64 1, %[[SZ_CALC_2]] +!CHECK: %[[SZ_CALC_3:.*]] = mul nsw i64 %[[SZ_CALC_2]], 1 !CHECK: %[[SZ_CALC_4:.*]] = add nsw i64 %[[SZ_CALC_3]], 0 !CHECK: %[[SZ_CALC_5:.*]] = getelementptr i8, ptr %[[LOAD_DESC_PTR]], i64 %[[SZ_CALC_4]] !CHECK: %[[SZ_CALC_6:.*]] = getelementptr %_QFmaptype_nested_derived_type_member_idxTvertexes, ptr %[[SZ_CALC_5]], i32 0, i32 2 diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index e6a8c5e025123..5a28e97054359 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -606,8 +606,6 @@ program test_alloc ! LLVM-COUNT-2: call void %{{[0-9]*}}() ! LLVM: call void @llvm.memcpy.p0.p0.i32 -! LLVM: %[[GEP_TDESC_C1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 -! LLVM: %[[TDESC_C1:.*]] = load ptr, ptr %[[GEP_TDESC_C1]] ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]] ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4 @@ -620,8 +618,6 @@ program test_alloc ! LLVM: call void %{{.*}}(ptr %{{.*}}) ! LLVM: call void @llvm.memcpy.p0.p0.i32 -! LLVM: %[[GEP_TDESC_C2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 -! LLVM: %[[TDESC_C2:.*]] = load ptr, ptr %[[GEP_TDESC_C2]] ! 
LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 ! LLVM: %[[ELEM_SIZE:.*]] = load i64, ptr %[[ELEM_SIZE_GEP]] ! LLVM: %[[TYPE_CODE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 4 diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90 index d1e12a8dbdfec..7a1f4b125a79f 100644 --- a/flang/test/Lower/forall/character-1.f90 +++ b/flang/test/Lower/forall/character-1.f90 @@ -23,11 +23,11 @@ end program test ! CHECK-LABEL: define internal void @_QFPsub( ! CHECK-SAME: ptr {{[^%]*}}%[[arg:.*]]) -! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i64 0, i32 1 +! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i32 0, i32 1 ! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]] ! CHECK: %[[elesize:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 1 ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]] -! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]] +! CHECK: %[[mul:.*]] = mul i64 %[[esval]], 1 ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]] ! CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 ! CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h new file mode 100644 index 0000000000000..af6dfb0057688 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h @@ -0,0 +1,23 @@ +//===- OpenMPOffloadPrivatizationPrepare.h - Prepare for OpenMP Offload +// Privatization -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H +#define MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H + +#include <memory> + +namespace mlir { +class Pass; +namespace LLVM { +#define GEN_PASS_DECL_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS +#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc" +} // namespace LLVM +} // namespace mlir + +#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS_H diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td index 961909d5c8d27..1ba67caba05be 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td @@ -73,4 +73,16 @@ def DIScopeForLLVMFuncOpPass : Pass<"ensure-debug-info-scope-on-llvm-func", "::m ]; } +def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prepare", "::mlir::LLVM::LLVMFuncOp"> { + let summary = "Prepare OpenMP maps for privatization for deferred target tasks"; + let description = [{ + When generating LLVMIR for privatized variables in an OpenMP offloading directive (eg. omp::TargetOp) + that creates a deferred target task (when the nowait clause is used), we need to copy the privatized + variable out of the stack of the generating task and into the heap so that the deferred target task + can still access it. However, if such a privatized variable is also mapped, typically the case for + allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated + variable and not the original variable.
+ }]; + let dependentDialects = ["LLVM::LLVMDialect", "mlir::omp::OpenMPDialect"]; +} #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 2548a8ab4aac6..efa43107da068 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1479,8 +1479,8 @@ def TargetOp : OpenMP_Op<"target", traits = [ `map` operands. For `private` operands that require a map, the value of the corresponding element in the attribute is the index of the `map` operand (relative to other `map` operands not the whole operands of the operation). For - `private` opernads that do not require a map, this value is -1 (which is omitted - from the assembly foramt printing). + `private` operands that do not require a map, this value is -1 (which is omitted + from the assembly format printing). }] # clausesDescription; let arguments = !con(clausesArgs, diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt index d4ff0955c5d0e..729f5191cd557 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt @@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms LegalizeForExport.cpp OptimizeForNVVM.cpp RequestCWrappers.cpp + OpenMPOffloadPrivatizationPrepare.cpp DEPENDS MLIRLLVMPassIncGen @@ -18,4 +19,5 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms MLIRPass MLIRTransforms MLIRNVVMDialect + MLIROpenMPDialect ) diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp new file mode 100644 index 0000000000000..a2e522d5f536d --- /dev/null +++ b/mlir/lib/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.cpp @@ -0,0 +1,423 @@ +//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare for OpenMP Offload +// Privatization 
---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LLVMIR/Transforms/OpenMPOffloadPrivatizationPrepare.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dominance.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include <cstdint> +#include <set> + +//===----------------------------------------------------------------------===// +// A pass that prepares OpenMP code for translation of delayed privatization +// in the context of deferred target tasks. Deferred target tasks are created +// when the nowait clause is used on the target directive.
+//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "omp-prepare-for-offload-privatization" +#define PDBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "]: ") + +namespace mlir { +namespace LLVM { + +#define GEN_PASS_DEF_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS +#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc" + +} // namespace LLVM +} // namespace mlir + +using namespace mlir; +namespace { + +//===----------------------------------------------------------------------===// +// OMPTargetPrepareDelayedPrivatizationPattern +//===----------------------------------------------------------------------===// + +class OMPTargetPrepareDelayedPrivatizationPattern + : public OpRewritePattern<omp::TargetOp> { +public: + using OpRewritePattern<omp::TargetOp>::OpRewritePattern; + + // Match omp::TargetOp that have the following characteristics. + // 1. have private vars which refer to local (stack) memory + // 2. the target op has the nowait clause + // In this case, we allocate memory for the privatized variable on the heap + // and copy the original variable into this new heap allocation. We fix up + // any omp::MapInfoOp instances that may be mapping the private variable.
+ mlir::LogicalResult + matchAndRewrite(omp::TargetOp targetOp, + PatternRewriter &rewriter) const override { + if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp)) + return rewriter.notifyMatchFailure( + targetOp, + "targetOp does not have privateVars or does not need a target task"); + + ModuleOp mod = targetOp->getParentOfType<ModuleOp>(); + LLVM::LLVMFuncOp llvmFunc = targetOp->getParentOfType<LLVM::LLVMFuncOp>(); + OperandRange privateVars = targetOp.getPrivateVars(); + mlir::SmallVector<mlir::Value> newPrivVars; + + newPrivVars.reserve(privateVars.size()); + std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms(); + for (auto [privVarIdx, privVarSymPair] : + llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) { + auto privVar = std::get<0>(privVarSymPair); + auto privSym = std::get<1>(privVarSymPair); + + omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym); + if (!privatizer.needsMap()) { + newPrivVars.push_back(privVar); + continue; + } + bool isFirstPrivate = privatizer.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate; + + mlir::Value mappedValue = + targetOp.getMappedValueForPrivateVar(privVarIdx); + Operation *mapInfoOperation = mappedValue.getDefiningOp(); + auto mapInfoOp = mlir::cast<omp::MapInfoOp>(mapInfoOperation); + + if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) { + newPrivVars.push_back(privVar); + continue; + } + + // Allocate heap memory that corresponds to the type of memory + // pointed to by varPtr + // TODO: For boxchars this likely won't be a pointer. + mlir::Value varPtr = privVar; + mlir::Value heapMem = allocateHeapMem(targetOp, privVar, mod, rewriter); + if (!heapMem) + return failure(); + + newPrivVars.push_back(heapMem); + + // Find the earliest insertion point for the copy. This will be before + // the first in the list of omp::MapInfoOp instances that use varPtr. + // After the copy these omp::MapInfoOp instances will refer to heapMem + // instead.
+ Operation *varPtrDefiningOp = varPtr.getDefiningOp(); + std::set users; + users.insert(varPtrDefiningOp->user_begin(), + varPtrDefiningOp->user_end()); + + auto usesVarPtr = [&users](Operation *op) -> bool { + return users.count(op); + }; + SmallVector chainOfOps; + chainOfOps.push_back(mapInfoOperation); + if (!mapInfoOp.getMembers().empty()) { + for (auto member : mapInfoOp.getMembers()) { + if (usesVarPtr(member.getDefiningOp())) + chainOfOps.push_back(member.getDefiningOp()); + + omp::MapInfoOp memberMap = + mlir::cast(member.getDefiningOp()); + if (memberMap.getVarPtrPtr() && + usesVarPtr(memberMap.getVarPtrPtr().getDefiningOp())) + chainOfOps.push_back(memberMap.getVarPtrPtr().getDefiningOp()); + } + } + DominanceInfo dom; + llvm::sort(chainOfOps, [&](Operation *l, Operation *r) { + return dom.dominates(l, r); + }); + + rewriter.setInsertionPoint(chainOfOps.front()); + // Copy the value of the local variable into the heap-allocated location. + mlir::Location loc = chainOfOps.front()->getLoc(); + mlir::Type varType = getElemType(varPtr); + auto loadVal = rewriter.create(loc, varType, varPtr); + LLVM_ATTRIBUTE_UNUSED auto storeInst = + rewriter.create(loc, loadVal.getResult(), heapMem); + + using ReplacementEntry = std::pair; + llvm::SmallVector replRecord; + auto cloneAndMarkForDeletion = [&](Operation *origOp) -> Operation * { + Operation *clonedOp = rewriter.clone(*origOp); + rewriter.replaceAllOpUsesWith(origOp, clonedOp); + replRecord.push_back(std::make_pair(origOp, clonedOp)); + return clonedOp; + }; + + rewriter.setInsertionPoint(targetOp); + rewriter.setInsertionPoint(cloneAndMarkForDeletion(mapInfoOperation)); + + // Fix any members that may use varPtr to now use heapMem + if (!mapInfoOp.getMembers().empty()) { + for (auto member : mapInfoOp.getMembers()) { + Operation *memberOperation = member.getDefiningOp(); + if (!usesVarPtr(memberOperation)) + continue; + rewriter.setInsertionPoint(cloneAndMarkForDeletion(memberOperation)); + + auto 
memberMapInfoOp = mlir::cast(memberOperation); + if (memberMapInfoOp.getVarPtrPtr()) { + Operation *varPtrPtrdefOp = + memberMapInfoOp.getVarPtrPtr().getDefiningOp(); + + // In the case of firstprivate, we have to do the following + // 1. Allocate heap memory for the underlying data. + // 2. Copy the original underlying data to the new memory allocated + // on the heap. + // 3. Put this new (heap) address in the originating + // struct/descriptor + + // Consider the following sequence of omp.map.info and omp.target + // operations. + // %0 = llvm.getelementptr %19[0, 0] + // %1 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) ... + // var_ptr_ptr(%0 : !llvm.ptr) bounds(..) + // %2 = omp.map.info var_ptr(%19 : !llvm.ptr, !desc_type)>) ... + // members(%1 : [0] : !llvm.ptr) -> !llvm.ptr + // omp.target nowait map_entries(%2 -> %arg5, %1 -> %arg8 : ..) + // private(@privatizer %19 -> %arg9 [map_idx=1] : + // !llvm.ptr) { + // We need to allocate memory on the heap for the underlying pointer + // which is stored at the var_ptr_ptr operand of %1. Then we need to + // copy this pointer to the new heap allocated memory location. + // Then, we need to store the address of the new heap location in + // the originating struct/descriptor. So, we generate the following + // (pseudo) MLIR code (Using the same names of mlir::Value instances + // in the example as in the code below) + // + // %dataMalloc = malloc(totalSize) + // %loadDataPtr = load %0 : !llvm.ptr -> !llvm.ptr + // memcpy(%dataMalloc, %loadDataPtr, totalSize) + // %newVarPtrPtrOp = llvm.getelementptr %heapMem[0, 0] + // llvm.store %dataMalloc, %newVarPtrPtrOp + // %1.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr, i32) ... + // var_ptr_ptr(%newVarPtrPtrOp : !llvm.ptr) + // %2.cloned = omp.map.info var_ptr(%heapMem : !llvm.ptr, + // !desc_type)>) ... + // members(%1.cloned : [0] : !llvm.ptr) + // -> !llvm.ptr + // omp.target nowait map_entries(%2.cloned -> %arg5, + // %1.cloned -> %arg8 : ..) 
+ // private(@privatizer %heapMem -> .. [map_idx=1] : ..) { + + if (isFirstPrivate) { + assert(!memberMapInfoOp.getBounds().empty() && + "empty bounds on member map of firstprivate variable"); + mlir::Location loc = memberMapInfoOp.getLoc(); + mlir::Value totalSize = + getSizeInBytes(memberMapInfoOp, mod, rewriter); + auto dataMalloc = allocateHeapMem(loc, totalSize, mod, rewriter); + auto loadDataPtr = rewriter.create( + loc, memberMapInfoOp.getVarPtrPtr().getType(), + memberMapInfoOp.getVarPtrPtr()); + LLVM_ATTRIBUTE_UNUSED auto memcpy = + rewriter.create( + loc, dataMalloc.getResult(), loadDataPtr.getResult(), + totalSize, /*isVolatile=*/false); + Operation *newVarPtrPtrOp = rewriter.clone(*varPtrPtrdefOp); + rewriter.replaceAllUsesExcept(memberMapInfoOp.getVarPtrPtr(), + newVarPtrPtrOp->getOpResult(0), + loadDataPtr); + rewriter.modifyOpInPlace(newVarPtrPtrOp, [&]() { + newVarPtrPtrOp->replaceUsesOfWith(varPtr, heapMem); + }); + LLVM_ATTRIBUTE_UNUSED auto storePtr = + rewriter.create(loc, dataMalloc.getResult(), + newVarPtrPtrOp->getResult(0)); + } else + rewriter.setInsertionPoint( + cloneAndMarkForDeletion(varPtrPtrdefOp)); + } + } + } + + for (auto repl : replRecord) { + Operation *origOp = repl.first; + Operation *clonedOp = repl.second; + rewriter.modifyOpInPlace( + clonedOp, [&]() { clonedOp->replaceUsesOfWith(varPtr, heapMem); }); + rewriter.eraseOp(origOp); + } + } + assert(newPrivVars.size() == privateVars.size() && + "The number of private variables must match before and after " + "transformation"); + + rewriter.setInsertionPoint(targetOp); + Operation *newOp = rewriter.clone(*targetOp.getOperation()); + omp::TargetOp newTargetOp = mlir::cast(newOp); + rewriter.modifyOpInPlace(newTargetOp, [&]() { + newTargetOp.getPrivateVarsMutable().assign(newPrivVars); + }); + rewriter.replaceOp(targetOp, newTargetOp); + return mlir::success(); + } + +private: + bool hasPrivateVars(omp::TargetOp targetOp) const { + return !targetOp.getPrivateVars().empty(); + } + + 
bool isTargetTaskDeferred(omp::TargetOp targetOp) const { + return targetOp.getNowait(); + } + + template + omp::PrivateClauseOp findPrivatizer(OpTy op, mlir::Attribute privSym) const { + SymbolRefAttr privatizerName = llvm::cast(privSym); + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom( + op, privatizerName); + return privatizer; + } + + template + mlir::Type getElemType(OpType op) const { + return op.getElemType(); + } + + mlir::Type getElemType(mlir::Value varPtr) const { + Operation *definingOp = unwrapAddrSpaceCast(varPtr.getDefiningOp()); + assert((mlir::isa(definingOp)) && + "getElemType in PrepareForOMPOffloadPrivatizationPass can deal only " + "with Alloca or GEP for now"); + if (auto allocaOp = mlir::dyn_cast(definingOp)) + return getElemType(allocaOp); + // TODO: get rid of this because GEPOp.getElemType() is not the right thing + // to use. + if (auto gepOp = mlir::dyn_cast(definingOp)) + return getElemType(gepOp); + return mlir::Type{}; + } + + mlir::Operation *unwrapAddrSpaceCast(Operation *op) const { + if (!mlir::isa(op)) + return op; + mlir::LLVM::AddrSpaceCastOp addrSpaceCastOp = + mlir::cast(op); + return unwrapAddrSpaceCast(addrSpaceCastOp.getArg().getDefiningOp()); + } + + // Get the (compile-time constant) size of varType as per the + // given DataLayout dl. 
+ std::int64_t getSizeInBytes(const mlir::DataLayout &dl, + mlir::Type varType) const { + llvm::TypeSize size = dl.getTypeSize(varType); + unsigned short alignment = dl.getTypeABIAlignment(varType); + return llvm::alignTo(size, alignment); + } + + // Generate code to get the size of data being mapped from the bounds + // of mapInfoOp + mlir::Value getSizeInBytes(omp::MapInfoOp mapInfoOp, ModuleOp mod, + PatternRewriter &rewriter) const { + mlir::Location loc = mapInfoOp.getLoc(); + mlir::Type llvmInt64Ty = rewriter.getI64Type(); + mlir::Value constOne = + rewriter.create(loc, llvmInt64Ty, 1); + mlir::Value elementCount = constOne; + // TODO: Consider using boundsOp.getExtent() if available. + for (auto bounds : mapInfoOp.getBounds()) { + auto boundsOp = mlir::cast(bounds.getDefiningOp()); + elementCount = rewriter.create( + loc, llvmInt64Ty, elementCount, + rewriter.create( + loc, llvmInt64Ty, + (rewriter.create(loc, llvmInt64Ty, + boundsOp.getUpperBound(), + boundsOp.getLowerBound())), + constOne)); + } + const mlir::DataLayout &dl = mlir::DataLayout(mod); + std::int64_t elemSize = getSizeInBytes(dl, mapInfoOp.getVarType()); + mlir::Value elemSizeV = + rewriter.create(loc, llvmInt64Ty, elemSize); + return rewriter.create(loc, llvmInt64Ty, elementCount, + elemSizeV); + } + + LLVM::LLVMFuncOp getMalloc(ModuleOp mod, PatternRewriter &rewriter) const { + llvm::FailureOr mallocCall = + LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type()); + assert(llvm::succeeded(mallocCall) && + "Could not find malloc in the module"); + return mallocCall.value(); + } + + template + mlir::Value allocateHeapMem(OpTy targetOp, mlir::Value privVar, ModuleOp mod, + PatternRewriter &rewriter) const { + mlir::Value varPtr = privVar; + Operation *definingOp = varPtr.getDefiningOp(); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(definingOp); + LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter); + + mlir::Location loc = definingOp->getLoc(); + 
mlir::Type varType = getElemType(varPtr); + assert(mod.getDataLayoutSpec() && + "MLIR module with no datalayout spec not handled yet"); + const mlir::DataLayout &dl = mlir::DataLayout(mod); + std::int64_t distance = getSizeInBytes(dl, varType); + mlir::Value sizeBytes = rewriter.create( + loc, mallocFn.getFunctionType().getParamType(0), distance); + + auto mallocCallOp = + rewriter.create(loc, mallocFn, ValueRange{sizeBytes}); + return mallocCallOp.getResult(); + } + + LLVM::CallOp allocateHeapMem(mlir::Location loc, mlir::Value size, + ModuleOp mod, PatternRewriter &rewriter) const { + LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter); + return rewriter.create(loc, mallocFn, ValueRange{size}); + } +}; + +//===----------------------------------------------------------------------===// +// PrepareForOMPOffloadPrivatizationPass +//===----------------------------------------------------------------------===// + +struct PrepareForOMPOffloadPrivatizationPass + : public LLVM::impl::PrepareForOMPOffloadPrivatizationPassBase< + PrepareForOMPOffloadPrivatizationPass> { + + void runOnOperation() override { + LLVM::LLVMFuncOp func = getOperation(); + MLIRContext &context = getContext(); + ModuleOp mod = func->getParentOfType(); + + // FunctionFilteringPass removes bounds arguments from omp.map.info + // operations. We require bounds else our pass asserts. But, that's only for + // maps in functions that are on the host. So, skip functions being compiled + // for the target. 
+ auto offloadModuleInterface = + mlir::dyn_cast(mod.getOperation()); + if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice()) { + return; + } + + RewritePatternSet patterns(&context); + patterns.add(&context); + + if (mlir::failed( + applyPatternsGreedily(func, std::move(patterns), + GreedyRewriteConfig().setStrictness( + GreedyRewriteStrictness::ExistingOps)))) { + emitError(func.getLoc(), + "error in preparing targetOps for delayed privatization."); + signalPassFailure(); + } + } +}; +} // namespace diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6694de8383534..f3cbd62b53342 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -356,14 +356,8 @@ static LogicalResult checkImplementationStatus(Operation &op) { result = todo("priority"); }; auto checkPrivate = [&todo](auto op, LogicalResult &result) { - if constexpr (std::is_same_v, omp::TargetOp>) { - // Privatization is supported only for included target tasks. 
- if (!op.getPrivateVars().empty() && op.getNowait()) - result = todo("privatization for deferred target tasks"); - } else { - if (!op.getPrivateVars().empty() || op.getPrivateSyms()) - result = todo("privatization"); - } + if (!op.getPrivateVars().empty() || op.getPrivateSyms()) + result = todo("privatization"); }; auto checkReduction = [&todo](auto op, LogicalResult &result) { if (isa(op)) @@ -450,7 +444,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { checkDevice(op, result); checkInReduction(op, result); checkIsDevicePtr(op, result); - checkPrivate(op, result); }) .Default([](Operation &) { // Assume all clauses for an operation can be translated unless they are diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp index de714d8b740af..60c5406bdd197 100644 --- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp @@ -624,6 +624,7 @@ LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream, // We use the thread-pool this context is creating, and avoid // creating any thread when disabled. 
MLIRContext threadPoolCtx; + if (threadPoolCtx.isMultithreadingEnabled()) threadPool = &threadPoolCtx.getThreadPool(); diff --git a/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir new file mode 100644 index 0000000000000..6b8121b262f47 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/omp-offload-privatization-prepare.mlir @@ -0,0 +1,167 @@ +// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 128 : i64>} { + llvm.func @free(!llvm.ptr) + omp.private {type = private} @privatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i32) : i32 + "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + omp.yield(%arg1 : !llvm.ptr) + } + + omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> copy { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(48 : i32) : i32 + "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + omp.yield(%arg1 : !llvm.ptr) + } + + llvm.func internal @private_test(%arg0: !llvm.ptr 
{fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.constant(0 : index) : i64 + %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr + %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr + %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr + llvm.store %0, %21 : i32, !llvm.ptr + %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} + %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %151 = llvm.load %150 : !llvm.ptr -> i64 + %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %153 = llvm.load %152 : !llvm.ptr -> i64 + %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %155 = llvm.load %154 : !llvm.ptr -> i64 + %156 = llvm.sub %153, %1 : i64 + %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true} + %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""} + %160 = 
omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr + omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %19 -> %arg9 [map_idx=1] : !llvm.ptr) { + omp.terminator + } + %166 = llvm.mlir.constant(48 : i32) : i32 + %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr + llvm.call @free(%168) : (!llvm.ptr) -> () + llvm.return + } + + llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.constant(0 : index) : i64 + %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr + %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr + %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr + llvm.store %0, %21 : i32, !llvm.ptr + %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} + %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %151 = llvm.load %150 : !llvm.ptr -> i64 + %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + 
%153 = llvm.load %152 : !llvm.ptr -> i64 + %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %155 = llvm.load %154 : !llvm.ptr -> i64 + %156 = llvm.sub %153, %1 : i64 + %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true} + %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""} + %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr + omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %19 -> %arg9 [map_idx=1] : !llvm.ptr) { + omp.terminator + } + %166 = llvm.mlir.constant(48 : i32) : i32 + %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr + llvm.call @free(%168) : (!llvm.ptr) -> () + llvm.return + } +} + +// CHECK-LABEL: llvm.func @malloc(i64) -> !llvm.ptr +// CHECK: llvm.func @free(!llvm.ptr) + +// CHECK-LABEL: llvm.func internal @private_test( +// CHECK: %[[VAL_0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(1 : i32) : i32 +// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr +// CHECK: %[[STACK:.*]] = 
llvm.alloca %[[VAL_1]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr +// CHECK: %[[VAL_6:.*]] = llvm.alloca %[[VAL_1]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr +// CHECK: llvm.store %[[VAL_0]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: llvm.store %[[VAL_1]], %[[VAL_6]] : i32, !llvm.ptr +// CHECK: %[[VAL_7:.*]] = omp.map.info var_ptr(%[[VAL_6]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} +// CHECK: %[[VAL_8:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_9:.*]] = llvm.load %[[VAL_8]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_14:.*]] = llvm.sub %[[VAL_11]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_15:.*]] = omp.map.bounds lower_bound(%[[VAL_2]] : i64) upper_bound(%[[VAL_14]] : i64) extent(%[[VAL_11]] : i64) stride(%[[VAL_13]] : i64) start_idx(%[[VAL_9]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_16:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_16]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, 
array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_18:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[VAL_17]] : !llvm.ptr) bounds(%[[VAL_15]]) -> !llvm.ptr {name = ""} +// CHECK: %[[VAL_19:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, to) capture(ByRef) members(%[[VAL_18]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: omp.target nowait map_entries(%[[VAL_7]] -> %[[VAL_20:.*]], %[[VAL_19]] -> %[[VAL_21:.*]], %[[VAL_18]] -> %[[VAL_22:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@privatizer %[[HEAP]] -> %[[VAL_23:.*]] [map_idx=1] : !llvm.ptr) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> !llvm.ptr +// CHECK: llvm.call @free(%[[VAL_25]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } + +// CHECK-LABEL: llvm.func internal @firstprivate_test( +// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(4 : i64) : i64 +// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_2:.*]] = llvm.mlir.undef : +// CHECK-SAME: !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(1 : i32) : i32 +// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(48 : i64) : i64 +// CHECK: %[[HEAP:.*]] = llvm.call @malloc(%[[VAL_5]]) : (i64) -> !llvm.ptr +// CHECK: %[[STACK:.*]] = llvm.alloca %[[VAL_3]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_3]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr +// CHECK: llvm.store %[[VAL_2]], %[[STACK]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, 
!llvm.ptr +// CHECK: llvm.store %[[VAL_3]], %[[VAL_8]] : i32, !llvm.ptr +// CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_8]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) +// CHECK-SAME: capture(ByCopy) -> !llvm.ptr {name = "i"} +// CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 0] : (!llvm.ptr) -> !llvm.ptr, +// CHECK-SAME: !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 1] : (!llvm.ptr) -> +// CHECK-SAME: !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[STACK]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_16:.*]] = llvm.sub %[[VAL_13]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_17:.*]] = omp.map.bounds lower_bound(%[[VAL_4]] : i64) upper_bound(%[[VAL_16]] : i64) extent(%[[VAL_13]] : i64) stride(%[[VAL_15]] : i64) start_idx(%[[VAL_11]] : i64) {stride_in_bytes = true} +// CHECK: %[[VAL_18:.*]] = llvm.load %[[STACK]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_18]], %[[HEAP]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_20:.*]] = llvm.sub %[[VAL_16]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_21:.*]] = llvm.add %[[VAL_20]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_22:.*]] = llvm.mul %[[VAL_1]], %[[VAL_21]] : i64 +// CHECK: %[[VAL_23:.*]] = llvm.mul %[[VAL_22]], %[[VAL_0]] : i64 +// CHECK: 
%[[NEW_DATA_PTR:.*]] = llvm.call @malloc(%[[VAL_23]]) : (i64) -> !llvm.ptr +// CHECK: %[[OLD_DATA_PTR:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> !llvm.ptr +// CHECK: "llvm.intr.memcpy"(%[[NEW_DATA_PTR]], %[[OLD_DATA_PTR]], %[[VAL_23]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[HEAP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: llvm.store %[[NEW_DATA_PTR]], %[[VAL_26]] : !llvm.ptr, !llvm.ptr +// CHECK: %[[VAL_27:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, i32) map_clauses(to) capture(ByRef) +// CHECK-SAME: var_ptr_ptr(%[[VAL_26]] : !llvm.ptr) bounds(%[[VAL_17]]) -> !llvm.ptr {name = ""} +// CHECK: %[[VAL_28:.*]] = omp.map.info var_ptr(%[[HEAP]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) +// CHECK-SAME: map_clauses(always, to) capture(ByRef) members(%[[VAL_27]] : [0] : !llvm.ptr) -> !llvm.ptr +// CHECK: omp.target nowait map_entries(%[[VAL_9]] -> %[[VAL_29:.*]], %[[VAL_28]] -> %[[VAL_30:.*]], %[[VAL_27]] -> %[[VAL_31:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) +// CHECK-SAME: private(@firstprivatizer %[[HEAP]] -> %[[VAL_32:.*]] [map_idx=1] : !llvm.ptr) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: %[[VAL_33:.*]] = llvm.getelementptr %[[STACK]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> +// CHECK: %[[VAL_34:.*]] = llvm.load %[[VAL_33]] : !llvm.ptr -> !llvm.ptr +// CHECK: llvm.call @free(%[[VAL_34]]) : (!llvm.ptr) -> () +// CHECK: llvm.return +// CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 2fa4470bb8300..af6d254cfd3c3 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -249,24 +249,6 @@ llvm.func @target_is_device_ptr(%x : !llvm.ptr) { // ----- -omp.private {type = firstprivate} @x.privatizer : 
i32 copy { -^bb0(%mold: !llvm.ptr, %private: !llvm.ptr): - %0 = llvm.load %mold : !llvm.ptr -> i32 - llvm.store %0, %private : i32, !llvm.ptr - omp.yield(%private: !llvm.ptr) -} -llvm.func @target_firstprivate(%x : !llvm.ptr) { - %0 = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr - // expected-error@below {{not yet implemented: Unhandled clause privatization for deferred target tasks in omp.target operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.target}} - omp.target nowait map_entries(%0 -> %blockarg0 : !llvm.ptr) private(@x.privatizer %x -> %arg0 [map_idx=0] : !llvm.ptr) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @target_enter_data_depend(%x: !llvm.ptr) { // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}} // expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}}