Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NFC][MLIR][OpenMP] Add test for lowering omp target parallel #70795

Merged

Conversation

DominikAdamski
Copy link
Contributor

Added an MLIR test that checks whether MLIR sample code with the omp target parallel construct is correctly lowered to LLVM IR for the device.

This PR depends on: #67000

@llvmbot
Copy link
Collaborator

llvmbot commented Oct 31, 2023

@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-llvm

Author: Dominik Adamski (DominikAdamski)

Changes

Added an MLIR test that checks whether MLIR sample code with the omp target parallel construct is correctly lowered to LLVM IR for the device.

This PR depends on: #67000


Full diff: https://github.com/llvm/llvm-project/pull/70795.diff

1 Files Affected:

  • (added) mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir (+78)
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
new file mode 100644
index 000000000000000..8d321dab33ccdf6
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// The aim of the test is to check the LLVM IR codegen for the device
+// for omp target parallel construct
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<3>, dense<32> : vector<4xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<1>, dense<64> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<2>, dense<32> : vector<4xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<8>, dense<128> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<6>, dense<32> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<7>, dense<[160, 256, 256, 32]> : vector<4xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<4>, dense<64> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<5>, dense<32> : vector<4xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 32 : i32>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<debug_kind = 1, openmp_device_version = 11>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target = #omp.target<target_cpu = "gfx90a", target_features = "">, omp.version = #omp.version<version = 11>} {
+  llvm.func @_QQmain_omp_outline_1(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, omp.outline_parent_name = "_QQmain"} {
+    %0 = omp.map_info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = "d"}
+    omp.target map_entries(%0 : !llvm.ptr) {
+      omp.parallel {
+        %1 = llvm.mlir.constant(1 : i32) : i32
+        llvm.store %1, %arg0 : i32, !llvm.ptr
+        omp.terminator
+      }
+    omp.terminator
+    }
+  llvm.return
+  }
+}
+
+// CHECK: define weak_odr protected amdgpu_kernel void [[FUNC0:@.*]](
+// CHECK-SAME: ptr [[TMP0:%.*]]) {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
+// CHECK-NEXT:    [[STRUCTARG:%.*]] = alloca { ptr }, align 8, addrspace(5)
+// CHECK-NEXT:    [[STRUCTARG_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STRUCTARG]] to ptr
+// CHECK-NEXT:    [[TMP3:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[TMP3]] to ptr
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) [[KERNEL_ENV:@.*]] to ptr))
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP5]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT:    br label [[OMP_TARGET:%.*]]
+// CHECK:       omp.target:
+// CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
+// CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+// CHECK:       omp_parallel:
+// CHECK-NEXT:    [[GEP_:%.*]] = getelementptr { ptr }, ptr addrspace(5) [[STRUCTARG]], i32 0, i32 0
+// CHECK-NEXT:    store ptr [[TMP6]], ptr addrspace(5) [[GEP_]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT:    store ptr [[STRUCTARG_ASCAST]], ptr [[TMP7]], align 8
+// CHECK-NEXT:    call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr [[FUNC1:@.*]], ptr null, ptr [[TMP2]], i64 1)
+// CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+// CHECK:       omp.par.outlined.exit:
+// CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+// CHECK:       omp.par.exit.split:
+// CHECK-NEXT:    br label [[OMP_REGION_CONT:%.*]]
+// CHECK:       omp.region.cont:
+// CHECK-NEXT:    call void @__kmpc_target_deinit()
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+
+// CHECK: define internal void [[FUNC1]](
+// CHECK-SAME: ptr noalias noundef [[TID_ADDR_ASCAST:%.*]], ptr noalias noundef [[ZERO_ADDR_ASCAST:%.*]], ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  omp.par.entry:
+// CHECK-NEXT:    [[GEP_:%.*]] = getelementptr { ptr }, ptr [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[LOADGEP_:%.*]] = load ptr, ptr [[GEP_]], align 8
+// CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TID_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[TID_ADDR_LOCAL]], align 4
+// CHECK-NEXT:    [[TID:%.*]] = load i32, ptr addrspace(5) [[TID_ADDR_LOCAL]], align 4
+// CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+// CHECK:       omp.par.region:
+// CHECK-NEXT:    br label [[OMP_PAR_REGION2:%.*]]
+// CHECK:       omp.par.region2:
+// CHECK-NEXT:    store i32 1, ptr [[LOADGEP_]], align 4
+// CHECK-NEXT:    br label [[OMP_REGION_CONT1:%.*]]
+// CHECK:       omp.region.cont1:
+// CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+// CHECK:       omp.par.pre_finalize:
+// CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+// CHECK:       omp.par.outlined.exit.exitStub:
+// CHECK-NEXT:    ret void
+

@kiranchandramohan
Copy link
Contributor

I did not see any changes in OpenMPTranslation code in mlir in #67000. Is the presence of omp.target important or is this only dependent on the device setting?

@DominikAdamski
Copy link
Contributor Author

Yes, omp.target together with the OpenMP module attributes determines that we generate code for the device. On the device, omp.target is represented by the outer function containing the __kmpc_target_init and __kmpc_target_deinit function calls.

I wanted to add a test that generates code for omp target parallel similar to what Clang generates.

Copy link
Contributor

@kiranchandramohan kiranchandramohan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, test minimally here. Only test the call to @__kmpc_parallel_51 with the correct outlined function, and the surrounding target_init and target_exit.

@DominikAdamski
Copy link
Contributor Author

Also, test minimally here. Only test the call to @__kmpc_parallel_51 with the correct outlined function, and the surrounding target_init and target_exit.

I removed the unnecessary checks. The remaining checks validate target initialization and the arguments passed to __kmpc_parallel_51.

Copy link
Contributor

@kiranchandramohan kiranchandramohan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. LG with minimizing attributes (if possible).

// The aim of the test is to check the LLVM IR codegen for the device
// for omp target parallel construct

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<3>, dense<32> : vector<4xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<1>, dense<64> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<2>, dense<32> : vector<4xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<8>, dense<128> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<6>, dense<32> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<7>, dense<[160, 256, 256, 32]> : vector<4xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<4>, dense<64> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<5>, dense<32> : vector<4xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 32 : i32>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.flags = #omp.flags<debug_kind = 1, openmp_device_version = 11>, omp.is_gpu = true, omp.is_target_device = true, omp.requires = #omp<clause_requires none>, omp.target = #omp.target<target_cpu = "gfx90a", target_features = "">, omp.version = #omp.version<version = 11>} {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please minimize the attributes also if possible.

Added MLIR test which checks if MLIR sample code with omp target
parallel construct is correctly lowered to LLVM IR for the device.
@DominikAdamski DominikAdamski merged commit cee3b5e into llvm:main Nov 7, 2023
3 checks passed
@DominikAdamski DominikAdamski deleted the mlir_omp_target_parallel_test branch November 7, 2023 13:36
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants