Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenMP] Make team reductions less bad #70981

Closed
wants to merge 1 commit into from
Closed

Conversation

jdoerfert
Copy link
Member

This removes an unnuessary early synchronization in the copy function, removes the unused thread id argument, and uses 64 bit operations for the copy through shared memory if the reduction element has at least 64 bit. None of which should impact anything but performance.

This removes an unnuessary early synchronization in the copy function,
removes the unused thread id argument, and uses 64 bit operations for
the copy through shared memory if the reduction element has at least 64
bit. None of which should impact anything but performance.
@llvmbot llvmbot added clang Clang issues not falling into any other category clang:codegen clang:openmp OpenMP related changes to Clang labels Nov 1, 2023
@llvmbot
Copy link
Collaborator

llvmbot commented Nov 1, 2023

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-clang-codegen

Author: Johannes Doerfert (jdoerfert)

Changes

This removes an unnuessary early synchronization in the copy function, removes the unused thread id argument, and uses 64 bit operations for the copy through shared memory if the reduction element has at least 64 bit. None of which should impact anything but performance.


Patch is 190.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70981.diff

7 Files Affected:

  • (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+33-24)
  • (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (+3-3)
  • (modified) clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp (+231-297)
  • (modified) clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp (+58-76)
  • (modified) clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp (+321-396)
  • (modified) clang/test/OpenMP/reduction_implicit_map.cpp (+33-49)
  • (modified) clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (+56-60)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 0ed665e0dfb9722..76aa1688616a81d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -18,9 +18,12 @@
 #include "clang/AST/OpenMPClause.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/AST/Type.h"
 #include "clang/Basic/Cuda.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/Support/MathExtras.h"
 
 using namespace clang;
@@ -1342,7 +1345,7 @@ void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
   // Build call __kmpc_cancel_barrier(loc, thread_id);
   unsigned Flags = getDefaultFlagsForBarriers(Kind);
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
-                         getThreadID(CGF, Loc)};
+                         llvm::ConstantInt::getNullValue(CGF.Int32Ty)};
 
   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                           CGM.getModule(), OMPRTL___kmpc_barrier),
@@ -1798,7 +1801,6 @@ static void emitReductionListCopy(
 /// void inter_warp_copy_func(void* reduce_data, num_warps)
 ///   shared smem[warp_size];
 ///   For all data entries D in reduce_data:
-///     sync
 ///     If (I am the first lane in each warp)
 ///       Copy my local D to smem[warp_id]
 ///     sync
@@ -1844,21 +1846,27 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   // for reduced latency, as well as to have a distinct copy for concurrently
   // executing target regions.  The array is declared with common linkage so
   // as to be shared across compilation units.
-  StringRef TransferMediumName =
-      "__openmp_nvptx_data_transfer_temporary_storage";
-  llvm::GlobalVariable *TransferMedium =
-      M.getGlobalVariable(TransferMediumName);
-  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
-  if (!TransferMedium) {
-    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
-        llvm::UndefValue::get(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
-    CGM.addCompilerUsedGlobal(TransferMedium);
-  }
+  auto CreateSharedMem = [&](llvm::IntegerType *ElementTy, StringRef Suffix) {
+    std::string TransferMediumName =
+        "__openmp_nvptx_data_transfer_temporary_storage";
+    TransferMediumName += Suffix;
+    llvm::GlobalVariable *TransferMedium =
+        M.getGlobalVariable(TransferMediumName);
+    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
+    if (!TransferMedium) {
+      auto *Ty = llvm::ArrayType::get(ElementTy, WarpSize);
+      unsigned SharedAddressSpace =
+          C.getTargetAddressSpace(LangAS::cuda_shared);
+      TransferMedium = new llvm::GlobalVariable(
+          M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
+          llvm::UndefValue::get(Ty), TransferMediumName,
+          /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+          SharedAddressSpace);
+    }
+    return TransferMedium;
+  };
+  llvm::GlobalVariable *TransferMedium4 = CreateSharedMem(CGM.Int32Ty, "_4");
+  llvm::GlobalVariable *TransferMedium8 = CreateSharedMem(CGM.Int64Ty, "_8");
 
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
@@ -1888,7 +1896,11 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
         C.getTypeSizeInChars(Private->getType())
             .alignTo(C.getTypeAlignInChars(Private->getType()))
             .getQuantity();
-    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
+
+    llvm::GlobalVariable *TransferMedium =
+        RealTySize >= 8 ? TransferMedium8 : TransferMedium4;
+    for (unsigned TySize = RealTySize >= 8 ? 8 : 4;
+         TySize > 0 && RealTySize > 0; TySize /= 2) {
       unsigned NumIters = RealTySize / TySize;
       if (NumIters == 0)
         continue;
@@ -1916,10 +1928,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
         Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
         CGF.EmitBlock(BodyBB);
       }
-      // kmpc_barrier.
-      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
-                                             /*EmitChecks=*/false,
-                                             /*ForceSimpleCall=*/true);
+
       llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
       llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
       llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
@@ -1958,7 +1967,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
           ElemPtr, /*Volatile=*/false, CType, Loc,
           LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
       // Store the source element value to the dest element address.
-      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
+      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false, CType,
                             LValueBaseInfo(AlignmentSource::Type),
                             TBAAAccessInfo());
 
@@ -2014,7 +2023,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
 
       // *TargetElemPtr = SrcMediumVal;
       llvm::Value *SrcMediumValue =
-          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
+          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/false, CType, Loc);
       CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                             CType);
       Bld.CreateBr(W0MergeBB);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 141436f26230dde..09e20cf50a95e63 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -44,6 +44,9 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
     DS_Generic,
   };
 
+  /// Get barrier to synchronize all threads in a block.
+  void syncCTAThreads(CodeGenFunction &CGF);
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -56,9 +59,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
 
   DataSharingMode getDataSharingMode() const;
 
-  /// Get barrier to synchronize all threads in a block.
-  void syncCTAThreads(CodeGenFunction &CGF);
-
   /// Helper for target directive initialization.
   void emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF,
                       EntryFunctionState &EST, bool IsSPMD);
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 094c5ae3522f96d..968bd49647e0751 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -225,56 +225,40 @@ int bar(int n){
 // CHECK-64-NEXT:  entry:
 // CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
-// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
-// CHECK-64-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK-64-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4
-// CHECK-64-NEXT:    br label [[PRECOND:%.*]]
-// CHECK-64:       precond:
-// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
-// CHECK-64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
-// CHECK-64-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
-// CHECK-64:       body:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK-64:       then:
-// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
-// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
-// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
-// CHECK-64-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_8, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+// CHECK-64-NEXT:    store i64 [[TMP9]], ptr addrspace(3) [[TMP8]], align 8
 // CHECK-64-NEXT:    br label [[IFCONT:%.*]]
 // CHECK-64:       else:
 // CHECK-64-NEXT:    br label [[IFCONT]]
 // CHECK-64:       ifcont:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
+// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 0)
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP10]]
 // CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
 // CHECK-64:       then2:
-// CHECK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
-// CHECK-64-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK-64-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
-// CHECK-64-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_8, i64 0, i32 [[TMP2]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i64, ptr addrspace(3) [[TMP11]], align 8
+// CHECK-64-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 8
 // CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
 // CHECK-64:       else3:
 // CHECK-64-NEXT:    br label [[IFCONT4]]
 // CHECK-64:       ifcont4:
-// CHECK-64-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK-64-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
-// CHECK-64-NEXT:    br label [[PRECOND]]
-// CHECK-64:       exit:
 // CHECK-64-NEXT:    ret void
 //
 //
@@ -451,65 +435,62 @@ int bar(int n){
 // CHECK-64-NEXT:  entry:
 // CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
-// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
-// CHECK-64-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK-64:       then:
-// CHECK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
-// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK-64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
-// CHECK-64-NEXT:    store volatile i8 [[TMP10]], ptr addrspace(3) [[TMP9]], align 1
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+// CHECK-64-NEXT:    store i8 [[TMP9]], ptr addrspace(3) [[TMP8]], align 1
 // CHECK-64-NEXT:    br label [[IFCONT:%.*]]
 // CHECK-64:       else:
 // CHECK-64-NEXT:    br label [[IFCONT]]
 // CHECK-64:       ifcont:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
+// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 0)
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP10]]
 // CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
 // CHECK-64:       then2:
-// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8
-// CHECK-64-NEXT:    [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1
-// CHECK-64-NEXT:    store i8 [[TMP15]], ptr [[TMP14]], align 1
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[TMP2]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
+// CHECK-64-NEXT:    store i8 [[TMP14]], ptr [[TMP13]], align 1
 // CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
 // CHECK-64:       else3:
 // CHECK-64-NEXT:    br label [[IFCONT4]]
 // CHECK-64:       ifcont4:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
 // CHECK-64-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK-64-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
 // CHECK-64:       then6:
-// CHECK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
-// CHECK-64-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
-// CHECK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
-// CHECK-64-NEXT:    store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4
+// CHECK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8
+// CHECK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP18]], ptr addrspace(3) [[TMP17]], align 4
 // CHECK-64-NEXT:    br label [[IFCONT8:%.*]]
 // CHECK-64:       else7:
 // CHECK-64-NEXT:    br label [[IFCONT8]]
 // CHECK-64:       ifcont8:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
+// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 0)
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP2]], [[TMP19]]
 // CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
 // CHECK-64:       then10:
-// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
-// CHECK-64-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8
-// CHECK-64-NEXT:    [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4
-// CHECK-64-NEXT:    store i32 [[TMP24]], ptr [[TMP23]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[TMP2]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(3) [[TMP20]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], ptr [[TMP22]], align 4
 // CHECK-64-NEXT:    br label [[IFCONT12:%.*]]
 // CHECK-64:       else11:
 // CHECK-64-NEXT:    br label [[IFCONT12]]
@@ -707,65 +688,62 @@ int bar(int n){
 // CHECK-64-NEXT:  entry:
 // CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and...
[truncated]

@jdoerfert jdoerfert closed this Nov 2, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
clang:codegen clang:openmp OpenMP related changes to Clang clang Clang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

2 participants