diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index d483803005074f..01e0d3db461273 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -18,7 +18,7 @@
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4
+// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
 // OPT: ret void
@@ -30,7 +30,7 @@ __global__ void kernel1(int *x) {
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4
+// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
 // OPT: ret void
@@ -68,7 +68,8 @@ struct S {
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
-// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4
+// OPT: [[G2:%.*]] ={{.*}} addrspacecast i32* [[P0]] to i32 addrspace(4)*
+// OPT: [[V0:%.*]] = load i32, i32 addrspace(4)* [[G2]], align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
@@ -103,7 +104,8 @@ struct T {
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
-// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4
+// OPT: [[G2:%.*]] ={{.*}} addrspacecast float* [[P0]] to float addrspace(4)*
+// OPT: [[V0:%.*]] = load float, float addrspace(4)* [[G2]], align 4
 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
 // OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
@@ -130,7 +132,7 @@ struct SS {
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(float addrspace(1)*{{.*}} %a.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load float, float addrspace(1)* %a.coerce, align 4
+// OPT: [[VAL:%.*]] = load float, float addrspace(4)* %a.coerce.const, align 4
 // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
 // OPT: store float [[INC]], float addrspace(1)* %a.coerce, align 4
 // OPT: ret void
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
index b9b48290dd277e..65ad8b2aeacd3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -42,6 +42,8 @@ class AMDGPUPromoteKernelArguments : public FunctionPass {
 
   bool promotePointer(Value *Ptr);
 
+  bool promoteLoad(LoadInst *LI);
+
 public:
   static char ID;
 
@@ -73,16 +75,10 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
       break;
     case Instruction::Load: {
      LoadInst *LD = cast<LoadInst>(U);
-      PointerType *PT = dyn_cast<PointerType>(LD->getType());
-      if (!PT ||
-          (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
-           PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
-           PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
-          LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
-        break;
-      // TODO: This load poprobably can be promoted to constant address space.
-      if (!AMDGPU::isClobberedInFunction(LD, MSSA, AA))
+      if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr &&
+          !AMDGPU::isClobberedInFunction(LD, MSSA, AA))
         Ptrs.push_back(LD);
+
      break;
    }
    case Instruction::GetElementPtr:
@@ -96,15 +92,26 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
 }
 
 bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
-  enqueueUsers(Ptr);
+  bool Changed = false;
+
+  LoadInst *LI = dyn_cast<LoadInst>(Ptr);
+  if (LI)
+    Changed |= promoteLoad(LI);
+
+  PointerType *PT = dyn_cast<PointerType>(Ptr->getType());
+  if (!PT)
+    return Changed;
+
+  if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+      PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+      PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+    enqueueUsers(Ptr);
 
-  PointerType *PT = cast<PointerType>(Ptr->getType());
   if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return false;
+    return Changed;
 
-  bool IsArg = isa<Argument>(Ptr);
-  IRBuilder<> B(IsArg ? ArgCastInsertPt
-                      : &*std::next(cast<Instruction>(Ptr)->getIterator()));
+  IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
+                   : ArgCastInsertPt);
 
   // Cast pointer to global address space and back to flat and let
   // Infer Address Spaces pass to do all necessary rewriting.
@@ -120,6 +127,38 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
   return true;
 }
 
+bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
+  if (!LI->isSimple())
+    return false;
+
+  Value *Ptr = LI->getPointerOperand();
+
+  // Strip casts we have created earlier.
+  Value *OrigPtr = Ptr;
+  PointerType *PT;
+  for ( ; ; ) {
+    PT = cast<PointerType>(OrigPtr->getType());
+    if (PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+      return false;
+    auto *P = dyn_cast<AddrSpaceCastInst>(OrigPtr);
+    if (!P)
+      break;
+    auto *NewPtr = P->getPointerOperand();
+    if (!cast<PointerType>(NewPtr->getType())->hasSameElementTypeAs(PT))
+      break;
+    OrigPtr = NewPtr;
+  }
+
+  IRBuilder<> B(LI);
+
+  PointerType *NewPT =
+      PointerType::getWithSamePointeeType(PT, AMDGPUAS::CONSTANT_ADDRESS);
+  Value *Cast = B.CreateAddrSpaceCast(OrigPtr, NewPT,
+                                      Twine(OrigPtr->getName(), ".const"));
+  LI->replaceUsesOfWith(Ptr, Cast);
+  return true;
+}
+
 // skip allocas
 static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
   BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
index 82ca6a8b3f6448..5cc37b45e0cc4c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -11,11 +11,15 @@ define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
-; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8
-; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
-; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8
-; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT: [[P1_CONST:%.*]] = addrspacecast float** addrspace(1)* [[P1]] to float** addrspace(4)*
+; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(4)* [[P1_CONST]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float**
+; CHECK-NEXT: [[P2_FLAT:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float**
+; CHECK-NEXT: [[P2_CONST:%.*]] = addrspacecast float** [[TMP1]] to float* addrspace(4)*
+; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2_CONST]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -37,9 +41,11 @@ define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) {
 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]]
 ; CHECK-NEXT: [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)*
-; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8
-; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)*
-; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast i32* addrspace(1)* [[P1_CAST]] to i32**
+; CHECK-NEXT: [[P1_CAST_CONST:%.*]] = addrspacecast i32** [[TMP0]] to i32* addrspace(4)*
+; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(4)* [[P1_CAST_CONST]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)*
+; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[TMP1]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -60,10 +66,11 @@ define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture reado
 ; CHECK-LABEL: @ptr_in_struct(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0
-; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8
-; CHECK-NEXT: [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
+; CHECK-NEXT: [[P_CONST:%.*]] = addrspacecast float* addrspace(1)* [[P]] to float* addrspace(4)*
+; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(4)* [[P_CONST]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
 ; CHECK-NEXT: [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i32 [[ID]]
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -80,7 +87,14 @@ entry:
 
 ; GCN-LABEL: flat_ptr_arg:
 ; GCN-COUNT-2: global_load_dwordx2
-; GCN: global_load_dwordx4
+
+; FIXME: First load is in the constant address space and second is in global
+; because it is clobbered by store. GPU load store vectorizer cannot
+; combine them. Note, this does not happen with -O3 because loads are
+; vectorized in pairs earlier and stay in the global address space.
+
+; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}}
+; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4
 ; GCN: global_store_dword
 define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) {
 ; CHECK-LABEL: @flat_ptr_arg(
@@ -90,22 +104,26 @@ define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg,
 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
-; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX10]] to float**
+; CHECK-NEXT: [[ARRAYIDX10_CONST:%.*]] = addrspacecast float** [[TMP0]] to float* addrspace(4)*
+; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX10_CONST]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float addrspace(1)* [[TMP1]] to float*
+; CHECK-NEXT: [[I1_CONST:%.*]] = addrspacecast float* [[TMP2]] to float addrspace(4)*
+; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(4)* [[I1_CONST]], align 4
 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
 ; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
-; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP1]], i64 1
 ; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1
 ; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
 ; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
-; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP1]], i64 2
 ; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2
 ; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
 ; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
-; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP1]], i64 3
 ; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3
 ; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
@@ -114,10 +132,12 @@ define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg,
 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
 ; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8
-; CHECK-NEXT: [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)*
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX11]] to float**
+; CHECK-NEXT: [[ARRAYIDX11_CONST:%.*]] = addrspacecast float** [[TMP3]] to float* addrspace(4)*
+; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX11_CONST]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)*
 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP4]], i64 [[IDXPROM8]]
 ; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -157,7 +177,8 @@ entry:
 
 ; GCN-LABEL: global_ptr_arg:
 ; GCN: global_load_dwordx2
-; GCN: global_load_dwordx4
+; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}}
+; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4
 ; GCN: global_store_dword
 define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
 ; CHECK-LABEL: @global_ptr_arg(
@@ -165,22 +186,25 @@ define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonl
 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
-; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
+; CHECK-NEXT: [[ARRAYIDX10_CONST:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX10]] to float* addrspace(4)*
+; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX10_CONST]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float addrspace(1)* [[TMP0]] to float*
+; CHECK-NEXT: [[I1_CONST:%.*]] = addrspacecast float* [[TMP1]] to float addrspace(4)*
+; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(4)* [[I1_CONST]], align 4
 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
 ; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
-; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 1
 ; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1
 ; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
 ; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
-; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 2
 ; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2
 ; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
 ; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
-; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 3
 ; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3
 ; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
@@ -189,7 +213,7 @@ define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonl
 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
 ; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 [[IDXPROM8]]
 ; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -280,18 +304,19 @@ define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(
 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
+; CHECK-NEXT: [[ARRAYIDX10_CONST:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX10]] to float* addrspace(4)*
+; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX10_CONST]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
 ; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
+; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[TMP0]], align 4
 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
 ; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
 ; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 [[IDXPROM8]]
 ; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -323,11 +348,15 @@ define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture re
 ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8
-; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
-; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8
-; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT: [[P1_CONST:%.*]] = addrspacecast float** addrspace(1)* [[P1]] to float** addrspace(4)*
+; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(4)* [[P1_CONST]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float**
+; CHECK-NEXT: [[P2_FLAT:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float**
+; CHECK-NEXT: [[P2_CONST:%.*]] = addrspacecast float** [[TMP1]] to float* addrspace(4)*
+; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2_CONST]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -340,5 +369,150 @@ entry:
   ret void
 }
 
+; GCN-LABEL: flat_ptr_nest_2:
+; GCN: s_lshl_b64
+; GCN: s_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @flat_ptr_nest_2(float** nocapture readonly %Arg, i32 %i) {
+; CHECK-LABEL: @flat_ptr_nest_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float**
+; CHECK-NEXT: [[P1_CONST:%.*]] = addrspacecast float** [[TMP0]] to float* addrspace(4)*
+; CHECK-NEXT: [[P2:%.*]] = load float*, float* addrspace(4)* [[P1_CONST]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
+  %p2 = load float*, float** %p1, align 8
+  store float 0.000000e+00, float* %p2, align 4
+  ret void
+}
+
+; GCN-LABEL: const_ptr_nest_3:
+; GCN: s_lshl_b64
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
+; CHECK-LABEL: @const_ptr_nest_3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT: [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8
+; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
+  %p2 = load float* addrspace(4)*, float * addrspace(4)* addrspace(4)* %p1, align 8
+  %p3 = load float*, float* addrspace(4)* %p2, align 8
+  store float 0.000000e+00, float* %p3, align 4
+  ret void
+}
+
+; GCN-LABEL: cast_from_const_const_ptr_nest_3:
+; GCN: s_lshl_b64
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
+; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT: [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8
+; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8
+; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
+  %a1 = addrspacecast float* addrspace(4)* addrspace(4)* %p1 to float* addrspace(4)**
+  %p2 = load float* addrspace(4)*, float* addrspace(4)** %a1, align 8
+  %a2 = addrspacecast float* addrspace(4)* %p2 to float**
+  %p3 = load float*, float** %a2, align 8
+  store float 0.000000e+00, float* %p3, align 4
+  ret void
+}
+
+; GCN-LABEL: flat_ptr_volatile_load:
+; GCN: s_lshl_b64
+; GCN: flat_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @flat_ptr_volatile_load(float** nocapture readonly %Arg, i32 %i) {
+; CHECK-LABEL: @flat_ptr_volatile_load(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float**
+; CHECK-NEXT: [[P2:%.*]] = load volatile float*, float** [[TMP0]], align 8
+; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
+  %p2 = load volatile float*, float** %p1, align 8
+  store float 0.000000e+00, float* %p2, align 4
+  ret void
+}
+
+; GCN-LABEL: flat_ptr_atomic_load:
+; GCN: s_lshl_b64
+; GCN: global_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @flat_ptr_atomic_load(float** nocapture readonly %Arg, i32 %i) {
+; CHECK-LABEL: @flat_ptr_atomic_load(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
+; CHECK-NEXT: [[P2:%.*]] = load atomic float*, float* addrspace(1)* [[P1]] monotonic, align 8
+; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
+  %p2 = load atomic float*, float** %p1 monotonic, align 8
+  store float 0.000000e+00, float* %p2, align 4
+  ret void
+}
+
+; GCN-LABEL: cast_changing_pointee_type:
+; GCN: s_lshl_b64
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @cast_changing_pointee_type(float* addrspace(1)* addrspace(1)* nocapture readonly %Arg, i32 %i) {
+; CHECK-LABEL: @cast_changing_pointee_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* [[ARG:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT: [[A1:%.*]] = bitcast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)* addrspace(1)*
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)**
+; CHECK-NEXT: [[A1_CONST:%.*]] = addrspacecast i32* addrspace(1)** [[TMP0]] to i32* addrspace(1)* addrspace(4)*
+; CHECK-NEXT: [[P2:%.*]] = load i32* addrspace(1)*, i32* addrspace(1)* addrspace(4)* [[A1_CONST]], align 8
+; CHECK-NEXT: [[A2:%.*]] = bitcast i32* addrspace(1)* [[P2]] to float* addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast i32* addrspace(1)* [[P2]] to float**
+; CHECK-NEXT: [[A2_CONST:%.*]] = addrspacecast float** [[TMP1]] to float* addrspace(4)*
+; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[A2_CONST]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %p1 = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* %Arg, i32 %i
+  %a1 = addrspacecast float* addrspace(1)* addrspace(1)* %p1 to i32* addrspace(1)**
+  %p2 = load i32* addrspace(1)*, i32* addrspace(1)** %a1, align 8
+  %a2 = addrspacecast i32* addrspace(1)* %p2 to float**
+  %p3 = load float*, float** %a2, align 8
+  store float 0.000000e+00, float* %p3, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare void @llvm.amdgcn.s.barrier()