diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..f37f8d991160d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getRegUsageForType(Type *Ty) {
+  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+    if (auto *PT = dyn_cast<PointerType>(VT->getElementType())) {
+      switch (PT->getAddressSpace()) {
+      // Assume that the resource parts of the vector being asked about are the
+      // same.
+      case AMDGPUAS::BUFFER_FAT_POINTER:
+        return 4 + VT->getNumElements();
+      case AMDGPUAS::BUFFER_STRIDED_POINTER:
+        return 4 + 2 * VT->getNumElements();
+      default:
+        break;
+      }
+    }
+  }
+  return BaseT::getRegUsageForType(Ty);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index a0d62008d9ddc..a8fce0a78e565 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -113,6 +113,14 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              TTI::PeelingPreferences &PP);
 
+  // Vectorization will query for the number of registers needed for
+  // <N x ptr addrspace(7)> and the default implementation will cause crashes,
+  // so override it here. This also lets us account for the fact that, in the
+  // context of loop vectorization (which is what uses this API), the number of
+  // registers needed for fat pointers is lower because they'll share a resource
+  // part.
+  unsigned getRegUsageForType(Type *Ty);
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
     assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
     return TTI::PSK_FastHardware;
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
new file mode 100644
index 0000000000000..3abbe13483e03
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=loop-vectorize -S < %s | FileCheck %s
+
+; Reduced from a crash, variables added to make things more realistic.
+; This is a roundabout test for TargetLowering::getValueType() returning
+; a reasonable value for <N x ptr addrspace(7)> instead of asserting.
+define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %v) {
; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(
; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[_LR_PH5:.*:]]
; CHECK-NEXT: [[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i32 -2147483648, i32 159744)
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(8) [[DOTRSRC]] to ptr addrspace(7)
; CHECK-NEXT: br label %[[BB2:.*]]
; CHECK: [[BB2]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[DOTLR_PH5:%.*]] ], [ [[TMP5:%.*]], %[[BB2]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP3]], [[TMP0]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], [[DOT_CRIT_EDGE_LOOPEXIT:label %.*]], label %[[BB2]]
; CHECK: [[__CRIT_EDGE_LOOPEXIT:.*:]]
; CHECK-NEXT: ret void
;
entry:
  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
  br label %loop

loop: ; preds = %loop, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %ptr = getelementptr i32, ptr addrspace(7) %fat, i32 0
  %iv.next = add i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv, %v
  br i1 %exitcond.not, label %exit, label %loop

exit: ; preds = %loop
  ret void
}

declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32)
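
For reviewers, a minimal standalone sketch of the register accounting the new override encodes. This is illustration only, not part of the patch: the functions regsForFatPointerVector and regsForStridedPointerVector are hypothetical names, and the sketch assumes, as the patch comment does, that all lanes of the vector share the same 128-bit resource descriptor.

// Illustration only -- not part of the patch. Mirrors the counting that
// GCNTTIImpl::getRegUsageForType uses for vectors of AMDGPU buffer pointers,
// assuming every lane shares one 128-bit (4 x 32-bit) resource descriptor.
#include <cstdio>

// <N x ptr addrspace(7)> (buffer fat pointer): 4 registers for the shared
// descriptor plus one 32-bit offset register per lane.
static unsigned regsForFatPointerVector(unsigned NumElements) {
  return 4 + NumElements;
}

// <N x ptr addrspace(9)> (buffer strided pointer): 4 registers for the shared
// descriptor plus a 32-bit offset and a 32-bit index per lane.
static unsigned regsForStridedPointerVector(unsigned NumElements) {
  return 4 + 2 * NumElements;
}

int main() {
  // For a 4-wide fat-pointer vector: 4 + 4 = 8 registers, rather than the
  // 4 lanes * 5 registers = 20 a naive per-element count of the 160-bit
  // fat pointer type would give.
  std::printf("<4 x p7>: %u registers\n", regsForFatPointerVector(4));
  std::printf("<4 x p9>: %u registers\n", regsForStridedPointerVector(4));
  return 0;
}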