diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85bcb3c7d0982..4246ec671a0f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
                               ArrayTy->getNumElements());
 }
 
-static Value *stripBitcasts(Value *V) {
-  while (Instruction *I = dyn_cast<Instruction>(V)) {
-    if (I->getOpcode() != Instruction::BitCast)
-      break;
-    V = I->getOperand(0);
-  }
-  return V;
-}
-
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
   if (!GEP)
-    return nullptr;
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
 
   auto I = GEPIdx.find(GEP);
-  return I == GEPIdx.end() ? nullptr : I->second;
+  assert(I != GEPIdx.end() && "Must have entry for GEP!");
+  return I->second;
 }
 
-static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
-  // FIXME we only support simple cases
-  if (GEP->getNumOperands() != 3)
+static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+                               Type *VecElemTy, const DataLayout &DL) {
+  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
+  // helper.
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  MapVector<Value *, APInt> VarOffsets;
+  APInt ConstOffset(BW, 0);
+  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
     return nullptr;
 
-  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
-  if (!I0 || !I0->isZero())
+  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
+  if (VarOffsets.size() > 1)
    return nullptr;
 
-  return GEP->getOperand(2);
-}
-
-// Not an instruction handled below to turn into a vector.
-//
-// TODO: Check isTriviallyVectorizable for calls and handle other
-// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User,
-                             const DataLayout &DL) {
-  switch (Inst->getOpcode()) {
-  case Instruction::Load: {
-    // Currently only handle the case where the Pointer Operand is a GEP.
-    // Also we could not vectorize volatile or atomic loads.
-    LoadInst *LI = cast<LoadInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        LI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(LI->getType()))
-      return true;
-
-    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
-    if (!PtrInst)
-      return false;
-
-    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
-            PtrInst->getOpcode() == Instruction::BitCast) &&
-           LI->isSimple();
+  if (VarOffsets.size() == 1) {
+    // Only handle cases where we don't need to insert extra arithmetic
+    // instructions.
+    const auto &VarOffset = VarOffsets.front();
+    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
+      return nullptr;
+    return VarOffset.first;
   }
-  case Instruction::BitCast:
-    return true;
-  case Instruction::Store: {
-    // Must be the stored pointer operand, not a stored value, plus
-    // since it should be canonical form, the User should be a GEP.
-    // Also we could not vectorize volatile or atomic stores.
-    StoreInst *SI = cast<StoreInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        SI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(SI->getValueOperand()->getType()))
-      return true;
-
-    Instruction *UserInst = dyn_cast<Instruction>(User);
-    if (!UserInst)
-      return false;
-    return (SI->getPointerOperand() == User) &&
-           (UserInst->getOpcode() == Instruction::GetElementPtr ||
-            UserInst->getOpcode() == Instruction::BitCast) &&
-           SI->isSimple();
-  }
-  default:
-    return false;
-  }
+  APInt Quot;
+  uint64_t Rem;
+  APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
+  if (Rem != 0)
+    return nullptr;
+
+  return ConstantInt::get(GEP->getContext(), Quot);
 }
 
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
@@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
   }
 
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
-  std::vector<Value *> WorkList;
-  SmallVector<User *, 8> Users(Alloca->users());
-  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+  SmallVector<Instruction *> WorkList;
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : Alloca->uses())
+    Uses.push_back(&U);
+
   Type *VecEltTy = VectorTy->getElementType();
-  while (!Users.empty()) {
-    User *AllocaUser = Users.pop_back_val();
-    User *UseUser = UseUsers.pop_back_val();
-    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
-
-    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
-    if (!GEP) {
-      if (!canVectorizeInst(Inst, UseUser, DL))
+  while (!Uses.empty()) {
+    Use *U = Uses.pop_back_val();
+    Instruction *Inst = dyn_cast<Instruction>(U->getUser());
+
+    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
+      // This is a store of the pointer, not to the pointer.
+      if (isa<StoreInst>(Inst) &&
+          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return false;
 
-      if (Inst->getOpcode() == Instruction::BitCast) {
-        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
-        Type *ToTy = Inst->getType()->getPointerElementType();
-        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
-            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
-          continue;
-
-        for (User *CastUser : Inst->users()) {
-          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
-            continue;
-          Users.push_back(CastUser);
-          UseUsers.push_back(Inst);
-        }
+      Type *AccessTy = getLoadStoreType(Inst);
+      Ptr = Ptr->stripPointerCasts();
+      // Alloca already accessed as vector, leave alone.
+      if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
+                               DL.getTypeStoreSize(AccessTy))
        continue;
-      }
 
-      WorkList.push_back(AllocaUser);
+      // Check that this is a simple access of a vector element.
+      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
+                                          : cast<StoreInst>(Inst)->isSimple();
+      if (!IsSimple ||
+          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
+        return false;
+
+      WorkList.push_back(Inst);
      continue;
    }
 
-    Value *Index = GEPToVectorIndex(GEP);
+    if (isa<BitCastInst>(Inst)) {
+      // Look through bitcasts.
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
+    }
 
-    // If we can't compute a vector index from this GEP, then we can't
-    // promote this alloca to vector.
-    if (!Index) {
-      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
-                        << '\n');
-      return false;
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      // If we can't compute a vector index from this GEP, then we can't
+      // promote this alloca to vector.
+      Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
+      if (!Index) {
+        LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
+                          << '\n');
+        return false;
+      }
+
+      GEPVectorIdx[GEP] = Index;
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
    }
 
-    GEPVectorIdx[GEP] = Index;
-    Users.append(GEP->user_begin(), GEP->user_end());
-    UseUsers.append(GEP->getNumUses(), GEP);
+    // Ignore assume-like intrinsics and comparisons used in assumes.
+    if (isAssumeLikeIntrinsic(Inst))
+      continue;
+
+    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
+          return isAssumeLikeIntrinsic(cast<Instruction>(U));
+        }))
+      continue;
+
+    // Unknown user.
+    return false;
  }
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
-  for (Value *V : WorkList) {
-    Instruction *Inst = cast<Instruction>(V);
+  for (Instruction *Inst : WorkList) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
-        break;
-
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
@@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy ||
-          SI->getValueOperand()->getType()->isVectorTy())
-        break;
-
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *Elt = SI->getValueOperand();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index d9db49e0767d4..3e7b4c99f74b6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -72,9 +72,15 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
 ; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 0
-; CHECK-NEXT: store float [[FOO3]], float* [[FOO4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP1]], align 8
 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 1
-; CHECK-NEXT: store float 2.000000e+00, float* [[FOO5]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1
+; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP4]], align 8
 ; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float]* [[F1]], align 4
 ; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1
 ; CHECK-NEXT: store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4
@@ -116,13 +122,15 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float]* [[F1]], align 4
 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT: [[FOO6:%.*]] = load float, float* [[FOO5]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]]
 ; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float>* [[FOO7]], align 16
-; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
-; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
-; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
+; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0
+; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
 ; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
 ; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
 ; CHECK-NEXT: ret void
@@ -163,17 +171,28 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
 ; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double]* [[S]], align 8
 ; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[FOO7:%.*]] = load double, double* [[FOO6]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
 ; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[FOO9:%.*]] = load double, double* [[FOO8]], align 8
-; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO7]], [[FOO9]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]]
 ; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0
-; CHECK-NEXT: store double [[FOO10]], double* [[FOO11]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0
+; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP7]], align 16
 ; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0
-; CHECK-NEXT: [[FOO13:%.*]] = load double, double* [[FOO12]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[TMP10]], align 16
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
 ; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[FOO15:%.*]] = load double, double* [[FOO14]], align 8
-; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO13]], [[FOO15]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1
+; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]]
 ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 393e80f6be132..fe4355f691967 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -281,14 +281,8 @@ bb13:                                             ; preds = %.preheader
 
 ; TODO: llvm.assume can be ingored
 ; OPT-LABEL: @vector_read_alloca_bitcast_assume(
-; OPT: %tmp = alloca <4 x i32>, align 16, addrspace(5)
-; OPT-NEXT: %x = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %tmp, i64 0, i64 0
-; OPT-NEXT: store i32 0, i32 addrspace(5)* %x, align 16
-; OPT-NEXT: %0 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp, align 16
-; OPT-NEXT: %1 = shufflevector <4 x i32> %0, <4 x i32> , <4 x i32>
-; OPT-NEXT: store <4 x i32> %1, <4 x i32> addrspace(5)* %tmp, align 16
-; OPT-NEXT: %2 = extractelement <4 x i32> %1, i32 %index
-; OPT-NEXT: store i32 %2, i32 addrspace(1)* %out, align 4
+; OPT: %0 = extractelement <4 x i32> , i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
 ; GCN-COUNT-4: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
index ad893f12f226f..ddcc53ce85569 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -113,16 +113,9 @@ entry:
   ret void
 }
 
-; FIXME: Should be able to promote this. Instcombine should fold the
-; cast in the hasOneUse case so it might not matter in practice
-
 ; OPT-LABEL: @vector_read_bitcast_alloca(
-; OPT: alloca [4 x float]
-; OPT: store float
-; OPT: store float
-; OPT: store float
-; OPT: store float
-; OPT: load float
+; OPT: %0 = extractelement <4 x float> , i32 %index
+; OPT: store float %0, float addrspace(1)* %out, align 4
 define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32], addrspace(5)