diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3ba77c9a8dc90b..b9b9df35cdb040 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -941,6 +941,11 @@ class TargetTransformInfo {
   /// applies when shouldMaximizeVectorBandwidth returns true.
   unsigned getMinimumVF(unsigned ElemWidth) const;
 
+  /// \return The maximum vectorization factor for types of a given element
+  /// bit width and opcode, or 0 if there is no maximum VF.
+  /// Currently only used by the SLP vectorizer.
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -1498,6 +1503,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
+  virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() const = 0;
@@ -1917,6 +1923,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMinimumVF(unsigned ElemWidth) const override {
     return Impl.getMinimumVF(ElemWidth);
   }
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
+    return Impl.getMaximumVF(ElemWidth, Opcode);
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index b4847844cd0ee8..2c206094ac4a53 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -356,6 +356,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
 
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; }
+
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f327d0cad426db..086a212ee65b70 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -635,6 +635,11 @@ unsigned TargetTransformInfo::getMinimumVF(unsigned ElemWidth) const {
   return TTIImpl->getMinimumVF(ElemWidth);
 }
 
+unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth,
+                                           unsigned Opcode) const {
+  return TTIImpl->getMaximumVF(ElemWidth, Opcode);
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
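The default implementation in TargetTransformInfoImplBase returns 0, which the SLP vectorizer interprets as "no maximum VF", so targets that do not override the hook keep their current behavior. As a rough illustration, an override for a hypothetical target could look like the sketch below (MyTTIImpl is an assumed name, not part of this patch):

```cpp
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Illustrative only: cap non-memory operations at 4 lanes regardless of
// element width, and leave loads/stores unlimited by returning 0, which
// the getMaximumVF() contract defines as "no maximum VF".
unsigned MyTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
    return 0;
  return 4;
}
```

Passing the opcode lets a target distinguish memory operations, which usually benefit from wide vectors, from ALU operations, where a wide VF mostly raises register pressure; the AMDGPU implementation below uses exactly that split.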
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ab83419183ee7f..2cfb0299eea9c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
   return 32;
 }
 
+unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
+    return 32 * 4 / ElemWidth;
+  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+}
+
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 05a054ac30ff2e..1785a2df2eba67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -170,6 +170,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   unsigned getNumberOfRegisters(unsigned RCID) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
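The GCN numbers encode two subtarget facts: vector memory operations can fill a single 128-bit (4 x 32-bit) access, so loads and stores may use as many lanes as fit in 128 bits, while ALU operations only pay off in the 2-wide packed 16-bit forms available when the subtarget has 16-bit instructions. A standalone sketch of the resulting limits (gcnMaximumVF and Has16BitInsts are illustrative stand-ins for the member function and ST->has16BitInsts() above):

```cpp
#include <cstdio>

// Mirrors the GCNTTIImpl::getMaximumVF() policy above, for illustration.
static unsigned gcnMaximumVF(unsigned ElemWidth, bool IsLoadOrStore,
                             bool Has16BitInsts) {
  if (IsLoadOrStore)
    return 32 * 4 / ElemWidth; // as many lanes as fill one 128-bit access
  return (ElemWidth == 16 && Has16BitInsts) ? 2 : 1; // packed 16-bit ALU only
}

int main() {
  printf("i32 load/store: VF %u\n", gcnMaximumVF(32, true, true));  // 4
  printf("i16 load/store: VF %u\n", gcnMaximumVF(16, true, true));  // 8
  printf("i8  load/store: VF %u\n", gcnMaximumVF(8, true, true));   // 16
  printf("i16 ALU op    : VF %u\n", gcnMaximumVF(16, false, true)); // 2
  printf("i32 ALU op    : VF %u\n", gcnMaximumVF(32, false, true)); // 1
  return 0;
}
```

The VF-1 answer for 32-bit ALU operations is what turns the uadd_sat_v2i32 and round_v2f32 tests below back into scalar intrinsic calls.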
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1c1c6edf08cd1..cda43521e9ba1e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -126,6 +126,10 @@ static cl::opt<unsigned>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
 
+static cl::opt<unsigned>
+MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+
 static cl::opt<int>
 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
     cl::desc("Maximum depth of the lookup for consecutive stores."));
@@ -741,6 +745,12 @@ class BoUpSLP {
     return MinVecRegSize;
   }
 
+  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
+      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+    return MaxVF ? MaxVF : UINT_MAX;
+  }
+
   /// Check if homogeneous aggregate is isomorphic to some VectorType.
   /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
   /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
@@ -6191,6 +6201,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   if (MaxVF < 2) {
     R.getORE()->emit([&]() {
       return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
@@ -7633,7 +7644,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Instruction *, 8> Incoming;
   SmallPtrSet<Value *, 16> VisitedInstrs;
-  unsigned MaxVecRegSize = R.getMaxVecRegSize();
 
   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
@@ -7660,27 +7670,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
       // Look for the next elements with the same type.
       SmallVector<Instruction *, 8>::iterator SameTypeIt = IncIt;
-      Type *EltTy = (*IncIt)->getType();
-
-      assert(EltTy->isSized() &&
-             "Instructions should all be sized at this point");
-      TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
-      if (EltTS.isScalable()) {
-        // For now, just ignore vectorizing scalable types.
-        ++IncIt;
-        continue;
-      }
-
-      unsigned EltSize = EltTS.getFixedSize();
-      unsigned MaxNumElts = MaxVecRegSize / EltSize;
-      if (MaxNumElts < 2) {
-        ++IncIt;
-        continue;
-      }
-
       while (SameTypeIt != E &&
-             (*SameTypeIt)->getType() == EltTy &&
-             static_cast<unsigned>(SameTypeIt - IncIt) < MaxNumElts) {
+             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
         VisitedInstrs.insert(*SameTypeIt);
         ++SameTypeIt;
       }
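In tryToVectorizeList() the new bound is applied on top of the existing ones, and an explicit -slp-max-vf=N on the command line takes precedence over the target hook, with 0 still meaning unlimited. A simplified model of the combined clamp (a sketch, not the pass itself; the real code also distinguishes an unset flag from an explicit -slp-max-vf=0):

```cpp
#include <algorithm>
#include <cassert>
#include <climits>

// Models the VF bounds in SLPVectorizerPass::tryToVectorizeList().
// NumScalars stands for PowerOf2Floor(VL.size()); CLMaxVF stands for an
// explicit -slp-max-vf=N, with 0 here meaning "flag not given".
static unsigned clampedMaxVF(unsigned NumScalars, unsigned MinVF,
                             unsigned TTIMaxVF, unsigned CLMaxVF) {
  unsigned Limit = CLMaxVF ? CLMaxVF : TTIMaxVF; // command line wins over TTI
  if (!Limit)
    Limit = UINT_MAX;                            // 0 == no maximum VF
  unsigned MaxVF = std::max(NumScalars, MinVF);
  return std::min(Limit, MaxVF); // below 2, SLP emits the "SmallVF"
                                 // remark and gives up on the bundle
}

int main() {
  // Two i32 saturating adds on GCN: the TTI limit is 1, so MaxVF < 2
  // and the bundle stays scalar.
  assert(clampedMaxVF(/*NumScalars=*/2, /*MinVF=*/2, /*TTIMaxVF=*/1,
                      /*CLMaxVF=*/0) == 1);
  // Four i16 saturating adds on GFX8: the TTI limit is 2, so the
  // four-wide bundle is vectorized in two-lane pieces.
  assert(clampedMaxVF(4, 2, 2, 0) == 2);
  return 0;
}
```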
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 514cf074908601..c7985b90f012fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -123,12 +123,18 @@ bb:
   ret <2 x i16> %ins.1
 }
 
-; FIXME: Should not vectorize
 define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @uadd_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -145,8 +151,15 @@
 define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @usub_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -163,8 +176,15 @@
 define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @sadd_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -181,8 +201,15 @@
 define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: @ssub_sat_v2i32(
 ; GCN-NEXT:  bb:
-; GCN-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[ARG0:%.*]], <2 x i32> [[ARG1:%.*]])
-; GCN-NEXT:    ret <2 x i32> [[TMP0]]
+; GCN-NEXT:    [[ARG0_0:%.*]] = extractelement <2 x i32> [[ARG0:%.*]], i64 0
+; GCN-NEXT:    [[ARG0_1:%.*]] = extractelement <2 x i32> [[ARG0]], i64 1
+; GCN-NEXT:    [[ARG1_0:%.*]] = extractelement <2 x i32> [[ARG1:%.*]], i64 0
+; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
+; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
+; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
+; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
 bb:
   %arg0.0 = extractelement <2 x i32> %arg0, i64 0
@@ -267,8 +294,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
-; GFX8-NEXT:    ret <4 x i16> [[TMP0]]
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0
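With the GFX8 ALU limit of two 16-bit lanes, the four-wide bundle in uadd_sat_v4i16 is no longer emitted as a single <4 x i16> intrinsic: it is vectorized as two independent <2 x i16> groups that the final shufflevector stitches back together. Schematically (an illustration of the resulting chunking, not of the SLP algorithm itself):

```cpp
#include <cstdio>

// Illustrative only: a four-element bundle processed at a clamped VF of 2
// yields the two <2 x i16> @llvm.uadd.sat.v2i16 calls checked above.
static void chunkBundle(unsigned NumScalars, unsigned MaxVF) {
  for (unsigned Begin = 0; Begin + MaxVF <= NumScalars; Begin += MaxVF)
    printf("lanes [%u,%u) -> one <%u x i16> call\n", Begin, Begin + MaxVF,
           MaxVF);
}

int main() {
  chunkBundle(/*NumScalars=*/4, /*MaxVF=*/2); // lanes [0,2) and [2,4)
  return 0;
}
```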
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
index 84d25fe79a27c6..7a8e08de4138b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
@@ -18,9 +18,9 @@ bb:
   ret <2 x half> %tmp5
 }
 
-; TODO: Should probably not really be vectorizing this
 ; GCN-LABEL: @round_v2f32(
-; GCN: call <2 x float> @llvm.round.v2f32
+; GCN: call float @llvm.round.f32(
+; GCN: call float @llvm.round.f32(
 define <2 x float> @round_v2f32(<2 x float> %arg) {
 bb:
   %tmp = extractelement <2 x float> %arg, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
index 466e83d0260afb..ce33efc951fcc4 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -1,7 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=32 < %s | FileCheck -check-prefix=MAX32 %s
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=256 < %s | FileCheck -check-prefix=MAX256 %s
-; RUN: opt -slp-vectorizer -S -slp-max-reg-size=1024 < %s | FileCheck -check-prefix=MAX1024 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=1 < %s | FileCheck -check-prefix=MAX32 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=8 < %s | FileCheck -check-prefix=MAX256 %s
+; RUN: opt -slp-vectorizer -S -slp-max-vf=32 < %s | FileCheck -check-prefix=MAX1024 %s
+; RUN: opt -slp-vectorizer -S < %s | FileCheck -check-prefix=MAX1024 %s
+
+; Make sure we do not vectorize to create a PHI wider than requested.
+; On the AMDGPU target, wider vectorization results in higher register
+; pressure, spilling, or even failure to allocate registers.
 
 define void @phi_float32(half %hval, float %fval) {
 ; MAX32-LABEL: @phi_float32(
@@ -120,6 +125,7 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX32-NEXT:    [[PHI30:%.*]] = phi float [ [[I63]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[FVAL]], [[BB1]] ]
 ; MAX32-NEXT:    [[PHI31:%.*]] = phi float [ [[I65]], [[BB3]] ], [ [[FVAL]], [[BB4]] ], [ [[I65]], [[BB5]] ], [ [[I65]], [[BB1]] ]
 ; MAX32-NEXT:    [[PHI32:%.*]] = phi float [ [[I67]], [[BB3]] ], [ [[I67]], [[BB4]] ], [ [[FVAL]], [[BB5]] ], [ [[I67]], [[BB1]] ]
+; MAX32-NEXT:    store float [[PHI31]], float* undef, align 4
 ; MAX32-NEXT:    ret void
 ;
 ; MAX256-LABEL: @phi_float32(
@@ -296,6 +302,8 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256-NEXT:    [[TMP154:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[TMP93]], [[BB4]] ], [ [[TMP132]], [[BB5]] ], [ [[TMP54]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP155:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[TMP103]], [[BB4]] ], [ [[TMP142]], [[BB5]] ], [ [[TMP64]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP156:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[TMP113]], [[BB4]] ], [ [[TMP152]], [[BB5]] ], [ [[TMP74]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP157:%.*]] = extractelement <8 x float> [[TMP156]], i32 6
+; MAX256-NEXT:    store float [[TMP157]], float* undef, align 4
 ; MAX256-NEXT:    ret void
 ;
 ; MAX1024-LABEL: @phi_float32(
@@ -481,6 +489,8 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb2:
 ; MAX1024-NEXT:    [[TMP165:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[TMP125]], [[BB4]] ], [ [[TMP164]], [[BB5]] ], [ [[TMP86]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP166:%.*]] = extractelement <32 x float> [[TMP165]], i32 30
+; MAX1024-NEXT:    store float [[TMP166]], float* undef, align 4
 ; MAX1024-NEXT:    ret void
 ;
 bb:
@@ -603,5 +613,6 @@ bb2:
   %phi30 = phi float [ %i63, %bb3 ], [ %fval, %bb4 ], [ %fval, %bb5 ], [ %fval, %bb1 ]
   %phi31 = phi float [ %i65, %bb3 ], [ %fval, %bb4 ], [ %i65, %bb5 ], [ %i65, %bb1 ]
   %phi32 = phi float [ %i67, %bb3 ], [ %i67, %bb4 ], [ %fval, %bb5 ], [ %i67, %bb1 ]
+  store float %phi31, float* undef
   ret void
 }